Skip to content

Commit 94d529e

Browse files
authored
[Nested] Optimize List Type in list_value (duckdb#17063)
In duckdb#12468 `list_value` was optimized for primitive types. This PR aims to build on that and optimize list types. For example:

##### Large Tables

```sql
CREATE TABLE large_list_table AS SELECT [i, i, i] AS a, [i + 1, i + 1] AS b, [i + 2] AS c FROM range(100000000) tbl(i);
SELECT LIST_VALUE(a, b, c) FROM large_list_table;
```

| 1.2.2 | New |
|---|---|
| 28.55s | 8.68s |

##### Large Lists

```sql
CREATE TABLE large_list AS SELECT list(i) AS a FROM range(1000000) t(i);
SELECT list_value(a, a, a, a, a) FROM large_list;
```

| 1.2.2 | New |
|---|---|
| 0.487s | 0.0234s |

##### Nested Lists

```sql
CREATE TABLE nested_lists AS SELECT [[i], [i + 1]] AS a, [[i, i], [i + 1, i + 1]] as b FROM range(10000) t(i);
SELECT list_value(a, b, a, b, a, b, a, b, a, b, a, b, a, b) FROM nested_lists;
```

| 1.2.2 | New |
|---|---|
| 0.128s | 0.0075s |

While these results show improvements, the timings are still slower than desired. Profiling suggests that most of the time is spent in `VectorOperations::Copy`. Any feedback or suggestions on how to further improve performance would be greatly appreciated!

Some additional tests and benchmarks have also been included.
2 parents 09b7c4f + e4da3f4 commit 94d529e

5 files changed

Lines changed: 275 additions & 0 deletions

File tree

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# name: benchmark/micro/list/list_value_large_list.benchmark
2+
# description: LIST_VALUE performance with very large lists
3+
# group: [list]
4+
5+
name List Value
6+
group micro
7+
subgroup list
8+
9+
load
10+
CREATE TABLE large_list AS SELECT list(i) AS a FROM range(1000000) t(i);
11+
12+
run
13+
SELECT list_value(a, a, a, a, a) FROM large_list;
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# name: benchmark/micro/list/list_value_nested_list.benchmark
2+
# description: LIST_VALUE performance with nested lists
3+
# group: [list]
4+
5+
name List Value
6+
group micro
7+
subgroup list
8+
9+
load
10+
CREATE TABLE nested_lists AS SELECT [[i], [i + 1]] AS a, [[i, i], [i + 1, i + 1]] as b FROM range(10000) t(i);
11+
12+
run
13+
SELECT list_value(a, b, a, b, a, b, a, b, a, b, a, b, a, b) FROM nested_lists;

extension/core_functions/scalar/list/list_value.cpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,59 @@ static void TemplatedListValueFunction(DataChunk &args, Vector &result) {
5252
ListVector::SetListSize(result, args.size() * list_size);
5353
}
5454

55+
//! LIST_VALUE specialization for arguments whose physical type is LIST.
//!
//! For every input row the result holds one outer list with `args.ColumnCount()` entries; entry `c` of that
//! outer list describes the list found in column `c` of the same row (or is marked invalid when that input
//! is NULL).
//!
//! Instead of copying element-by-element per row, the complete child vector of every argument column is
//! copied once into the result's innermost child vector, one column after the other. The per-row
//! list entries are then rewritten by shifting their original offset by the start position of the
//! column's data inside the merged child vector.
//!
//! \param args   Input chunk; every column is read as a LIST vector.
//! \param result Output vector of type LIST(LIST(...)), written through the flat-vector accessors.
static void ListValueListFunction(DataChunk &args, Vector &result) {
	const idx_t list_size = args.ColumnCount();
	const idx_t row_count = args.size();
	ListVector::Reserve(result, row_count * list_size);

	// col_offsets[c] is the running sum of the child sizes of all columns before c, i.e. the
	// position inside the merged child vector at which column c's child data begins.
	vector<idx_t> col_offsets;
	col_offsets.reserve(list_size);
	idx_t offset_sum = 0;
	for (idx_t c = 0; c < list_size; c++) {
		col_offsets.push_back(offset_sum);
		offset_sum += ListVector::GetListSize(args.data[c]);
	}

	auto &result_list = ListVector::GetEntry(result);
	ListVector::Reserve(result_list, offset_sum);

	// Bulk-copy each column's child data behind the data of the preceding columns.
	auto &result_child_vector = ListVector::GetEntry(result_list);
	for (idx_t c = 0; c < list_size; c++) {
		// Bind by reference (as in the offset loop above) rather than constructing a new Vector per column.
		auto &list = args.data[c];
		const auto length = ListVector::GetListSize(list);
		if (length == 0) {
			continue;
		}
		auto &child_vector = ListVector::GetEntry(list);
		VectorOperations::Copy(child_vector, result_child_vector, length, 0, col_offsets[c]);
	}

	const auto result_data = FlatVector::GetData<list_entry_t>(result);
	const auto result_list_data = FlatVector::GetData<list_entry_t>(result_list);
	auto &result_list_validity = FlatVector::Validity(result_list);

	const auto args_unified_format = args.ToUnifiedFormat();
	for (idx_t r = 0; r < row_count; r++) {
		for (idx_t c = 0; c < list_size; c++) {
			const auto &format = args_unified_format[c];
			const auto input_idx = format.sel->get_index(r);
			// Inner lists are laid out row-major: row r occupies [r * list_size, (r + 1) * list_size).
			const auto result_idx = r * list_size + c;
			if (format.validity.RowIsValid(input_idx)) {
				const auto input_data = UnifiedVectorFormat::GetData<list_entry_t>(format);
				const auto &entry = input_data[input_idx];
				// Same length as the input list; the offset is shifted into the merged child vector.
				result_list_data[result_idx] = list_entry_t(col_offsets[c] + entry.offset, entry.length);
			} else {
				// NULL input list -> NULL inner list; its list_entry_t is left unwritten.
				result_list_validity.SetInvalid(result_idx);
			}
		}
		result_data[r].offset = r * list_size;
		result_data[r].length = list_size;
	}

	ListVector::SetListSize(result, row_count * list_size);
	ListVector::SetListSize(result_list, offset_sum);
}
107+
55108
static void TemplatedListValueFunctionFallback(DataChunk &args, Vector &result) {
56109
auto &child_type = ListType::GetChildType(result.GetType());
57110
auto result_data = FlatVector::GetData<list_entry_t>(result);
@@ -125,6 +178,9 @@ static void ListValueFunction(DataChunk &args, ExpressionState &state, Vector &r
125178
case PhysicalType::VARCHAR:
126179
TemplatedListValueFunction<string_t, ListValueStringAssign>(args, result);
127180
break;
181+
case PhysicalType::LIST:
182+
ListValueListFunction(args, result);
183+
break;
128184
default: {
129185
TemplatedListValueFunctionFallback(args, result);
130186
break;
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
# name: test/sql/function/list/list_value.test
2+
# description: Test the list_value function
3+
# group: [list]
4+
5+
statement ok
6+
pragma enable_verification
7+
8+
# nested lists
9+
query I
10+
SELECT LIST_VALUE([1, 7], [2], [3]);
11+
----
12+
[[1, 7], [2], [3]]
13+
14+
query I
15+
SELECT LIST_VALUE([1, 7], [2], [3], NULL);
16+
----
17+
[[1, 7], [2], [3], NULL]
18+
19+
query I
20+
SELECT LIST_VALUE([1, 7], [2], NULL, [3]);
21+
----
22+
[[1, 7], [2], NULL, [3]]
23+
24+
query I
25+
SELECT LIST_VALUE([1, 7], [NULL], [2], [3]);
26+
----
27+
[[1, 7], [NULL], [2], [3]]
28+
29+
# nested lists in table
30+
statement ok
31+
CREATE TABLE test_table (c1 INTEGER[], c2 INTEGER[], c3 INTEGER[]);
32+
33+
statement ok
34+
INSERT INTO test_table VALUES ([1, 1], [2, 2], [3]);
35+
36+
statement ok
37+
INSERT INTO test_table VALUES ([4], [5, 5, 5], [6, 6]);
38+
39+
statement ok
40+
INSERT INTO test_table VALUES ([7, 7, 7, 7], [8], [9, 9, 9]);
41+
42+
statement ok
43+
INSERT INTO test_table VALUES ([], [], []);
44+
45+
statement ok
46+
INSERT INTO test_table VALUES ([-1, -1, NULL], NULL, [-2, -2]);
47+
48+
query I
49+
SELECT LIST_VALUE(c1, c2, c3) FROM test_table;
50+
----
51+
[[1, 1], [2, 2], [3]]
52+
[[4], [5, 5, 5], [6, 6]]
53+
[[7, 7, 7, 7], [8], [9, 9, 9]]
54+
[[], [], []]
55+
[[-1, -1, NULL], NULL, [-2, -2]]
56+
57+
# structs in nested lists
58+
query I
59+
SELECT LIST_VALUE(ROW(1, 1), ROW(2, 2), ROW(3, 3));
60+
----
61+
[(1, 1), (2, 2), (3, 3)]
62+
63+
query I
64+
SELECT LIST_VALUE(ROW(1, 1), ROW(2, 2), ROW(3, 3), NULL);
65+
----
66+
[(1, 1), (2, 2), (3, 3), NULL]
67+
68+
query I
69+
SELECT LIST_VALUE(ROW(1, 1), ROW(NULL, 2), NULL, ROW(3, 3));
70+
----
71+
[(1, 1), (NULL, 2), NULL, (3, 3)]
72+
73+
statement ok
74+
CREATE TABLE struct_table(a ROW(a INTEGER, b INTEGER)[], b ROW(a INTEGER, b INTEGER)[], c ROW(a INTEGER, b INTEGER)[]);
75+
76+
statement ok
77+
INSERT INTO STRUCT_TABLE VALUES ([ROW(1, 1), ROW(2, 2)], [ROW(3, 3), ROW(4, 4)], [ROW(5, 5), ROW(6, 6)]);
78+
79+
statement ok
80+
INSERT INTO STRUCT_TABLE VALUES ([ROW(7, 7), ROW(8, 8)], [ROW(9, 9)], [ROW(10, 10), ROW(11, 11), ROW(12, 12)]);
81+
82+
statement ok
83+
INSERT INTO STRUCT_TABLE VALUES ([ROW(13, 13)], [ROW(14, 14), ROW(15, 15), ROW(16, 16)], [ROW(17, 17), ROW(18, 18)]);
84+
85+
statement ok
86+
INSERT INTO STRUCT_TABLE VALUES ([NULL, ROW(20, 20)], NULL, [ROW(23, 23), ROW(24, 24), ROW(25, 25), ROW(26, 26)]);
87+
88+
query I
89+
SELECT LIST_VALUE(a, b, c) FROM struct_table;
90+
----
91+
[[{'a': 1, 'b': 1}, {'a': 2, 'b': 2}], [{'a': 3, 'b': 3}, {'a': 4, 'b': 4}], [{'a': 5, 'b': 5}, {'a': 6, 'b': 6}]]
92+
[[{'a': 7, 'b': 7}, {'a': 8, 'b': 8}], [{'a': 9, 'b': 9}], [{'a': 10, 'b': 10}, {'a': 11, 'b': 11}, {'a': 12, 'b': 12}]]
93+
[[{'a': 13, 'b': 13}], [{'a': 14, 'b': 14}, {'a': 15, 'b': 15}, {'a': 16, 'b': 16}], [{'a': 17, 'b': 17}, {'a': 18, 'b': 18}]]
94+
[[NULL, {'a': 20, 'b': 20}], NULL, [{'a': 23, 'b': 23}, {'a': 24, 'b': 24}, {'a': 25, 'b': 25}, {'a': 26, 'b': 26}]]
95+
96+
# strings in nested lists
97+
query I
98+
SELECT LIST_VALUE(['a', 'a'], ['b', 'b', 'b'], ['c']);
99+
----
100+
[[a, a], [b, b, b], [c]]
101+
102+
query I
103+
SELECT LIST_VALUE(['a', 'a'], ['b', 'b', 'b'], ['c'], NULL);
104+
----
105+
[[a, a], [b, b, b], [c], NULL]
106+
107+
query I
108+
SELECT LIST_VALUE(['a', 'a'], ['b', 'b', NULL], NULL, ['c']);
109+
----
110+
[[a, a], [b, b, NULL], NULL, [c]]
111+
112+
statement ok
113+
CREATE TABLE string_table(a VARCHAR[], b VARCHAR[], c VARCHAR[]);
114+
115+
statement ok
116+
INSERT INTO string_table VALUES (['a', 'a'], ['b', 'b', 'b'], ['c']);
117+
118+
statement ok
119+
INSERT INTO string_table VALUES (['d'], ['e', 'e', 'e', 'e'], ['f', 'f']);
120+
121+
statement ok
122+
INSERT INTO string_table VALUES (['g', 'g', 'g', 'g'], ['h'], ['i', 'i', 'i']);
123+
124+
statement ok
125+
INSERT INTO string_table VALUES (['j', 'j'], NULL, ['k', 'k', 'k']);
126+
127+
statement ok
128+
INSERT INTO string_table VALUES (['l', 'l', 'l'], ['m', 'm'], [NULL, 'n', 'n']);
129+
130+
query I
131+
SELECT LIST_VALUE(a, b, c) FROM string_table;
132+
----
133+
[[a, a], [b, b, b], [c]]
134+
[[d], [e, e, e, e], [f, f]]
135+
[[g, g, g, g], [h], [i, i, i]]
136+
[[j, j], NULL, [k, k, k]]
137+
[[l, l, l], [m, m], [NULL, n, n]]
138+
139+
# nested lists in lists
140+
query I
141+
SELECT LIST_VALUE([[1, 1], [2]], [[3, 3], [4, 4, 4, 4], [5, 5]], [[6, 6]]);
142+
----
143+
[[[1, 1], [2]], [[3, 3], [4, 4, 4, 4], [5, 5]], [[6, 6]]]
144+
145+
query I
146+
SELECT LIST_VALUE([[1, 1], [2]], [[3, 3], [4, 4, 4, 4], [5, 5]], [[6, 6]], NULL);
147+
----
148+
[[[1, 1], [2]], [[3, 3], [4, 4, 4, 4], [5, 5]], [[6, 6]], NULL]
149+
150+
query I
151+
SELECT LIST_VALUE([[1, 1], [2]], [[3, 3], [4, 4, NULL, 4], [5, 5]], NULL, [[6, 6]]);
152+
----
153+
[[[1, 1], [2]], [[3, 3], [4, 4, NULL, 4], [5, 5]], NULL, [[6, 6]]]
154+
155+
statement ok
156+
CREATE TABLE nested_list_table(a INTEGER[][], b INTEGER[][], c INTEGER[][]);
157+
158+
statement ok
159+
INSERT INTO nested_list_table VALUES ([[1, 1, 1], [2]], [[3, 3], [4, 4, 4, 4], [5, 5]], [[6, 6]]);
160+
161+
statement ok
162+
INSERT INTO nested_list_table VALUES ([[7, 7], [8, 8, 8]], [[9]], [[10, 10], [11, 11, 11]]);
163+
164+
statement ok
165+
INSERT INTO nested_list_table VALUES ([[12, NULL]], [NULL, [13, 13, 13]], NULL);
166+
167+
query I
168+
SELECT LIST_VALUE(a, b, c) FROM nested_list_table;
169+
----
170+
[[[1, 1, 1], [2]], [[3, 3], [4, 4, 4, 4], [5, 5]], [[6, 6]]]
171+
[[[7, 7], [8, 8, 8]], [[9]], [[10, 10], [11, 11, 11]]]
172+
[[[12, NULL]], [NULL, [13, 13, 13]], NULL]
173+
174+
# errors
175+
statement error
176+
SELECT LIST_VALUE([1, 1], ['a', 'a'], [ROW(2, 2), ROW(3, 3)]);
177+
----
178+
Binder Error: Cannot create a list of types INTEGER[] and VARCHAR[] - an explicit cast is required
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# name: test/sql/function/list/list_value.test_slow
2+
# description: Test the list_value function
3+
# group: [list]
4+
5+
# test very large table
6+
statement ok
7+
CREATE TABLE large_list_table(a INTEGER[], b INTEGER[], c INTEGER[]);
8+
9+
statement ok
10+
INSERT INTO large_list_table SELECT [i, i, i], [i + 1, i + 1], [i + 2] FROM range(10000) tbl(i);
11+
12+
query I
13+
SELECT LIST_VALUE(a, b, c) FROM large_list_table WHERE a[1] = 1;
14+
----
15+
[[1, 1, 1], [2, 2], [3]]

0 commit comments

Comments
 (0)