Skip to content

Commit 81218cb

Browse files
authored
Merge branch 'main' into main
2 parents 2049537 + c1a4957 commit 81218cb

File tree

5 files changed

+88
-10
lines changed

5 files changed

+88
-10
lines changed

benchmarks/queries/clickbench/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ Results look like
112112
Note this query is somewhat synthetic as "WatchID" is almost unique (there are a few duplicates)
113113

114114
```sql
115-
SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT("ResponseStartTiming", 0.95) tp95, MAX("ResponseStartTiming") tmax
115+
SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY "ResponseStartTiming") tp95, MAX("ResponseStartTiming") tmax
116116
FROM 'hits.parquet'
117117
WHERE "JavaEnable" = 0 -- filters to 32M of 100M rows
118118
GROUP BY "ClientIP", "WatchID"

benchmarks/queries/clickbench/extended.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@ SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserCountry"), COUNT(DISTI
33
SELECT "BrowserCountry", COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction") FROM hits GROUP BY 1 ORDER BY 2 DESC LIMIT 10;
44
SELECT "SocialSourceNetworkID", "RegionID", COUNT(*), AVG("Age"), AVG("ParamPrice"), STDDEV("ParamPrice") as s, VAR("ParamPrice") FROM hits GROUP BY "SocialSourceNetworkID", "RegionID" HAVING s IS NOT NULL ORDER BY s DESC LIMIT 10;
55
SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, MEDIAN("ResponseStartTiming") tmed, MAX("ResponseStartTiming") tmax FROM hits WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tmed DESC LIMIT 10;
6-
SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT("ResponseStartTiming", 0.95) tp95, MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10;
6+
SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY "ResponseStartTiming") tp95, MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10;
77
SELECT COUNT(*) AS ShareCount FROM hits WHERE "IsMobile" = 1 AND "MobilePhoneModel" LIKE 'iPhone%' AND "SocialAction" = 'share' AND "SocialSourceNetworkID" IN (5, 12) AND "ClientTimeZone" BETWEEN -5 AND 5 AND regexp_match("Referer", '\/campaign\/(spring|summer)_promo') IS NOT NULL AND CASE WHEN split_part(split_part("URL", 'resolution=', 2), '&', 1) ~ '^\d+$' THEN split_part(split_part("URL", 'resolution=', 2), '&', 1)::INT ELSE 0 END > 1920 AND levenshtein(CAST("UTMSource" AS STRING), CAST("UTMCampaign" AS STRING)) < 3;

datafusion/spark/src/function/utils.rs

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ pub mod test {
2828
let expected: datafusion_common::Result<Option<$EXPECTED_TYPE>> = $EXPECTED;
2929
let func = $FUNC;
3030

31-
let arg_fields_owned = $ARGS
31+
let arg_fields_owned: Vec<arrow::datatypes::Field> = $ARGS
3232
.iter()
3333
.enumerate()
3434
.map(|(idx, arg)| {
@@ -42,6 +42,8 @@ pub mod test {
4242
})
4343
.collect::<Vec<_>>();
4444

45+
let arg_fields: Vec<&arrow::datatypes::Field> = arg_fields_owned.iter().collect();
46+
4547
let cardinality = $ARGS
4648
.iter()
4749
.fold(Option::<usize>::None, |acc, arg| match arg {
@@ -67,7 +69,12 @@ pub mod test {
6769
let return_field = return_field.unwrap();
6870
assert_eq!(return_field.data_type(), &$EXPECTED_DATA_TYPE);
6971

70-
let result = func.invoke_with_args(datafusion_expr::ScalarFunctionArgs{args: $ARGS, number_rows: cardinality, return_field: &return_field, arg_fields: arg_fields_owned.iter().collect()});
72+
let result = func.invoke_with_args(datafusion_expr::ScalarFunctionArgs{
73+
args: $ARGS,
74+
number_rows: cardinality,
75+
return_field: &return_field,
76+
arg_fields: arg_fields.clone(),
77+
});
7178
assert_eq!(result.is_ok(), true, "function returned an error: {}", result.unwrap_err());
7279

7380
let result = result.unwrap().to_array(cardinality).expect("Failed to convert to array");
@@ -91,7 +98,12 @@ pub mod test {
9198
let return_field = return_field.unwrap();
9299

93100
// invocation is expected to return an error - cannot use .expect_err() because Debug is not implemented
94-
match func.invoke_with_args(datafusion_expr::ScalarFunctionArgs{args: $ARGS, number_rows: cardinality, return_field: &return_field, arg_fields: arg_fields_owned.iter().collect()}) {
101+
match func.invoke_with_args(datafusion_expr::ScalarFunctionArgs{
102+
args: $ARGS,
103+
number_rows: cardinality,
104+
return_field: &return_field,
105+
arg_fields,
106+
}) {
95107
Ok(_) => assert!(false, "expected error"),
96108
Err(error) => {
97109
assert!(expected_error.strip_backtrace().starts_with(&error.strip_backtrace()));
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
19+
# DataFusion specific ClickBench "Extended" Queries
20+
# See data provenance notes in clickbench.slt
21+
22+
statement ok
23+
CREATE EXTERNAL TABLE hits
24+
STORED AS PARQUET
25+
LOCATION '../core/tests/data/clickbench_hits_10.parquet';
26+
27+
# If you change any of these queries, please change the corresponding query in
28+
# benchmarks/queries/clickbench/extended.sql and update the README.
29+
30+
query III
31+
SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits;
32+
----
33+
1 1 1
34+
35+
query III
36+
SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserCountry"), COUNT(DISTINCT "BrowserLanguage") FROM hits;
37+
----
38+
1 1 1
39+
40+
query TIIII
41+
SELECT "BrowserCountry", COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction") FROM hits GROUP BY 1 ORDER BY 2 DESC LIMIT 10;
42+
----
43+
� 1 1 1 1
44+
45+
query IIIRRRR
46+
SELECT "SocialSourceNetworkID", "RegionID", COUNT(*), AVG("Age"), AVG("ParamPrice"), STDDEV("ParamPrice") as s, VAR("ParamPrice") FROM hits GROUP BY "SocialSourceNetworkID", "RegionID" HAVING s IS NOT NULL ORDER BY s DESC LIMIT 10;
47+
----
48+
0 839 6 0 0 0 0
49+
0 197 2 0 0 0 0
50+
51+
query IIIIII
52+
SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, MEDIAN("ResponseStartTiming") tmed, MAX("ResponseStartTiming") tmax FROM hits WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tmed DESC LIMIT 10;
53+
----
54+
55+
query IIIIII
56+
SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY "ResponseStartTiming") tp95, MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10;
57+
----
58+
59+
query I
60+
SELECT COUNT(*) AS ShareCount FROM hits WHERE "IsMobile" = 1 AND "MobilePhoneModel" LIKE 'iPhone%' AND "SocialAction" = 'share' AND "SocialSourceNetworkID" IN (5, 12) AND "ClientTimeZone" BETWEEN -5 AND 5 AND regexp_match("Referer", '\/campaign\/(spring|summer)_promo') IS NOT NULL AND CASE WHEN split_part(split_part("URL", 'resolution=', 2), '&', 1) ~ '^\d+$' THEN split_part(split_part("URL", 'resolution=', 2), '&', 1)::INT ELSE 0 END > 1920 AND levenshtein(CAST("UTMSource" AS STRING), CAST("UTMCampaign" AS STRING)) < 3;
61+
----
62+
0
63+
64+
65+
statement ok
66+
drop table hits;

datafusion/sqllogictest/test_files/listing_table_statistics.slt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,12 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
# Test file with different schema order but genenrating correct statistics for table
18+
# Test file with different schema order but generating correct statistics for table
1919
statement ok
20-
COPY (SELECT * FROM values (1, 'a'), (2, 'b') t(int_col, str_col)) to '/tmp/table/1.parquet';
20+
COPY (SELECT * FROM values (1, 'a'), (2, 'b') t(int_col, str_col)) to 'test_files/scratch/table/1.parquet';
2121

2222
statement ok
23-
COPY (SELECT * FROM values ('c', 3), ('d', -1) t(str_col, int_col)) to '/tmp/table/2.parquet';
23+
COPY (SELECT * FROM values ('c', 3), ('d', -1) t(str_col, int_col)) to 'test_files/scratch/table/2.parquet';
2424

2525
statement ok
2626
set datafusion.execution.collect_statistics = true;
@@ -29,13 +29,13 @@ statement ok
2929
set datafusion.explain.show_statistics = true;
3030

3131
statement ok
32-
create external table t stored as parquet location '/tmp/table';
32+
create external table t stored as parquet location 'test_files/scratch/table';
3333

3434
query TT
3535
explain format indent select * from t;
3636
----
3737
logical_plan TableScan: t projection=[int_col, str_col]
38-
physical_plan DataSourceExec: file_groups={2 groups: [[tmp/table/1.parquet], [tmp/table/2.parquet]]}, projection=[int_col, str_col], file_type=parquet, statistics=[Rows=Exact(4), Bytes=Exact(288), [(Col[0]: Min=Exact(Int64(-1)) Max=Exact(Int64(3)) Null=Exact(0)),(Col[1]: Min=Exact(Utf8View("a")) Max=Exact(Utf8View("d")) Null=Exact(0))]]
38+
physical_plan DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/table/2.parquet]]}, projection=[int_col, str_col], file_type=parquet, statistics=[Rows=Exact(4), Bytes=Exact(288), [(Col[0]: Min=Exact(Int64(-1)) Max=Exact(Int64(3)) Null=Exact(0)),(Col[1]: Min=Exact(Utf8View("a")) Max=Exact(Utf8View("d")) Null=Exact(0))]]
3939

4040
statement ok
4141
drop table t;

0 commit comments

Comments
 (0)