Skip to content

Commit a10a0eb

Browse files
skontoyiheng
authored and committed
[SPARK-28279][SQL][PYTHON][TESTS] Convert and port 'group-analytics.sql' into UDF test base
## What changes were proposed in this pull request? This PR adds some tests converted from group-analytics.sql to test UDFs. Please see contribution guide of this umbrella ticket - SPARK-27921. <details><summary>Diff comparing to 'group-analytics.sql'</summary> <p> ```diff diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-analytics.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-analytics.sql.out index 31e9e08..3439a05 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-analytics.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-analytics.sql.out -13,9 +13,9 struct<> -- !query 1 -SELECT a + b, b, udf(SUM(a - b)) FROM testData GROUP BY a + b, b WITH CUBE +SELECT a + b, b, SUM(a - b) FROM testData GROUP BY a + b, b WITH CUBE -- !query 1 schema -struct<(a + b):int,b:int,CAST(udf(cast(sum(cast((a - b) as bigint)) as string)) AS BIGINT):bigint> +struct<(a + b):int,b:int,sum((a - b)):bigint> -- !query 1 output 2 1 0 2 NULL 0 -33,9 +33,9 NULL NULL 3 -- !query 2 -SELECT a, udf(b), SUM(b) FROM testData GROUP BY a, b WITH CUBE +SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH CUBE -- !query 2 schema -struct<a:int,CAST(udf(cast(b as string)) AS INT):int,sum(b):bigint> +struct<a:int,b:int,sum(b):bigint> -- !query 2 output 1 1 1 1 2 2 -52,9 +52,9 NULL NULL 9 -- !query 3 -SELECT udf(a + b), b, SUM(a - b) FROM testData GROUP BY a + b, b WITH ROLLUP +SELECT a + b, b, SUM(a - b) FROM testData GROUP BY a + b, b WITH ROLLUP -- !query 3 schema -struct<CAST(udf(cast((a + b) as string)) AS INT):int,b:int,sum((a - b)):bigint> +struct<(a + b):int,b:int,sum((a - b)):bigint> -- !query 3 output 2 1 0 2 NULL 0 -70,9 +70,9 NULL NULL 3 -- !query 4 -SELECT a, b, udf(SUM(b)) FROM testData GROUP BY a, b WITH ROLLUP +SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH ROLLUP -- !query 4 schema -struct<a:int,b:int,CAST(udf(cast(sum(cast(b as bigint)) as string)) AS BIGINT):bigint> 
+struct<a:int,b:int,sum(b):bigint> -- !query 4 output 1 1 1 1 2 2 -97,7 +97,7 struct<> -- !query 6 -SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year) ORDER BY udf(course), year +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year) ORDER BY course, year -- !query 6 schema struct<course:string,year:int,sum(earnings):bigint> -- !query 6 output -111,7 +111,7 dotNET 2013 48000 -- !query 7 -SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course, year) ORDER BY course, udf(year) +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course, year) ORDER BY course, year -- !query 7 schema struct<course:string,year:int,sum(earnings):bigint> -- !query 7 output -127,9 +127,9 dotNET 2013 48000 -- !query 8 -SELECT course, udf(year), SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(course, year) +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(course, year) -- !query 8 schema -struct<course:string,CAST(udf(cast(year as string)) AS INT):int,sum(earnings):bigint> +struct<course:string,year:int,sum(earnings):bigint> -- !query 8 output Java NULL 50000 NULL 2012 35000 -138,26 +138,26 dotNET NULL 63000 -- !query 9 -SELECT course, year, udf(SUM(earnings)) FROM courseSales GROUP BY course, year GROUPING SETS(course) +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(course) -- !query 9 schema -struct<course:string,year:int,CAST(udf(cast(sum(cast(earnings as bigint)) as string)) AS BIGINT):bigint> +struct<course:string,year:int,sum(earnings):bigint> -- !query 9 output Java NULL 50000 dotNET NULL 63000 -- !query 10 -SELECT udf(course), year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(year) +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(year) -- !query 10 schema -struct<CAST(udf(cast(course as string)) AS 
STRING):string,year:int,sum(earnings):bigint> +struct<course:string,year:int,sum(earnings):bigint> -- !query 10 output NULL 2012 35000 NULL 2013 78000 -- !query 11 -SELECT course, udf(SUM(earnings)) AS sum FROM courseSales -GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY course, udf(sum) +SELECT course, SUM(earnings) AS sum FROM courseSales +GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY course, sum -- !query 11 schema struct<course:string,sum:bigint> -- !query 11 output -173,7 +173,7 dotNET 63000 -- !query 12 SELECT course, SUM(earnings) AS sum, GROUPING_ID(course, earnings) FROM courseSales -GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY udf(course), sum +GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY course, sum -- !query 12 schema struct<course:string,sum:bigint,grouping_id(course, earnings):int> -- !query 12 output -188,10 +188,10 dotNET 63000 1 -- !query 13 -SELECT udf(course), udf(year), GROUPING(course), GROUPING(year), GROUPING_ID(course, year) FROM courseSales +SELECT course, year, GROUPING(course), GROUPING(year), GROUPING_ID(course, year) FROM courseSales GROUP BY CUBE(course, year) -- !query 13 schema -struct<CAST(udf(cast(course as string)) AS STRING):string,CAST(udf(cast(year as string)) AS INT):int,grouping(course):tinyint,grouping(year):tinyint,grouping_id(course, year):int> +struct<course:string,year:int,grouping(course):tinyint,grouping(year):tinyint,grouping_id(course, year):int> -- !query 13 output Java 2012 0 0 0 Java 2013 0 0 0 -205,7 +205,7 dotNET NULL 0 1 1 -- !query 14 -SELECT course, udf(year), GROUPING(course) FROM courseSales GROUP BY course, year +SELECT course, year, GROUPING(course) FROM courseSales GROUP BY course, year -- !query 14 schema struct<> -- !query 14 output -214,7 +214,7 grouping() can only be used with GroupingSets/Cube/Rollup; -- !query 15 -SELECT course, udf(year), 
GROUPING_ID(course, year) FROM courseSales GROUP BY course, year +SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY course, year -- !query 15 schema struct<> -- !query 15 output -223,7 +223,7 grouping_id() can only be used with GroupingSets/Cube/Rollup; -- !query 16 -SELECT course, year, grouping__id FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, course, udf(year) +SELECT course, year, grouping__id FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, course, year -- !query 16 schema struct<course:string,year:int,grouping__id:int> -- !query 16 output -240,7 +240,7 NULL NULL 3 -- !query 17 SELECT course, year FROM courseSales GROUP BY CUBE(course, year) -HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0 ORDER BY course, udf(year) +HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0 ORDER BY course, year -- !query 17 schema struct<course:string,year:int> -- !query 17 output -250,7 +250,7 dotNET NULL -- !query 18 -SELECT course, udf(year) FROM courseSales GROUP BY course, year HAVING GROUPING(course) > 0 +SELECT course, year FROM courseSales GROUP BY course, year HAVING GROUPING(course) > 0 -- !query 18 schema struct<> -- !query 18 output -259,7 +259,7 grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup; -- !query 19 -SELECT course, udf(udf(year)) FROM courseSales GROUP BY course, year HAVING GROUPING_ID(course) > 0 +SELECT course, year FROM courseSales GROUP BY course, year HAVING GROUPING_ID(course) > 0 -- !query 19 schema struct<> -- !query 19 output -268,9 +268,9 grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup; -- !query 20 -SELECT udf(course), year FROM courseSales GROUP BY CUBE(course, year) HAVING grouping__id > 0 +SELECT course, year FROM courseSales GROUP BY CUBE(course, year) HAVING grouping__id > 0 -- !query 20 schema -struct<CAST(udf(cast(course as string)) AS STRING):string,year:int> +struct<course:string,year:int> -- !query 20 output 
Java NULL NULL 2012 -281,7 +281,7 dotNET NULL -- !query 21 SELECT course, year, GROUPING(course), GROUPING(year) FROM courseSales GROUP BY CUBE(course, year) -ORDER BY GROUPING(course), GROUPING(year), course, udf(year) +ORDER BY GROUPING(course), GROUPING(year), course, year -- !query 21 schema struct<course:string,year:int,grouping(course):tinyint,grouping(year):tinyint> -- !query 21 output -298,7 +298,7 NULL NULL 1 1 -- !query 22 SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY CUBE(course, year) -ORDER BY GROUPING(course), GROUPING(year), course, udf(year) +ORDER BY GROUPING(course), GROUPING(year), course, year -- !query 22 schema struct<course:string,year:int,grouping_id(course, year):int> -- !query 22 output -314,7 +314,7 NULL NULL 3 -- !query 23 -SELECT course, udf(year) FROM courseSales GROUP BY course, udf(year) ORDER BY GROUPING(course) +SELECT course, year FROM courseSales GROUP BY course, year ORDER BY GROUPING(course) -- !query 23 schema struct<> -- !query 23 output -323,7 +323,7 grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup; -- !query 24 -SELECT course, udf(year) FROM courseSales GROUP BY course, udf(year) ORDER BY GROUPING_ID(course) +SELECT course, year FROM courseSales GROUP BY course, year ORDER BY GROUPING_ID(course) -- !query 24 schema struct<> -- !query 24 output -332,7 +332,7 grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup; -- !query 25 -SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, udf(course), year +SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, course, year -- !query 25 schema struct<course:string,year:int> -- !query 25 output -348,7 +348,7 NULL NULL -- !query 26 -SELECT udf(a + b) AS k1, udf(b) AS k2, SUM(a - b) FROM testData GROUP BY CUBE(k1, k2) +SELECT a + b AS k1, b AS k2, SUM(a - b) FROM testData GROUP BY CUBE(k1, k2) -- !query 26 schema struct<k1:int,k2:int,sum((a - b)):bigint> 
-- !query 26 output -368,7 +368,7 NULL NULL 3 -- !query 27 -SELECT udf(udf(a + b)) AS k, b, SUM(a - b) FROM testData GROUP BY ROLLUP(k, b) +SELECT a + b AS k, b, SUM(a - b) FROM testData GROUP BY ROLLUP(k, b) -- !query 27 schema struct<k:int,b:int,sum((a - b)):bigint> -- !query 27 output -386,9 +386,9 NULL NULL 3 -- !query 28 -SELECT udf(a + b), udf(udf(b)) AS k, SUM(a - b) FROM testData GROUP BY a + b, k GROUPING SETS(k) +SELECT a + b, b AS k, SUM(a - b) FROM testData GROUP BY a + b, k GROUPING SETS(k) -- !query 28 schema -struct<CAST(udf(cast((a + b) as string)) AS INT):int,k:int,sum((a - b)):bigint> +struct<(a + b):int,k:int,sum((a - b)):bigint> -- !query 28 output NULL 1 3 NULL 2 0 ``` </p> </details> ## How was this patch tested? Tested as guided in SPARK-27921. Verified pandas & pyarrow versions: ```$python3 Python 3.6.8 (default, Jan 14 2019, 11:02:34) [GCC 8.0.1 20180414 (experimental) [trunk revision 259383]] on linux Type "help", "copyright", "credits" or "license" for more information. >>> import pandas >>> import pyarrow >>> pyarrow.__version__ '0.14.0' >>> pandas.__version__ '0.24.2' ``` From the SQL output, the statements appear to be evaluated correctly, bearing in mind that the UDF returns a string and may change results: NULL is returned as None and is then counted among the returned values. Closes apache#25196 from skonto/group-analytics.sql. Authored-by: Stavros Kontopoulos <[email protected]> Signed-off-by: HyukjinKwon <[email protected]>
1 parent d20ed24 commit a10a0eb

2 files changed

Lines changed: 458 additions & 0 deletions

File tree

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
-- This test file was converted from group-analytics.sql.
2+
-- TODO: Insert and test UDFs in the GROUP BY clause once SPARK-28445 is resolved.
3+
-- Fixture: temporary view testData(a, b) built from integer literals; it holds
-- the six (a, b) pairs with a in {1, 2, 3} and b in {1, 2}.
CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
4+
(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2)
5+
AS testData(a, b);
6+
7+
-- CUBE on overlapping columns
8+
-- udf() wraps only the SUM aggregate; both grouping expressions are selected as-is.
SELECT a + b, b, udf(SUM(a - b)) FROM testData GROUP BY a + b, b WITH CUBE;
9+
10+
-- udf() wraps the selected grouping column b; the aggregate and GROUP BY stay bare.
SELECT a, udf(b), SUM(b) FROM testData GROUP BY a, b WITH CUBE;
11+
12+
-- ROLLUP on overlapping columns
13+
-- udf() wraps the selected expression a + b, while GROUP BY uses the bare a + b.
SELECT udf(a + b), b, SUM(a - b) FROM testData GROUP BY a + b, b WITH ROLLUP;
14+
15+
-- udf() wraps only the SUM aggregate; grouping columns a and b are selected as-is.
SELECT a, b, udf(SUM(b)) FROM testData GROUP BY a, b WITH ROLLUP;
16+
17+
-- Fixture: temporary view courseSales(course, year, earnings) with five rows.
-- Note that ("dotNET", 2012) appears twice with different earnings, so grouping
-- by (course, year) has to combine those two rows.
CREATE OR REPLACE TEMPORARY VIEW courseSales AS SELECT * FROM VALUES
18+
("dotNET", 2012, 10000), ("Java", 2012, 20000), ("dotNET", 2012, 5000), ("dotNET", 2013, 48000), ("Java", 2013, 30000)
19+
AS courseSales(course, year, earnings);
20+
21+
-- ROLLUP
22+
-- udf() is applied only to the first sort key (course); SELECT and ROLLUP are UDF-free.
SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year) ORDER BY udf(course), year;
23+
24+
-- CUBE
25+
-- udf() is applied only to the second sort key (year); SELECT and CUBE are UDF-free.
SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course, year) ORDER BY course, udf(year);
26+
27+
-- GROUPING SETS
28+
-- Two single-column grouping sets (course) and (year); udf() wraps the selected year.
SELECT course, udf(year), SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(course, year);
29+
-- Only the (course) grouping set is requested; udf() wraps the SUM aggregate.
SELECT course, year, udf(SUM(earnings)) FROM courseSales GROUP BY course, year GROUPING SETS(course);
30+
-- Only the (year) grouping set is requested; udf() wraps the selected course column.
SELECT udf(course), year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(year);
31+
32+
-- GROUPING SETS with aggregate functions containing groupBy columns
33+
-- earnings is both aggregated and a grouping column. udf() wraps the aggregate
-- (aliased as sum) and is applied again to that alias in ORDER BY.
SELECT course, udf(SUM(earnings)) AS sum FROM courseSales
34+
GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY course, udf(sum);
35+
-- Same grouping sets as the previous statement, plus GROUPING_ID in the select
-- list; udf() is applied only to the first sort key (course).
SELECT course, SUM(earnings) AS sum, GROUPING_ID(course, earnings) FROM courseSales
36+
GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY udf(course), sum;
37+
38+
-- GROUPING/GROUPING_ID
39+
-- udf() wraps both selected grouping columns; the arguments of GROUPING and
-- GROUPING_ID are the bare columns.
SELECT udf(course), udf(year), GROUPING(course), GROUPING(year), GROUPING_ID(course, year) FROM courseSales
40+
GROUP BY CUBE(course, year);
41+
-- Negative case: GROUPING() with a plain GROUP BY. The golden result file records
-- the error "grouping() can only be used with GroupingSets/Cube/Rollup".
SELECT course, udf(year), GROUPING(course) FROM courseSales GROUP BY course, year;
42+
-- Negative case: GROUPING_ID() with a plain GROUP BY. The golden result file records
-- the error "grouping_id() can only be used with GroupingSets/Cube/Rollup".
SELECT course, udf(year), GROUPING_ID(course, year) FROM courseSales GROUP BY course, year;
43+
-- Uses the grouping__id column both in SELECT and as the leading sort key;
-- udf() is applied only to the last sort key (year).
SELECT course, year, grouping__id FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, course, udf(year);
44+
45+
-- GROUPING/GROUPING_ID in having clause
46+
-- HAVING filters on GROUPING(year) and GROUPING_ID(course, year) over a CUBE;
-- udf() is applied only to the last sort key (year).
SELECT course, year FROM courseSales GROUP BY CUBE(course, year)
47+
HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0 ORDER BY course, udf(year);
48+
-- Negative case: GROUPING() in HAVING with a plain GROUP BY. The golden result file
-- records "grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup".
SELECT course, udf(year) FROM courseSales GROUP BY course, year HAVING GROUPING(course) > 0;
49+
-- Negative case: GROUPING_ID() in HAVING with a plain GROUP BY; year is wrapped in a
-- nested udf(udf(...)). The golden result file records
-- "grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup".
SELECT course, udf(udf(year)) FROM courseSales GROUP BY course, year HAVING GROUPING_ID(course) > 0;
50+
-- HAVING filters on the grouping__id column over a CUBE; udf() wraps the selected course.
SELECT udf(course), year FROM courseSales GROUP BY CUBE(course, year) HAVING grouping__id > 0;
51+
52+
-- GROUPING/GROUPING_ID in orderBy clause
53+
-- GROUPING() is used both in SELECT and as sort keys; udf() is applied only to
-- the last sort key (year).
SELECT course, year, GROUPING(course), GROUPING(year) FROM courseSales GROUP BY CUBE(course, year)
54+
ORDER BY GROUPING(course), GROUPING(year), course, udf(year);
55+
-- GROUPING_ID() is selected while the sort keys use GROUPING(); udf() is applied
-- only to the last sort key (year).
SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY CUBE(course, year)
56+
ORDER BY GROUPING(course), GROUPING(year), course, udf(year);
57+
-- Negative case: GROUPING() in ORDER BY with a plain GROUP BY. udf(year) appears in
-- both SELECT and GROUP BY here. The golden result file records
-- "grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup".
SELECT course, udf(year) FROM courseSales GROUP BY course, udf(year) ORDER BY GROUPING(course);
58+
-- Negative case: GROUPING_ID() in ORDER BY with a plain GROUP BY. udf(year) appears
-- in both SELECT and GROUP BY here. The golden result file records
-- "grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup".
SELECT course, udf(year) FROM courseSales GROUP BY course, udf(year) ORDER BY GROUPING_ID(course);
59+
-- Sorts by grouping__id without selecting it; udf() is applied only to the
-- middle sort key (course).
SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, udf(course), year;
60+
61+
-- Aliases in SELECT could be used in ROLLUP/CUBE/GROUPING SETS
62+
-- Both CUBE keys are select-list aliases (k1, k2) of udf()-wrapped expressions.
SELECT udf(a + b) AS k1, udf(b) AS k2, SUM(a - b) FROM testData GROUP BY CUBE(k1, k2);
63+
-- ROLLUP over alias k (a nested udf(udf(a + b))) and the bare column b.
SELECT udf(udf(a + b)) AS k, b, SUM(a - b) FROM testData GROUP BY ROLLUP(k, b);
64+
-- udf() wraps both selected expressions (nested for alias k). GROUP BY mixes the
-- bare expression a + b with alias k, and GROUPING SETS keeps only k.
SELECT udf(a + b), udf(udf(b)) AS k, SUM(a - b) FROM testData GROUP BY a + b, k GROUPING SETS(k);

0 commit comments

Comments
 (0)