@@ -1136,7 +1136,7 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase {
11361136 val df = createJoinTestDF(Seq (" arrive_time" -> " time" ))
11371137 val shuffles = collectShuffles(df.queryExecution.executedPlan)
11381138 if (shuffle) {
1139- assert(shuffles.size == 2 , " partitioning with transform not work now " )
1139+ assert(shuffles.size == 1 , " partitioning with transform should trigger SPJ " )
11401140 } else {
11411141 assert(shuffles.size == 2 , " should add two side shuffle when bucketing shuffle one side" +
11421142 " is not enabled" )
@@ -1991,22 +1991,19 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase {
19911991 " (6, 50.0, cast('2023-02-01' as timestamp))" )
19921992
19931993 Seq (true , false ).foreach { pushdownValues =>
1994- Seq (true , false ).foreach { partiallyClustered =>
1995- withSQLConf(
1996- SQLConf .V2_BUCKETING_SHUFFLE_ENABLED .key -> " true" ,
1997- SQLConf .V2_BUCKETING_PUSH_PART_VALUES_ENABLED .key -> pushdownValues.toString,
1998- SQLConf .V2_BUCKETING_PARTIALLY_CLUSTERED_DISTRIBUTION_ENABLED .key
1999- -> partiallyClustered.toString,
2000- SQLConf .V2_BUCKETING_ALLOW_JOIN_KEYS_SUBSET_OF_PARTITION_KEYS .key -> " true" ) {
2001- val df = createJoinTestDF(Seq (" id" -> " item_id" ))
2002- val shuffles = collectShuffles(df.queryExecution.executedPlan)
2003- assert(shuffles.size == 1 , " SPJ should be triggered" )
2004- checkAnswer(df, Seq (Row (1 , " aa" , 30.0 , 42.0 ),
2005- Row (1 , " aa" , 30.0 , 89.0 ),
2006- Row (1 , " aa" , 40.0 , 42.0 ),
2007- Row (1 , " aa" , 40.0 , 89.0 ),
2008- Row (3 , " bb" , 10.0 , 19.5 )))
2009- }
1994+ withSQLConf(
1995+ SQLConf .V2_BUCKETING_SHUFFLE_ENABLED .key -> " true" ,
1996+ SQLConf .V2_BUCKETING_PUSH_PART_VALUES_ENABLED .key -> pushdownValues.toString,
1997+ SQLConf .V2_BUCKETING_PARTIALLY_CLUSTERED_DISTRIBUTION_ENABLED .key -> " false" ,
1998+ SQLConf .V2_BUCKETING_ALLOW_JOIN_KEYS_SUBSET_OF_PARTITION_KEYS .key -> " true" ) {
1999+ val df = createJoinTestDF(Seq (" id" -> " item_id" ))
2000+ val shuffles = collectShuffles(df.queryExecution.executedPlan)
2001+ assert(shuffles.size == 1 , " SPJ should be triggered" )
2002+ checkAnswer(df, Seq (Row (1 , " aa" , 30.0 , 42.0 ),
2003+ Row (1 , " aa" , 30.0 , 89.0 ),
2004+ Row (1 , " aa" , 40.0 , 42.0 ),
2005+ Row (1 , " aa" , 40.0 , 89.0 ),
2006+ Row (3 , " bb" , 10.0 , 19.5 )))
20102007 }
20112008 }
20122009 }
@@ -2052,4 +2049,109 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase {
20522049 }
20532050 }
20542051 }
2052+
// SPARK-48012: runs one scenario per ordering of the two partition transforms on the items
// table: (bucket(2, id), identity(arrive_time)) and (identity(arrive_time), bucket(2, id)).
// Each iteration joins items with purchases on id = item_id and arrive_time = time while
// V2_BUCKETING_SHUFFLE_ENABLED is "true", asserts that the executed plan contains exactly one
// shuffle, and checks the three expected joined rows.
// NOTE(review): the extra spaces inside tokens and string literals on these lines look like an
// artifact of the diff rendering - confirm against the real source file.
2053+ test(" SPARK-48012: one-side shuffle with partition transforms" ) {
2054+ val items_partitions = Array (bucket(2 , " id" ), identity(" arrive_time" ))
2055+ val items_partitions2 = Array (identity(" arrive_time" ), bucket(2 , " id" ))
2056+
2057+ Seq (items_partitions, items_partitions2).foreach { partition =>
// The catalog is cleared at the start of every iteration; both tables are created again below.
2058+ catalog.clearTables()
2059+
2060+ createTable(items, itemsColumns, partition)
2061+ sql(s " INSERT INTO testcat.ns. $items VALUES " +
2062+ " (1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " +
2063+ " (1, 'bb', 30.0, cast('2020-01-01' as timestamp)), " +
2064+ " (1, 'cc', 30.0, cast('2020-01-02' as timestamp)), " +
2065+ " (3, 'dd', 10.0, cast('2020-01-01' as timestamp)), " +
2066+ " (4, 'ee', 15.5, cast('2020-02-01' as timestamp)), " +
2067+ " (5, 'ff', 32.1, cast('2020-03-01' as timestamp))" )
2068+
// purchases is created with an empty array of partition transforms.
2069+ createTable(purchases, purchasesColumns, Array .empty)
2070+ sql(s " INSERT INTO testcat.ns. $purchases VALUES " +
2071+ " (1, 42.0, cast('2020-01-01' as timestamp)), " +
2072+ " (2, 10.7, cast('2020-01-01' as timestamp))," +
2073+ " (3, 19.5, cast('2020-02-01' as timestamp))," +
2074+ " (4, 56.5, cast('2020-02-01' as timestamp))" )
2075+
2076+ withSQLConf(
2077+ SQLConf .V2_BUCKETING_SHUFFLE_ENABLED .key -> " true" ) {
2078+ val df = createJoinTestDF(Seq (" id" -> " item_id" , " arrive_time" -> " time" ))
2079+ val shuffles = collectShuffles(df.queryExecution.executedPlan)
// Per the assertion message, the single remaining shuffle is expected on the side that does
// not report a partitioning.
2080+ assert(shuffles.size == 1 , " only shuffle side that does not report partitioning" )
2081+
// Expected rows are the (id, arrive_time) pairs present in both inserted data sets:
// id 1 on 2020-01-01 (items 'aa' and 'bb') and id 4 on 2020-02-01 (item 'ee').
2082+ checkAnswer(df, Seq (
2083+ Row (1 , " bb" , 30.0 , 42.0 ),
2084+ Row (1 , " aa" , 40.0 , 42.0 ),
2085+ Row (4 , " ee" , 15.5 , 56.5 )))
2086+ }
2087+ }
2088+ }
2089+
// SPARK-48012: items is partitioned by (bucket(2, id), identity(arrive_time)) and purchases is
// created with an empty array of partition transforms. The join on id = item_id and
// arrive_time = time is executed twice, with V2_BUCKETING_PUSH_PART_VALUES_ENABLED set to
// "true" and then "false", while V2_BUCKETING_SHUFFLE_ENABLED stays "true". Both runs must
// plan exactly one shuffle and return the same two rows.
// NOTE(review): the extra spaces inside tokens and string literals on these lines look like an
// artifact of the diff rendering - confirm against the real source file.
2090+ test(" SPARK-48012: one-side shuffle with partition transforms and pushdown values" ) {
2091+ val items_partitions = Array (bucket(2 , " id" ), identity(" arrive_time" ))
2092+ createTable(items, itemsColumns, items_partitions)
2093+
2094+ sql(s " INSERT INTO testcat.ns. $items VALUES " +
2095+ " (1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " +
2096+ " (1, 'bb', 30.0, cast('2020-01-01' as timestamp)), " +
2097+ " (1, 'cc', 30.0, cast('2020-01-02' as timestamp))" )
2098+
2099+ createTable(purchases, purchasesColumns, Array .empty)
2100+ sql(s " INSERT INTO testcat.ns. $purchases VALUES " +
2101+ " (1, 42.0, cast('2020-01-01' as timestamp)), " +
2102+ " (2, 10.7, cast('2020-01-01' as timestamp))" )
2103+
// The tables are created once above; only the SQL configuration differs between iterations.
2104+ Seq (true , false ).foreach { pushDown => {
2105+ withSQLConf(
2106+ SQLConf .V2_BUCKETING_SHUFFLE_ENABLED .key -> " true" ,
2107+ SQLConf .V2_BUCKETING_PUSH_PART_VALUES_ENABLED .key ->
2108+ pushDown.toString) {
2109+ val df = createJoinTestDF(Seq (" id" -> " item_id" , " arrive_time" -> " time" ))
2110+ val shuffles = collectShuffles(df.queryExecution.executedPlan)
2111+ assert(shuffles.size == 1 , " only shuffle side that does not report partitioning" )
2112+
// Only id 1 on 2020-01-01 appears in both inserted data sets (items 'aa' and 'bb').
2113+ checkAnswer(df, Seq (
2114+ Row (1 , " bb" , 30.0 , 42.0 ),
2115+ Row (1 , " aa" , 40.0 , 42.0 )))
2116+ }
2117+ }
2118+ }
2119+ }
2120+
// SPARK-48012: items is partitioned by (bucket(2, id), identity(name)) while the join uses only
// id = item_id, i.e. fewer join keys than partition keys. purchases is created with an empty
// array of partition transforms. With the configuration below the test asserts that the
// executed plan still contains two shuffles (the assertion message states that SPJ is not
// triggered for this case for now) and that the join returns the five expected rows.
// The test name previously ended in "partition kes"; the typo is corrected to "partition keys".
// NOTE(review): the extra spaces inside tokens and string literals on these lines look like an
// artifact of the diff rendering - confirm against the real source file.
2121+ test(" SPARK-48012: one-side shuffle with partition transforms " +
2122+ " with fewer join keys than partition keys" ) {
2123+ val items_partitions = Array (bucket(2 , " id" ), identity(" name" ))
2124+ createTable(items, itemsColumns, items_partitions)
2125+
2126+ sql(s " INSERT INTO testcat.ns. $items VALUES " +
2127+ " (1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " +
2128+ " (1, 'aa', 30.0, cast('2020-01-02' as timestamp)), " +
2129+ " (3, 'bb', 10.0, cast('2020-01-01' as timestamp)), " +
2130+ " (4, 'cc', 15.5, cast('2020-02-01' as timestamp))" )
2131+
2132+ createTable(purchases, purchasesColumns, Array .empty)
2133+ sql(s " INSERT INTO testcat.ns. $purchases VALUES " +
2134+ " (1, 42.0, cast('2020-01-01' as timestamp)), " +
2135+ " (1, 89.0, cast('2020-01-03' as timestamp)), " +
2136+ " (3, 19.5, cast('2020-02-01' as timestamp)), " +
2137+ " (5, 26.0, cast('2023-01-01' as timestamp)), " +
2138+ " (6, 50.0, cast('2023-02-01' as timestamp))" )
2139+
2140+ withSQLConf(
2141+ SQLConf .REQUIRE_ALL_CLUSTER_KEYS_FOR_CO_PARTITION .key -> " false" ,
2142+ SQLConf .V2_BUCKETING_SHUFFLE_ENABLED .key -> " true" ,
2143+ SQLConf .V2_BUCKETING_PUSH_PART_VALUES_ENABLED .key -> " true" ,
2144+ SQLConf .V2_BUCKETING_PARTIALLY_CLUSTERED_DISTRIBUTION_ENABLED .key -> " false" ,
2145+ SQLConf .V2_BUCKETING_ALLOW_JOIN_KEYS_SUBSET_OF_PARTITION_KEYS .key -> " true" ) {
// The join is on id only, although items is also partitioned by name.
2146+ val df = createJoinTestDF(Seq (" id" -> " item_id" ))
2147+ val shuffles = collectShuffles(df.queryExecution.executedPlan)
2148+ assert(shuffles.size == 2 , " SPJ should not be triggered for transform expression with" +
2149+ " less join keys than partition keys for now." )
// id 1 matches two item rows against two purchase rows (four results); id 3 matches once.
2150+ checkAnswer(df, Seq (Row (1 , " aa" , 30.0 , 42.0 ),
2151+ Row (1 , " aa" , 30.0 , 89.0 ),
2152+ Row (1 , " aa" , 40.0 , 42.0 ),
2153+ Row (1 , " aa" , 40.0 , 89.0 ),
2154+ Row (3 , " bb" , 10.0 , 19.5 )))
2155+ }
2156+ }
20552157}
0 commit comments