-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-37627][SQL][FOLLOWUP] Separate SortedBucketTransform from BucketTransform #34914
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
73bd0e6
4db60f7
6b9f42c
00a90da
51ead2b
3f220d0
61dc795
77b2c12
dac5693
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -104,24 +104,29 @@ private[sql] final case class BucketTransform( | |
| columns: Seq[NamedReference], | ||
| sortedColumns: Seq[NamedReference] = Seq.empty[NamedReference]) extends RewritableTransform { | ||
|
|
||
| override val name: String = "bucket" | ||
| override val name: String = if (sortedColumns.nonEmpty) "sortedBucket" else "bucket" | ||
|
|
||
| override def references: Array[NamedReference] = { | ||
| arguments.collect { case named: NamedReference => named } | ||
| } | ||
|
|
||
| override def arguments: Array[Expression] = numBuckets +: columns.toArray | ||
|
|
||
| override def toString: String = | ||
| override def arguments: Array[Expression] = { | ||
| if (sortedColumns.nonEmpty) { | ||
| s"bucket(${arguments.map(_.describe).mkString(", ")}," + | ||
| s" ${sortedColumns.map(_.describe).mkString(", ")})" | ||
| (columns.toArray :+ numBuckets) ++ sortedColumns | ||
| } else { | ||
| s"bucket(${arguments.map(_.describe).mkString(", ")})" | ||
| numBuckets +: columns.toArray | ||
|
||
| } | ||
| } | ||
|
|
||
| override def toString: String = s"$name(${arguments.map(_.describe).mkString(", ")})" | ||
|
|
||
| override def withReferences(newReferences: Seq[NamedReference]): Transform = { | ||
| this.copy(columns = newReferences) | ||
| if (sortedColumns.isEmpty) { | ||
| this.copy(columns = newReferences) | ||
| } else { | ||
| val splits = newReferences.grouped(columns.length).toList | ||
| this.copy(columns = splits(0), sortedColumns = splits(1)) | ||
|
||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -140,15 +145,22 @@ private[sql] object BucketTransform { | |
| } | ||
|
|
||
| def unapply(transform: Transform): Option[(Int, NamedReference, NamedReference)] = | ||
| transform match { | ||
| case NamedTransform("bucket", Seq( | ||
| Lit(value: Int, IntegerType), | ||
| Ref(partCols: Seq[String]), | ||
| Ref(sortCols: Seq[String]))) => | ||
| Some((value, FieldReference(partCols), FieldReference(sortCols))) | ||
| case NamedTransform("bucket", Seq( | ||
| Lit(value: Int, IntegerType), | ||
| Ref(partCols: Seq[String]))) => | ||
| transform match { | ||
| case NamedTransform("sortedBucket", s) => | ||
| var index: Int = -1 | ||
| var posOfLit: Int = -1 | ||
| var numOfBucket: Int = -1 | ||
| s.foreach { | ||
| case Lit(value: Int, IntegerType) => | ||
| numOfBucket = value | ||
| index = index + 1 | ||
| posOfLit = index | ||
| case _ => index = index + 1 | ||
| } | ||
| val splits = s.splitAt(posOfLit) | ||
| Some(numOfBucket, FieldReference( | ||
| splits._1.map(_.describe)), FieldReference(splits._2.drop(1).map(_.describe))) | ||
| case NamedTransform("bucket", Seq(Lit(value: Int, IntegerType), Ref(partCols: Seq[String]))) => | ||
| Some((value, FieldReference(partCols), FieldReference(Seq.empty[String]))) | ||
| case _ => | ||
| None | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1566,18 +1566,22 @@ class DataSourceV2SQLSuite | |
| test("create table using - with sorted bucket") { | ||
| val identifier = "testcat.table_name" | ||
| withTable(identifier) { | ||
| sql(s"CREATE TABLE $identifier (a int, b string, c int) USING $v2Source PARTITIONED BY (c)" + | ||
| s" CLUSTERED BY (b) SORTED by (a) INTO 4 BUCKETS") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why changing this test?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just want to make sure multiple columns/sortedColumns work ok.
||
| val table = getTableMetadata(identifier) | ||
| sql(s"CREATE TABLE $identifier (a int, b string, c int, d int, e int, f int) USING" + | ||
| s" $v2Source PARTITIONED BY (a, b) CLUSTERED BY (c, d) SORTED by (e, f) INTO 4 BUCKETS") | ||
| val describe = spark.sql(s"DESCRIBE $identifier") | ||
| describe.show(false) | ||
| val part1 = describe | ||
| .filter("col_name = 'Part 0'") | ||
| .select("data_type").head.getString(0) | ||
| assert(part1 === "c") | ||
| assert(part1 === "a") | ||
| val part2 = describe | ||
| .filter("col_name = 'Part 1'") | ||
| .select("data_type").head.getString(0) | ||
| assert(part2 === "bucket(4, b, a)") | ||
| assert(part2 === "b") | ||
| val part3 = describe | ||
| .filter("col_name = 'Part 2'") | ||
| .select("data_type").head.getString(0) | ||
| assert(part3 === "sortedBucket(c, d, 4, e, f)") | ||
| } | ||
| } | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we create a new class `SortedBucketTransform` to be clearer?

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added a new class `SortedBucketTransform`. Thanks!