-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-26709][SQL] OptimizeMetadataOnlyQuery does not handle empty records correctly #23635
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
6bd09b1
1c81586
15cc872
8399d77
1524205
f7dac39
9e99d4b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -585,9 +585,10 @@ object SQLConf { | |
| .doc("When true, enable the metadata-only query optimization that use the table's metadata " + | ||
| "to produce the partition columns instead of table scans. It applies when all the columns " + | ||
| "scanned are partition columns and the query has an aggregate operator that satisfies " + | ||
| "distinct semantics.") | ||
| "distinct semantics. By default the optimization is disabled, since it may return " + | ||
| "incorrect results with empty tables.") | ||
|
||
| .booleanConf | ||
| .createWithDefault(true) | ||
| .createWithDefault(false) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How about making this conf internal and always printing a warning message when this flag is enabled and the rule is applied? |
||
|
|
||
| val COLUMN_NAME_OF_CORRUPT_RECORD = buildConf("spark.sql.columnNameOfCorruptRecord") | ||
| .doc("The name of internal column for storing raw/un-parsed JSON and CSV records that fail " + | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2422,7 +2422,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { | |
| Row(s"$expected") :: Nil) | ||
| } | ||
|
|
||
| test("SPARK-15752 optimize metadata only query for datasource table") { | ||
| ignore("SPARK-15752 optimize metadata only query for datasource table") { | ||
|
||
| withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") { | ||
| withTable("srcpart_15752") { | ||
| val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "a" else "b")) | ||
|
|
@@ -2966,6 +2966,17 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { | |
| } | ||
| } | ||
| } | ||
|
|
||
| test("SPARK-26709: OptimizeMetadataOnlyQuery does not handle empty records correctly") { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also add a case that illustrates that |
||
| withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") { | ||
| withTable("t") { | ||
| sql("CREATE TABLE t (col1 INT, p1 INT) USING PARQUET PARTITIONED BY (p1)") | ||
| sql("INSERT INTO TABLE t PARTITION (p1 = 5) SELECT ID FROM range(1, 1)") | ||
| checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(null)) | ||
| checkAnswer(sql("SELECT MAX(col1) FROM t"), Row(null)) | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| case class Foo(bar: Option[String]) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The configuration is not only about Parquet. Since it is disabled, and to avoid confusing users, I think we can just remove the doc here.