-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-31116][SQL] Fix nested schema case-sensitivity in ParquetRowConverter #27888
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
0de8d20
d6cbb67
9816f66
5e6dd75
fb4880b
4bac37f
6055d22
34e47fb
9709e96
ca07f74
44a9e72
a0d9a19
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -804,6 +804,162 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS | |
| } | ||
| } | ||
|
|
||
| test("SPARK-31116: Select simple parquet columns correctly in case insensitive manner") { | ||
|
||
| withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { | ||
| withTempPath { dir => { | ||
| val path = dir.getCanonicalPath | ||
|
|
||
| // Prepare values for testing specific parquet record reader | ||
| Seq("A").toDF("camelCase").write.parquet(path) | ||
|
|
||
| val exactSchema = new StructType().add("camelCase", StringType) | ||
| checkAnswer(spark.read.schema(exactSchema).parquet(path), Row("A")) | ||
|
|
||
| // In case insensitive manner, parquet's column cases are ignored | ||
| val caseInsensitiveSchema = new StructType().add("camelcase", StringType) | ||
| checkAnswer(spark.read.schema(caseInsensitiveSchema).parquet(path), Row("A")) | ||
| }} | ||
| } | ||
| } | ||
|
|
||
| test("SPARK-31116: Select nested parquet columns correctly in case insensitive manner") { | ||
| withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { | ||
| withTempPath { dir => { | ||
| val path = dir.getCanonicalPath | ||
|
|
||
| // Prepare values for testing nested parquet data | ||
| spark | ||
| .range(1L) | ||
| .selectExpr("NAMED_STRUCT('lowercase', id, 'camelCase', id + 1) AS StructColumn") | ||
| .write.parquet(path) | ||
|
|
||
| val exactSchema = new StructType() | ||
| .add( | ||
| "StructColumn", | ||
| new StructType() | ||
| .add("lowercase", LongType) | ||
| .add("camelCase", LongType)) | ||
| checkAnswer(spark.read.schema(exactSchema).parquet(path), Row(Row(0, 1))) | ||
|
|
||
| // In case insensitive manner, parquet's column cases are ignored | ||
| val innerColumnCaseInsensitiveSchema = new StructType() | ||
| .add( | ||
| "StructColumn", | ||
| new StructType() | ||
| .add("LowerCase", LongType) | ||
| .add("camelcase", LongType)) | ||
| checkAnswer( | ||
| spark.read.schema(innerColumnCaseInsensitiveSchema).parquet(path), | ||
| Row(Row(0, 1))) | ||
|
|
||
| val rootColumnCaseInsensitiveSchema = new StructType() | ||
| .add( | ||
| "structColumn", | ||
| new StructType() | ||
| .add("lowercase", LongType) | ||
| .add("camelCase", LongType)) | ||
| checkAnswer( | ||
| spark.read.schema(rootColumnCaseInsensitiveSchema).parquet(path), | ||
| Row(Row(0, 1))) | ||
| }} | ||
| } | ||
| } | ||
|
|
||
| test("SPARK-31116: Select simple parquet columns correctly in case sensitive manner") { | ||
| withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { | ||
| withTempPath { dir => { | ||
| val path = dir.getCanonicalPath | ||
|
|
||
| // Prepare values for testing specific parquet record reader | ||
| Seq("A").toDF("camelCase").write.parquet(path) | ||
|
|
||
| val exactSchema = new StructType().add("camelCase", StringType) | ||
| checkAnswer(spark.read.schema(exactSchema).parquet(path), Row("A")) | ||
|
|
||
| // In case sensitive manner, different letter case does not read column | ||
| val caseInsensitiveSchema = new StructType().add("camelcase", StringType) | ||
| checkAnswer(spark.read.schema(caseInsensitiveSchema).parquet(path), Row(null)) | ||
|
|
||
| // It also properly work in combined schema | ||
| val combinedSchema = new StructType() | ||
| .add("camelCase", StringType) | ||
| .add("camelcase", StringType) | ||
| checkAnswer(spark.read.schema(combinedSchema).parquet(path), Row("A", null)) | ||
| }} | ||
| } | ||
| } | ||
|
|
||
| test("SPARK-31116: Select nested parquet columns correctly in case sensitive manner") { | ||
| withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { | ||
| withTempPath { dir => { | ||
| val path = dir.getCanonicalPath | ||
|
|
||
| // Prepare values for testing nested parquet data | ||
| spark | ||
| .range(1) | ||
| .selectExpr("NAMED_STRUCT('lowercase', id, 'camelCase', id + 1) AS StructColumn") | ||
| .write.parquet(path) | ||
|
|
||
| val exactSchema = new StructType() | ||
| .add( | ||
| "StructColumn", | ||
| new StructType() | ||
| .add("lowercase", LongType) | ||
| .add("camelCase", LongType)) | ||
| checkAnswer(spark.read.schema(exactSchema).parquet(path), Row(Row(0, 1))) | ||
|
|
||
| val innerColumnCaseInsensitiveSchema = new StructType() | ||
| .add( | ||
| "StructColumn", | ||
| new StructType() | ||
| .add("LowerCase", LongType) | ||
| .add("camelcase", LongType)) | ||
| checkAnswer( | ||
| spark.read.schema(innerColumnCaseInsensitiveSchema).parquet(path), | ||
| Row(null)) | ||
|
|
||
| val innerPartialColumnCaseInsensitiveSchema = new StructType() | ||
| .add( | ||
| "StructColumn", | ||
| new StructType() | ||
| .add("lowercase", LongType) | ||
| .add("camelcase", LongType)) | ||
| checkAnswer( | ||
| spark.read.schema(innerPartialColumnCaseInsensitiveSchema).parquet(path), | ||
| Row(Row(0, null))) | ||
|
|
||
| val rootColumnCaseInsensitiveSchema = new StructType() | ||
| .add( | ||
| "structColumn", | ||
| new StructType() | ||
| .add("lowercase", LongType) | ||
| .add("camelCase", LongType)) | ||
| checkAnswer( | ||
| spark.read.schema(rootColumnCaseInsensitiveSchema).parquet(path), | ||
| Row(null)) | ||
|
|
||
| val combinedSchema = new StructType() | ||
| .add( | ||
| "StructColumn", | ||
| new StructType() | ||
| .add("lowercase", LongType) | ||
| .add("camelCase", LongType) | ||
| .add("LowerCase", LongType) | ||
| .add("camelcase", LongType)) | ||
| .add( | ||
| "structColumn", | ||
| new StructType() | ||
| .add("lowercase", LongType) | ||
| .add("camelCase", LongType) | ||
| .add("LowerCase", LongType) | ||
| .add("camelcase", LongType)) | ||
| checkAnswer( | ||
| spark.read.schema(combinedSchema).parquet(path), | ||
| Row(Row(0, 1, null, null), null)) | ||
| }} | ||
| } | ||
| } | ||
|
|
||
| test("Migration from INT96 to TIMESTAMP_MICROS timestamp type") { | ||
| def testMigration(fromTsType: String, toTsType: String): Unit = { | ||
| def checkAppend(write: DataFrameWriter[_] => Unit, readback: => DataFrame): Unit = { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure why you need to pass
`caseSensitive` across `ParquetRecordMaterializer` and `ParquetRowConverter`. Can't we just get it at `ParquetRowConverter`?

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can I get runtime config at
`ParquetRowConverter`? I don't concretely understand its behavior.

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`SQLConf.get` works, even on the executor side — see dd37529.

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks! I'll update it to use `SQLConf` instead of passing the argument across classes.