[SPARK-5950][SQL] Enable inserting array into Hive table saved as Parquet using DataSource API #4729
Changes from 3 commits
4e3bd55, 0e07bb8, 175966f, 2949324
@@ -424,7 +424,7 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with
     // Collects all `MetastoreRelation`s which should be replaced
     val toBeReplaced = plan.collect {
       // Write path
-      case InsertIntoTable(relation: MetastoreRelation, _, _, _)
+      case InsertIntoHiveTable(relation: MetastoreRelation, _, _, _)
          // Inserting into partitioned table is not supported in Parquet data source (yet).
          if !relation.hiveQlTable.isPartitioned &&
            hive.convertMetastoreParquet &&
@@ -458,6 +458,9 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with
           withAlias
         }
+      case InsertIntoHiveTable(r: MetastoreRelation, p, c, o) if relationMap.contains(r) =>
+        val parquetRelation = relationMap(r)
+        InsertIntoHiveTable(parquetRelation, p, c, o)

       case other => other.transformExpressions {
         case a: Attribute if a.resolved => attributedRewrites.getOrElse(a, a)
       }
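To make the intent of the two hunks above easier to follow, here is a toy, self-contained model of the rewrite pattern (plain Scala; the class names below are simplified stand-ins, not the real Spark types, which carry more fields): the rule collects convertible `MetastoreRelation`s into a `relationMap` and must then rewrite both the read path (the relation itself) and the write path (the Hive-specific logical insert node that wraps it), which is what the added `InsertIntoHiveTable` case does.

```scala
// Toy sketch only -- simplified stand-ins for the Spark plan nodes above.
sealed trait Plan
case class MetastoreRelation(table: String) extends Plan
case class ParquetRelation(table: String) extends Plan
case class InsertIntoHiveTable(relation: Plan, child: Plan) extends Plan
case class Project(child: Plan) extends Plan

def convert(plan: Plan, relationMap: Map[Plan, Plan]): Plan = plan match {
  // Write path: rebuild the insert against the converted Parquet relation.
  case InsertIntoHiveTable(r: MetastoreRelation, child) if relationMap.contains(r) =>
    InsertIntoHiveTable(relationMap(r), convert(child, relationMap))
  // Read path: replace the relation itself.
  case r: MetastoreRelation if relationMap.contains(r) =>
    relationMap(r)
  case Project(child) => Project(convert(child, relationMap))
  case other => other
}

// Both the insert target and any scans get swapped to the Parquet relation.
val rel = MetastoreRelation("persisted_table")
val plan = InsertIntoHiveTable(rel, Project(MetastoreRelation("tmp_table")))
val converted = convert(plan, Map[Plan, Plan](rel -> ParquetRelation("persisted_table")))
```

Before this change, only the generic `InsertIntoTable` node was matched on the write path, so an insert that had already been turned into the Hive-specific logical node slipped past the conversion.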
@@ -20,6 +20,8 @@ package org.apache.spark.sql.parquet

import java.io.File

import scala.collection.mutable.ArrayBuffer

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.sql.{SQLConf, QueryTest}
@@ -299,6 +301,37 @@ class ParquetDataSourceOnSourceSuite extends ParquetSourceSuiteBase
    super.afterAll()
    setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, originalConf.toString)
  }

  test("insert array into parquet hive table using data source api") {
Review comment (Contributor): I just tried this test with our master, it did not fail. I think you need to first turn off the conversion for the write path and then turn on the conversion for the read path. You can use
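As a rough illustration of the write-off / read-on setup suggested above (a sketch only: the `setConf` calls and the `spark.sql.hive.convertMetastoreParquet` key are assumptions, not taken from this PR's test):

```scala
// Sketch: write while Metastore Parquet conversion is disabled, so the data is
// written by Hive's own Parquet path, then read it back with conversion enabled,
// so the read goes through the Parquet data source code path this PR targets.
setConf("spark.sql.hive.convertMetastoreParquet", "false")
sql("insert into table persisted_table select * from tmp_table").collect()

setConf("spark.sql.hive.convertMetastoreParquet", "true")
checkAnswer(
  sql("select timestamp from persisted_table"),
  Row(1422435598L) :: Row(1422435599L) :: Nil)
```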
    val data1 = """{ "timestamp": 1422435598, "data_array": [ { "field0": null, "field1": 1, "field2": 2} ] }"""
    val data2 = """{ "timestamp": 1422435599, "data_array": [ { "field0": 0, "field1": null, "field2": 3} ] }"""

    val json = sparkContext.makeRDD(data1 :: data2 :: Nil)
    val rdd = jsonRDD(json)
    rdd.registerTempTable("tmp_table")

    val partitionedTableDir = File.createTempFile("persisted_table", "sparksql")
    partitionedTableDir.delete()
    partitionedTableDir.mkdir()

    sql(
      s"""
        |create external table persisted_table
        |(
        |  data_array ARRAY <STRUCT<field0: BIGINT, field1: BIGINT, field2: BIGINT>>,
        |  timestamp BIGINT
        |)
        |STORED AS PARQUET Location '${partitionedTableDir.getCanonicalPath}'
      """.stripMargin)

    sql("insert into table persisted_table select * from tmp_table").collect()

    checkAnswer(
      sql("select data_array.field0, data_array.field1, data_array.field2 from persisted_table"),
      Row(ArrayBuffer(null), ArrayBuffer(1), ArrayBuffer(2)) ::
        Row(ArrayBuffer(0), ArrayBuffer(null), ArrayBuffer(3)) :: Nil
    )
  }
}
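A note on the expected rows in `checkAnswer` above: selecting a struct field through an array column yields one array per row (one element per struct in that row's array), which is why the expectations are `ArrayBuffer`s rather than scalars. A tiny plain-Scala illustration of that shape (`Element` and `Record` are made-up stand-ins for the JSON records in the test):

```scala
// Each input row carries an array of structs; selecting data_array.field1
// returns one array per row, with one entry per struct.
case class Element(field0: Option[Long], field1: Option[Long], field2: Option[Long])
case class Record(timestamp: Long, dataArray: Seq[Element])

val rows = Seq(
  Record(1422435598L, Seq(Element(None, Some(1L), Some(2L)))),
  Record(1422435599L, Seq(Element(Some(0L), None, Some(3L)))))

// The equivalent of "select data_array.field1 from persisted_table":
val projected = rows.map(_.dataArray.map(_.field1))
// projected == Seq(Seq(Some(1L)), Seq(None)) -- one array per row, matching
// Row(ArrayBuffer(1)) and Row(ArrayBuffer(null)) in the test's expectations.
```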

class ParquetDataSourceOffSourceSuite extends ParquetSourceSuiteBase {
Review comment: I don't think this is right here. ParquetConversions is an analysis rule, which only processes logical plans. However, InsertIntoHiveTable is a physical plan node.

Reply: InsertIntoHiveTable is a LogicalPlan defined in HiveMetastoreCatalog.

Reply: Oh sorry, I mistook this for the physical plan with the same name...
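For readers puzzled by this exchange, the confusion is that two classes share the name `InsertIntoHiveTable`. A rough sketch of the distinction (the stub trait and the field list below are approximations, not copied from Spark source):

```scala
// Stub so the sketch compiles on its own; in Spark this is the Catalyst LogicalPlan.
trait LogicalPlan

// Logical node defined alongside HiveMetastoreCatalog; analysis rules such as
// ParquetConversions only ever see logical plans, so this is the node the new
// `case InsertIntoHiveTable(...)` above matches. Field names/types approximate.
case class InsertIntoHiveTable(
    table: LogicalPlan,
    partition: Map[String, Option[String]],
    child: LogicalPlan,
    overwrite: Boolean) extends LogicalPlan

// A *physical* operator named InsertIntoHiveTable also exists, in
// org.apache.spark.sql.hive.execution; it is produced by the planner after
// analysis, so an analysis rule never encounters it.
```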