[SPARK-18752][hive] "isSrcLocal" value should be set from user query. #16179
@@ -316,7 +316,8 @@ case class InsertIntoHiveTable(
           partitionSpec,
           isOverwrite = doHiveOverwrite,
           holdDDLTime = holdDDLTime,
-          inheritTableSpecs = inheritTableSpecs)
+          inheritTableSpecs = inheritTableSpecs,
+          isSrcLocal = false)
         }
       }
     } else {

@@ -325,7 +326,8 @@ case class InsertIntoHiveTable(
       table.catalogTable.identifier.table,
       outputPath.toString, // TODO: URI
       overwrite,
-      holdDDLTime)
+      holdDDLTime,
+      isSrcLocal = false)
     }

     // Invalidate the cache.
Member:
Then, how can we know this is always not a local file system (e.g., as you said above, if your warehouse directory is in the local file system too)?

Contributor (author):
We don't need to. "isSrcLocal" comes from the user query: "LOAD DATA LOCAL" -> "isSrcLocal" = true.

Member:
I see the reason why we can set it to false.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,10 @@ | |
|
|
||
| package org.apache.spark.sql.hive.execution | ||
|
|
||
| import java.io.File | ||
|
|
||
| import com.google.common.io.Files | ||
|
|
||
| import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SaveMode} | ||
| import org.apache.spark.sql.catalyst.TableIdentifier | ||
| import org.apache.spark.sql.catalyst.analysis.NoSuchTableException | ||
|
|
@@ -232,31 +236,40 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleton
       sql("""LOAD DATA LOCAL INPATH "/non-existing/data.txt" INTO TABLE non_part_table""")
     }

-    val testData = hiveContext.getHiveFile("data/files/employee.dat").getCanonicalPath
+    val testData = hiveContext.getHiveFile("data/files/employee.dat")

     // Non-local inpath: without URI Scheme and Authority
-    sql(s"""LOAD DATA INPATH "$testData" INTO TABLE non_part_table""")
+    withCopy(testData) { tmp =>
+      sql(s"""LOAD DATA INPATH "${tmp.getCanonicalPath()}" INTO TABLE non_part_table""")
+    }

     checkAnswer(
       sql("SELECT * FROM non_part_table WHERE employeeID = 16"),
       Row(16, "john") :: Nil)

     // Use URI as LOCAL inpath:
     // file:/path/to/data/files/employee.dat
-    val uri = "file:" + testData
+    val uri = "file:" + testData.getCanonicalPath()
     sql(s"""LOAD DATA LOCAL INPATH "$uri" INTO TABLE non_part_table""")

     checkAnswer(
       sql("SELECT * FROM non_part_table WHERE employeeID = 16"),
       Row(16, "john") :: Row(16, "john") :: Nil)

     // Use URI as non-LOCAL inpath
-    sql(s"""LOAD DATA INPATH "$uri" INTO TABLE non_part_table""")
+    withCopy(testData) { tmp =>
+      val tmpUri = "file:" + tmp.getCanonicalPath()
+      sql(s"""LOAD DATA INPATH "$tmpUri" INTO TABLE non_part_table""")
+    }

     checkAnswer(
       sql("SELECT * FROM non_part_table WHERE employeeID = 16"),
       Row(16, "john") :: Row(16, "john") :: Row(16, "john") :: Nil)

-    sql(s"""LOAD DATA INPATH "$uri" OVERWRITE INTO TABLE non_part_table""")
+    withCopy(testData) { tmp =>
+      val tmpUri = "file:" + tmp.getCanonicalPath()
+      sql(s"""LOAD DATA INPATH "$tmpUri" OVERWRITE INTO TABLE non_part_table""")
+    }

     checkAnswer(
       sql("SELECT * FROM non_part_table WHERE employeeID = 16"),
@@ -418,4 +431,19 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleton
       assert(sql("SHOW PARTITIONS part_datasrc").count() == 3)
     }
   }
+
+  /**
+   * Run a function with a copy of the input file. Use this for tests that use "LOAD DATA"
+   * (instead of "LOAD DATA LOCAL") since, according to Hive's semantics, files are moved
+   * into the target location in that case, and we need the original file to be preserved.
+   */
+  private def withCopy(source: File)(fn: File => Unit): Unit = {
+    val tmp = File.createTempFile(source.getName(), ".tmp")
+    Files.copy(source, tmp)
+    try {
+      fn(tmp)
+    } finally {
+      tmp.delete()
+    }
+  }
 }
Reviewer:
What does isSrcLocal mean? Can you document it?

Author:
It means the source data comes from a "LOAD DATA LOCAL" query.
I can add a partial scaladoc to these methods, but I don't really know the meaning of some of the other arguments, so I can't write a complete one.
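A partial scaladoc along those lines might look like the sketch below; the trait name and the abbreviated parameter list are illustrative, not the exact HiveClient interface touched by the patch:

  trait HiveClientSketch {
    /**
     * Loads data from `loadPath` into an existing Hive table.
     *
     * @param isSrcLocal whether the source files are on the local file system of the
     *                   client issuing the query, i.e. the statement was
     *                   "LOAD DATA LOCAL INPATH ...". This decides whether Hive copies
     *                   the source into the table location or moves it there, and so
     *                   whether the original files are preserved.
     */
    def loadTable(
        loadPath: String,
        tableName: String,
        replace: Boolean,
        holdDDLTime: Boolean,
        isSrcLocal: Boolean): Unit
  }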