-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-14997][SQL] Fixed FileCatalog to return correct set of files when there is no partitioning scheme in the given paths #12856
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
b1f82ce
3bb42bd
2f7c523
4198d56
f1b793a
6779bd9
efd261f
f15ee32
072c28b
2b43ae4
9a64496
8c06d4e
7c5d7ba
33a1345
8abc999
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,8 @@ | |
|
|
||
| package org.apache.spark.sql.sources | ||
|
|
||
| import java.io.File | ||
|
|
||
| import scala.util.Random | ||
|
|
||
| import org.apache.hadoop.fs.Path | ||
|
|
@@ -486,7 +488,151 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes | |
| } | ||
| } | ||
|
|
||
| test("Hadoop style globbing") { | ||
| test("load() - with directory of unpartitioned data in nested subdirs") { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we put it in partition discovery suite? If we put it at here, we will run it with every data source, right?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well, probably it should run with all data sources to make sure no data source violates this rule. This is not really partition discovery, but more like what to do when there is no partition to discover. |
||
| withTempPath { dir => | ||
| val subdir = new File(dir, "subdir") | ||
|
|
||
| val dataInDir = Seq(1, 2, 3).toDF("value") | ||
| val dataInSubdir = Seq(4, 5, 6).toDF("value") | ||
|
|
||
| /* | ||
|
|
||
| Directory structure to be generated | ||
|
|
||
| dir | ||
| | | ||
| |___ [ files of dataInDir ] | ||
| | | ||
| |___ subdir | ||
| | | ||
| |___ [ files of dataInSubdir ] | ||
| */ | ||
|
|
||
| // Generate dataInSubdir, not dataInDir | ||
| dataInSubdir.write | ||
| .format(dataSourceName) | ||
| .mode(SaveMode.Overwrite) | ||
| .save(subdir.getCanonicalPath) | ||
|
|
||
| // Inferring schema should throw error as it should not find any file to infer | ||
| val e = intercept[Exception] { | ||
| sqlContext.read.format(dataSourceName).load(dir.getCanonicalPath) | ||
| } | ||
|
|
||
| e match { | ||
| case _: AnalysisException => | ||
| assert(e.getMessage.contains("infer")) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where will this error be thrown (the place where we complain that there is no file)?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
|
||
| case _: java.util.NoSuchElementException if e.getMessage.contains("dataSchema") => | ||
| // ignore, the source format requires schema to be provided by user | ||
|
||
|
|
||
| case _ => | ||
| fail("Unexpected error trying to infer schema from empty dir", e) | ||
| } | ||
|
|
||
| /** Test whether data is read with the given path matches the expected answer */ | ||
| def testWithPath(path: File, expectedAnswer: Seq[Row]): Unit = { | ||
| val df = sqlContext.read | ||
| .format(dataSourceName) | ||
| .schema(dataInDir.schema) // avoid schema inference for any format | ||
| .load(path.getCanonicalPath) | ||
| checkAnswer(df, expectedAnswer) | ||
| } | ||
|
|
||
| // Verify that reading by path 'dir/' gives empty results as there are no files in 'dir' | ||
| // and it should not pick up files in 'dir/subdir' | ||
| require(subdir.exists) | ||
| require(subdir.listFiles().exists(!_.isDirectory)) | ||
| testWithPath(dir, Seq.empty) | ||
|
|
||
| // Verify that if there is data in dir, then reading by path 'dir/' reads only dataInDir | ||
| dataInDir.write | ||
| .format(dataSourceName) | ||
| .mode(SaveMode.Append) // append to prevent subdir from being deleted | ||
| .save(dir.getCanonicalPath) | ||
| require(dir.listFiles().exists(!_.isDirectory)) | ||
| require(subdir.exists()) | ||
| require(subdir.listFiles().exists(!_.isDirectory)) | ||
| testWithPath(dir, dataInDir.collect()) | ||
| } | ||
| } | ||
|
|
||
| test("Hadoop style globbing - unpartitioned data") { | ||
| withTempPath { file => | ||
|
|
||
| val dir = file.getCanonicalPath | ||
| val subdir = new File(dir, "subdir") | ||
| val subsubdir = new File(subdir, "subsubdir") | ||
| val anotherSubsubdir = | ||
| new File(new File(dir, "another-subdir"), "another-subsubdir") | ||
|
|
||
| val dataInSubdir = Seq(1, 2, 3).toDF("value") | ||
| val dataInSubsubdir = Seq(4, 5, 6).toDF("value") | ||
| val dataInAnotherSubsubdir = Seq(7, 8, 9).toDF("value") | ||
|
|
||
| dataInSubdir.write | ||
| .format (dataSourceName) | ||
| .mode (SaveMode.Overwrite) | ||
| .save (subdir.getCanonicalPath) | ||
|
|
||
| dataInSubsubdir.write | ||
| .format (dataSourceName) | ||
| .mode (SaveMode.Overwrite) | ||
| .save (subsubdir.getCanonicalPath) | ||
|
|
||
| dataInAnotherSubsubdir.write | ||
| .format (dataSourceName) | ||
| .mode (SaveMode.Overwrite) | ||
| .save (anotherSubsubdir.getCanonicalPath) | ||
|
|
||
| require(subdir.exists) | ||
| require(subdir.listFiles().exists(!_.isDirectory)) | ||
| require(subsubdir.exists) | ||
| require(subsubdir.listFiles().exists(!_.isDirectory)) | ||
| require(anotherSubsubdir.exists) | ||
| require(anotherSubsubdir.listFiles().exists(!_.isDirectory)) | ||
|
|
||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove empty lines |
||
| /* | ||
| Directory structure generated | ||
|
|
||
| dir | ||
| | | ||
| |___ subdir | ||
| | | | ||
| | |___ [ files of dataInSubdir ] | ||
| | | | ||
| | |___ subsubdir | ||
| | | | ||
| | |___ [ files of dataInSubsubdir ] | ||
| | | ||
| | | ||
| |___ anotherSubdir | ||
| | | ||
| |___ anotherSubsubdir | ||
| | | ||
| |___ [ files of dataInAnotherSubsubdir ] | ||
| */ | ||
|
|
||
| val schema = dataInSubdir.schema | ||
|
|
||
| /** Test whether data is read with the given path matches the expected answer */ | ||
| def testWithPath(path: String, expectedDf: DataFrame): Unit = { | ||
| val df = sqlContext.read | ||
| .format(dataSourceName) | ||
| .schema(schema) // avoid schema inference for any format | ||
| .load(path) | ||
| checkAnswer(df, expectedDf) | ||
| } | ||
|
|
||
| testWithPath(s"$dir/*/", dataInSubdir) | ||
| testWithPath(s"$dir/sub*/*", dataInSubdir.union(dataInSubsubdir)) | ||
| testWithPath(s"$dir/another*/*", dataInAnotherSubsubdir) | ||
| testWithPath(s"$dir/*/another*", dataInAnotherSubsubdir) | ||
| testWithPath(s"$dir/*/*", dataInSubdir.union(dataInSubsubdir).union(dataInAnotherSubsubdir)) | ||
| } | ||
| } | ||
|
|
||
| test("Hadoop style globbing - partitioned data") { | ||
| withTempPath { file => | ||
| partitionedTestDF.write | ||
| .format(dataSourceName) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we use
qualifiedPathinstead ofpath?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
leafFilescontain all qualified paths, right?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh right. I forgot to address that comment. Sorry!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about we add a test to make sure all paths in leafFiles contain the scheme?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added a new FileCatalogSuite
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
seems we still need to update this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how about we add a test that will not pass if we use
pathat here?