[SPARK-14997][SQL] Fixed FileCatalog to return correct set of files when there is no partitioning scheme in the given paths #12856
Changes from 1 commit
```diff
@@ -291,8 +291,12 @@ class HDFSFileCatalog(
   refresh()
 
   override def listFiles(filters: Seq[Expression]): Seq[Partition] = {
     if (partitionSpec().partitionColumns.isEmpty) {
-      Partition(InternalRow.empty, allFiles().filterNot(_.getPath.getName startsWith "_")) :: Nil
+      Partition(
+        InternalRow.empty,
+        unpartitionedDataFiles().filterNot(_.getPath.getName startsWith "_")
+      ) :: Nil
     } else {
       prunePartitions(filters, partitionSpec()).map {
         case PartitionDirectory(values, path) =>
```
```diff
@@ -337,7 +341,13 @@ class HDFSFileCatalog(
     }
   }
 
-  def allFiles(): Seq[FileStatus] = leafFiles.values.toSeq
+  def allFiles(): Seq[FileStatus] = {
+    if (partitionSpec().partitionColumns.isEmpty) {
+      unpartitionedDataFiles()
+    } else {
+      leafFiles.values.toSeq
+    }
+  }
 
   def getStatus(path: Path): Array[FileStatus] = leafDirToChildrenFiles(path)
```
```diff
@@ -387,7 +397,7 @@ class HDFSFileCatalog(
     }
   }
 
-  def inferPartitioning(schema: Option[StructType]): PartitionSpec = {
+  private def inferPartitioning(schema: Option[StructType]): PartitionSpec = {
     // We use leaf dirs containing data files to discover the schema.
     val leafDirs = leafDirToChildrenFiles.keys.toSeq
     schema match {
```
```diff
@@ -443,6 +453,22 @@ class HDFSFileCatalog(
     }
   }
 
+  /** List of files to consider when there is no inferred partitioning scheme */
+  private def unpartitionedDataFiles(): Seq[FileStatus] = {
+    // For each of the input paths, get the list of files inside them
+    paths.flatMap { path =>
+      // Make the path qualified (consistent with listLeafFiles and listLeafFilesInParallel).
+      val fs = path.getFileSystem(hadoopConf)
+      val qualifiedPath = path.makeQualified(fs.getUri, fs.getWorkingDirectory)
+
+      // If it is a directory (i.e. exists in leafDirToChildrenFiles), return its children files;
+      // if it is a file (i.e. exists in leafFiles), return the path itself
+      leafDirToChildrenFiles.get(qualifiedPath).orElse {
+        leafFiles.get(qualifiedPath).map(Array(_))
+      }.getOrElse(Array.empty)
+    }
+  }
+
   def refresh(): Unit = {
     val files = listLeafFiles(paths)
```
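As a side note on the qualification step above: Hadoop path lookups only match if both sides are qualified the same way, which is why `unpartitionedDataFiles()` qualifies each input path before probing the maps built by `listLeafFiles`. The standalone snippet below (not part of the diff; the `QualifyDemo` object and `/tmp/data` path are made up for illustration) shows what `makeQualified` produces on a local filesystem.

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

object QualifyDemo {
  def main(args: Array[String]): Unit = {
    val hadoopConf = new Configuration()
    val path = new Path("/tmp/data")
    val fs = path.getFileSystem(hadoopConf)
    // Attaches the filesystem's scheme and authority to the raw path,
    // e.g. file:/tmp/data on a local filesystem, so it can match keys
    // stored in qualified form.
    val qualifiedPath = path.makeQualified(fs.getUri, fs.getWorkingDirectory)
    println(qualifiedPath)
  }
}
```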
```diff
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.sources
 
+import java.io.File
+
 import scala.util.Random
 
 import org.apache.hadoop.fs.Path
```
```diff
@@ -486,7 +488,133 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes
     }
   }
 
-  test("Hadoop style globbing") {
+  test("load() - with directory of unpartitioned data in nested subdirs") {
```
**Contributor:** Should we put it in the partition discovery suite? If we put it here, we will run it with every data source, right?

**Author:** It probably should run with all data sources, to make sure no data source violates this rule. This is not really partition discovery; it is more about what to do when there is no partition to discover.
```diff
+    withTempPath { file =>
+      val dir = file.getCanonicalPath
+      val subdir = new File(dir, "subdir").getCanonicalPath
+
+      val dataInDir = Seq(1, 2, 3).toDF("value")
+      val dataInSubdir = Seq(4, 5, 6).toDF("value")
+
+      /*
+        Directory structure to be generated
+
+        dir
+          |
+          |___ [ files of dataInDir ]
+          |
+          |___ subdir
+                 |
+                 |___ [ files of dataInSubdir ]
+       */
+
+      // Generate dataInSubdir first, without generating the data in dir
+      dataInSubdir.write
+        .format(dataSourceName)
+        .mode(SaveMode.Overwrite)
+        .save(subdir)
+
+      // Inferring the schema should throw an error, as there is no file in dir to infer it from
+      val e = intercept[AnalysisException] {
+        sqlContext.read.format(dataSourceName).load(dir)
+      }
+      assert(e.getMessage.contains("infer"))
+
+      /** Test whether the data read with the given path matches the expected answer */
+      def testWithPath(path: String, expectedAnswer: Seq[Row]): Unit = {
+        val df = sqlContext.read
+          .format(dataSourceName)
+          .schema(dataInDir.schema) // avoid schema inference for any format
+          .load(path)
+        checkAnswer(df, expectedAnswer)
+      }
+
+      // Reading by the path 'dir/' (not by 'dir/subdir') should give empty results,
+      // as there are no files directly in 'dir' and it should not pick up files in 'dir/subdir'
+      testWithPath(dir, Seq.empty)
+
+      dataInDir.write
+        .format(dataSourceName)
+        .mode(SaveMode.Overwrite)
+        .save(dir)
+
+      // Should give only the rows from dataInDir, not those from dataInSubdir
+      testWithPath(dir, dataInDir.collect())
+    }
+  }
```
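For readers unfamiliar with the `withTempPath` helper used throughout these tests: roughly, it hands the test body a temporary path that does not yet exist and cleans it up afterwards. Below is a minimal sketch of that pattern under those assumed semantics; it is not Spark's actual `SQLTestUtils` implementation.

```scala
import java.io.File
import java.nio.file.Files

object WithTempPathSketch {
  // Recursively delete a file or directory tree.
  def deleteRecursively(f: File): Unit = {
    Option(f.listFiles()).foreach(_.foreach(deleteRecursively))
    f.delete()
  }

  // Hand the body a path that does not exist yet (the data source is
  // expected to create it), then clean up whatever was written.
  def withTempPath(body: File => Unit): Unit = {
    val path = Files.createTempDirectory("demo").toFile
    path.delete()
    try body(path) finally deleteRecursively(path)
  }

  def main(args: Array[String]): Unit =
    withTempPath(dir => println(s"writing under $dir"))
}
```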
| test("Hadoop style globbing - unpartitioned data") { | ||
| withTempPath { file => | ||
|
|
||
| val dir = file.getCanonicalPath | ||
| val subdir = new File(dir, "subdir").getCanonicalPath | ||
| val subsubdir = new File(subdir, "subsubdir").getCanonicalPath | ||
| val anotherSubsubdir = | ||
| new File(new File(dir, "another-subdir"), "another-subsubdir").getCanonicalPath | ||
|
|
||
| val dataInSubdir = Seq(1, 2, 3).toDF("value") | ||
| val dataInSubsubdir = Seq(4, 5, 6).toDF("value") | ||
| val dataInAnotherSubsubdir = Seq(7, 8, 9).toDF("value") | ||
|
|
||
| dataInSubdir.write | ||
| .format (dataSourceName) | ||
| .mode (SaveMode.Overwrite) | ||
| .save (subdir) | ||
|
|
||
| dataInSubsubdir.write | ||
| .format (dataSourceName) | ||
| .mode (SaveMode.Overwrite) | ||
| .save (subsubdir) | ||
|
|
||
| dataInAnotherSubsubdir.write | ||
| .format (dataSourceName) | ||
| .mode (SaveMode.Overwrite) | ||
| .save (anotherSubsubdir) | ||
|
||
|
|
||
**Author (self-review):** Remove empty lines.
```diff
+      /*
+        Directory structure generated
+
+        dir
+          |
+          |___ subdir
+          |      |
+          |      |___ [ files of dataInSubdir ]
+          |      |
+          |      |___ subsubdir
+          |             |
+          |             |___ [ files of dataInSubsubdir ]
+          |
+          |___ another-subdir
+                 |
+                 |___ another-subsubdir
+                        |
+                        |___ [ files of dataInAnotherSubsubdir ]
+       */
```
```diff
+
+      val schema = dataInSubdir.schema
+
+      /** Test whether the data read with the given path matches the expected answer */
+      def testWithPath(path: String, expectedDf: DataFrame): Unit = {
+        val df = sqlContext.read
+          .format(dataSourceName)
+          .schema(schema) // avoid schema inference for any format
+          .load(path)
+        checkAnswer(df, expectedDf)
+      }
+
+      testWithPath(s"$dir/*/", dataInSubdir)
+      testWithPath(s"$dir/sub*/*", dataInSubdir.union(dataInSubsubdir))
+      testWithPath(s"$dir/another*/*", dataInAnotherSubsubdir)
+      testWithPath(s"$dir/*/another*", dataInAnotherSubsubdir)
+      testWithPath(s"$dir/*/*", dataInSubdir.union(dataInSubsubdir).union(dataInAnotherSubsubdir))
+    }
+  }
```
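The glob patterns exercised above follow Hadoop's globbing rules, where each `*` matches within a single path component. Purely as an illustration (this snippet is not part of the PR, and the `/tmp/globdemo` path is invented), the same matching can be reproduced directly with `FileSystem.globStatus`:

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object GlobDemo {
  def main(args: Array[String]): Unit = {
    val fs = FileSystem.getLocal(new Configuration())
    // Each '*' matches one path component, so this lists the entries one
    // level below /tmp/globdemo (e.g. subdir and another-subdir), not
    // their children. globStatus can return null, hence the Option guard.
    val matches = Option(fs.globStatus(new Path("/tmp/globdemo/*"))).getOrElse(Array.empty)
    matches.foreach(status => println(status.getPath))
  }
}
```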
```diff
+
+  test("Hadoop style globbing - partitioned data") {
     withTempPath { file =>
       partitionedTestDF.write
         .format(dataSourceName)
```
**Contributor:** Can we call `allFiles` here?

**Author:** I don't know for sure. If there is a partitioning scheme, there is an additional filter on files that start with "_" in `listFiles`, which does not seem to be present in `allFiles`, so I am not sure where it is best to merge them. Also, I think this way is slightly cleaner than having `listFiles` conditionally depend on `allFiles`.