
Commit fe3242f

HyukjinKwon authored and uzadude committed
[SPARK-16975][SQL][FOLLOWUP] Do not duplicately check file paths in data sources implementing FileFormat
## What changes were proposed in this pull request?

This PR cleans up the duplicated file-path checks in data sources implementing `FileFormat`, and prevents the ORC data source from attempting to list files twice.

apache#14585 handles a problem with partition column names containing `_`, and the issue itself is resolved correctly. However, the data sources implementing `FileFormat` validate the paths redundantly. Judging from the comment in `CSVFileFormat` (`// TODO: Move filtering.`), we should not have to check this twice. Currently, these paths are already filtered in `PartitioningAwareFileIndex.shouldFilterOut` and `PartitioningAwareFileIndex.isDataPath`, so `FileFormat.inferSchema` will always receive leaf files. For example, running the code below:

``` scala
spark.range(10).withColumn("_locality_code", $"id").write.partitionBy("_locality_code").save("/tmp/parquet")
spark.read.parquet("/tmp/parquet")
```

gives `FileFormat.inferSchema` the paths below, with no directories, just valid data files:

``` bash
/tmp/parquet/_col=0/part-r-00000-094a8efa-bece-4b50-b54c-7918d1f7b3f8.snappy.parquet
/tmp/parquet/_col=1/part-r-00000-094a8efa-bece-4b50-b54c-7918d1f7b3f8.snappy.parquet
/tmp/parquet/_col=2/part-r-00000-25de2b50-225a-4bcf-a2bc-9eb9ed407ef6.snappy.parquet
...
```

## How was this patch tested?

Unit test added in `HadoopFsRelationTest` and related existing tests.

Author: hyukjinkwon <[email protected]>

Closes apache#14627 from HyukjinKwon/SPARK-16975.
1 parent 6e45753 commit fe3242f

4 files changed: 21 additions & 15 deletions
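Before the per-file diffs, a bit of context: the condition removed below from `JsonFileFormat` and `ParquetFileFormat` is the same check that `PartitioningAwareFileIndex` already applies while listing files. The following is a minimal Scala sketch paraphrased from that removed condition, not the verbatim body of `PartitioningAwareFileIndex.shouldFilterOut` or `isDataPath`:

``` scala
import org.apache.hadoop.fs.Path

// Paraphrase of the duplicated per-format check removed by this commit:
// a file is a data path unless its name starts with "_" (and is not a
// partition directory such as "_locality_code=0") or starts with ".".
def isDataPath(path: Path): Boolean = {
  val name = path.getName
  !((name.startsWith("_") && !name.contains("=")) || name.startsWith("."))
}

// Examples:
//   isDataPath(new Path("/tmp/parquet/_locality_code=0/part-r-00000.snappy.parquet"))  => true
//   isDataPath(new Path("/tmp/parquet/_SUCCESS"))                                      => false
//   isDataPath(new Path("/tmp/parquet/.hidden"))                                       => false
```

Partition directories like `_locality_code=0` survive the filter because their names contain `=`, which is why the fix in apache#14585 works at the listing layer alone.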

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala

Lines changed: 2 additions & 3 deletions

``` diff
@@ -55,10 +55,9 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister {
       options: Map[String, String],
       files: Seq[FileStatus]): Option[StructType] = {
     require(files.nonEmpty, "Cannot infer schema from an empty set of files")
-    val csvOptions = new CSVOptions(options)
 
-    // TODO: Move filtering.
-    val paths = files.filterNot(_.getPath.getName startsWith "_").map(_.getPath.toString)
+    val csvOptions = new CSVOptions(options)
+    val paths = files.map(_.getPath.toString)
     val lines: Dataset[String] = readText(sparkSession, csvOptions, paths)
     val firstLine: String = findFirstLine(csvOptions, lines)
     val firstRow = new CsvReader(csvOptions).parseLine(firstLine)
```

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala

Lines changed: 1 addition & 6 deletions

``` diff
@@ -51,13 +51,8 @@ class JsonFileFormat extends TextBasedFileFormat with DataSourceRegister {
     val columnNameOfCorruptRecord =
       parsedOptions.columnNameOfCorruptRecord
         .getOrElse(sparkSession.sessionState.conf.columnNameOfCorruptRecord)
-    val jsonFiles = files.filterNot { status =>
-      val name = status.getPath.getName
-      (name.startsWith("_") && !name.contains("=")) || name.startsWith(".")
-    }.toArray
-
     val jsonSchema = InferSchema.infer(
-      createBaseRdd(sparkSession, jsonFiles),
+      createBaseRdd(sparkSession, files),
       columnNameOfCorruptRecord,
       parsedOptions)
     checkConstraints(jsonSchema)
```
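As an illustrative check (not part of this patch), the round-trip from the commit message works for JSON as well once the listing layer does all the filtering; the `/tmp/json` path is just a placeholder:

``` scala
// Hypothetical spark-shell snippet; assumes an active `spark` session.
import spark.implicits._

spark.range(3).withColumn("_locality_code", $"id")
  .write.partitionBy("_locality_code").json("/tmp/json")

// inferSchema now receives only leaf data files, so "_locality_code"
// comes back as a partition column instead of being filtered out.
spark.read.json("/tmp/json").printSchema()
```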

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala

Lines changed: 1 addition & 6 deletions

``` diff
@@ -241,12 +241,7 @@ class ParquetFileFormat
       commonMetadata: Seq[FileStatus])
 
   private def splitFiles(allFiles: Seq[FileStatus]): FileTypes = {
-    // Lists `FileStatus`es of all leaf nodes (files) under all base directories.
-    val leaves = allFiles.filter { f =>
-      isSummaryFile(f.getPath) ||
-        !((f.getPath.getName.startsWith("_") && !f.getPath.getName.contains("=")) ||
-          f.getPath.getName.startsWith("."))
-    }.toArray.sortBy(_.getPath.toString)
+    val leaves = allFiles.toArray.sortBy(_.getPath.toString)
 
     FileTypes(
       data = leaves.filterNot(f => isSummaryFile(f.getPath)),
```
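Note that `splitFiles` still needs to separate Parquet summary files from data files, which is why `isSummaryFile` survives while the generic `_`/`.` filter goes away. A rough sketch of what such a name-based check amounts to (an assumption based on Parquet's naming convention, not the verbatim Spark source):

``` scala
import org.apache.hadoop.fs.Path

// Parquet writes two summary files next to the data files:
// "_metadata" (per-row-group footers) and "_common_metadata" (schema only).
// A name-based check is enough to tell them apart from data files.
def isSummaryFile(path: Path): Boolean =
  path.getName == "_metadata" || path.getName == "_common_metadata"
```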

sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala

Lines changed: 17 additions & 0 deletions

``` diff
@@ -877,6 +877,23 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes
       }
     }
   }
+
+  test("SPARK-16975: Partitioned table with the column having '_' should be read correctly") {
+    withTempDir { dir =>
+      val childDir = new File(dir, dataSourceName).getCanonicalPath
+      val dataDf = spark.range(10).toDF()
+      val df = dataDf.withColumn("_col", $"id")
+      df.write.format(dataSourceName).partitionBy("_col").save(childDir)
+      val reader = spark.read.format(dataSourceName)
+
+      // This is needed for SimpleTextHadoopFsRelationSuite as SimpleTextSource needs schema.
+      if (dataSourceName == classOf[SimpleTextSource].getCanonicalName) {
+        reader.option("dataSchema", dataDf.schema.json)
+      }
+      val readBack = reader.load(childDir)
+      checkAnswer(df, readBack)
+    }
+  }
 }
 
 // This class is used to test SPARK-8578. We should not use any custom output committer when
```
