24 changes: 5 additions & 19 deletions docs/sql-data-sources-binaryFile.md
@@ -28,50 +28,36 @@ It produces a DataFrame with the following columns and possibly partition column
* `length`: LongType
* `content`: BinaryType

It supports the following read option:
<table class="table">
<tr><th><b>Property Name</b></th><th><b>Default</b></th><th><b>Meaning</b></th></tr>
<tr>
<td><code>pathGlobFilter</code></td>
<td>none (accepts all)</td>
<td>
An optional glob pattern to only include files with paths matching the pattern.
The syntax follows <code>org.apache.hadoop.fs.GlobFilter</code>.
It does not change the behavior of partition discovery.
</td>
</tr>
</table>

To read whole binary files, you need to specify the data source `format` as `binaryFile`.
For example, the following code reads all PNG files from the input directory:
Contributor

Can we keep the pathGlobFilter option in the example? It is actually important for the use case. Just mention pathGlobFilter is a global option.

Member Author

Sure, I will revert this.
For example, the following code reads all the files from the input directory:

<div class="codetabs">
<div data-lang="scala" markdown="1">
{% highlight scala %}

spark.read.format("binaryFile").option("pathGlobFilter", "*.png").load("/path/to/data")
spark.read.format("binaryFile").load("/path/to/data")

{% endhighlight %}
</div>

<div data-lang="java" markdown="1">
{% highlight java %}

spark.read().format("binaryFile").option("pathGlobFilter", "*.png").load("/path/to/data");
spark.read().format("binaryFile").load("/path/to/data");

{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
{% highlight python %}

spark.read.format("binaryFile").option("pathGlobFilter", "*.png").load("/path/to/data")
spark.read.format("binaryFile").load("/path/to/data")

{% endhighlight %}
</div>
<div data-lang="r" markdown="1">
{% highlight r %}

read.df("/path/to/data", source = "binaryFile", pathGlobFilter = "*.png")
read.df("/path/to/data", source = "binaryFile")

{% endhighlight %}
</div>
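
As the review comment above notes, `pathGlobFilter` remains available as a general option for file-based sources, so the variant removed in this diff still works. A minimal sketch, mirroring the deleted Scala line above:

    // Sketch: read only PNG files as whole binary files; pathGlobFilter is a
    // general option for file-based sources and composes with binaryFile.
    val pngFiles = spark.read
      .format("binaryFile")
      .option("pathGlobFilter", "*.png")
      .load("/path/to/data")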
22 changes: 22 additions & 0 deletions docs/sql-data-sources-load-save-functions.md
@@ -102,6 +102,28 @@ To load a CSV file you can use:
</div>
</div>

To load files with paths matching a given glob pattern recursively while keeping the behavior of partition discovery,
you can use:

<div class="codetabs">
<div data-lang="scala" markdown="1">
{% include_example load_with_path_glob_filter scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
</div>

<div data-lang="java" markdown="1">
{% include_example load_with_path_glob_filter java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
</div>

<div data-lang="python" markdown="1">
{% include_example load_with_path_glob_filter python/sql/datasource.py %}
</div>

<div data-lang="r" markdown="1">
{% include_example load_with_path_glob_filter r/RSparkSQLExample.R %}

</div>
</div>
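
The `include_example` snippets referenced here are added later in this diff; the Scala variant, for instance, boils down to:

    val partitionedUsersDF = spark.read.format("orc")
      .option("pathGlobFilter", "*.orc")
      .load("examples/src/main/resources/partitioned_users.orc")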

The extra options are also used during the write operation.
For example, you can control bloom filters and dictionary encodings for ORC data sources.
The following ORC example will create a bloom filter and use dictionary encoding only for `favorite_color`.
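
The referenced ORC example is collapsed in this diff view; based on the example sources shown further down (Scala, Java, and R variants), it is roughly:

    usersDF.write.format("orc")
      .option("orc.bloom.filter.columns", "favorite_color")
      .option("orc.dictionary.key.threshold", "1.0")
      .option("orc.column.encoding.direct", "name")
      .save("users_with_options.orc")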
@@ -123,6 +123,11 @@ private static void runBasicDataSourceExample(SparkSession spark) {
.option("header", "true")
.load("examples/src/main/resources/people.csv");
// $example off:manual_load_options_csv$
// $example on:load_with_path_glob_filter$
Dataset<Row> partitionedUsersDF = spark.read().format("orc")
.option("pathGlobFilter", "*.orc")
.load("examples/src/main/resources/partitioned_users.orc");
// $example off:load_with_path_glob_filter$
// $example on:manual_save_options_orc$
usersDF.write().format("orc")
.option("orc.bloom.filter.columns", "favorite_color")
5 changes: 5 additions & 0 deletions examples/src/main/python/sql/datasource.py
@@ -57,6 +57,11 @@ def basic_datasource_example(spark):
format="csv", sep=":", inferSchema="true", header="true")
# $example off:manual_load_options_csv$

# $example on:load_with_path_glob_filter$
df = spark.read.load("examples/src/main/resources/partitioned_users.orc",
format="orc", pathGlobFilter="*.orc")
# $example off:load_with_path_glob_filter$

# $example on:manual_save_options_orc$
df = spark.read.orc("examples/src/main/resources/users.orc")
(df.write.format("orc")
4 changes: 4 additions & 0 deletions examples/src/main/r/RSparkSQLExample.R
@@ -118,6 +118,10 @@ df <- read.df("examples/src/main/resources/people.csv", "csv", sep = ";", inferS
namesAndAges <- select(df, "name", "age")
# $example off:manual_load_options_csv$

# $example on:load_with_path_glob_filter$
df <- read.df("examples/src/main/resources/partitioned_users.orc", "orc", pathGlobFilter = "*.orc")
# $example off:load_with_path_glob_filter$

# $example on:manual_save_options_orc$
df <- read.df("examples/src/main/resources/users.orc", "orc")
write.orc(df, "users_with_options.orc", orc.bloom.filter.columns = "favorite_color", orc.dictionary.key.threshold = 1.0, orc.column.encoding.direct = "name")
@@ -0,0 +1 @@
do not read this
Binary file not shown.
Binary file not shown.
@@ -56,6 +56,11 @@ object SQLDataSourceExample {
.option("header", "true")
.load("examples/src/main/resources/people.csv")
// $example off:manual_load_options_csv$
// $example on:load_with_path_glob_filter$
val partitionedUsersDF = spark.read.format("orc")
.option("pathGlobFilter", "*.orc")
.load("examples/src/main/resources/partitioned_users.orc")
// $example off:load_with_path_glob_filter$
// $example on:manual_save_options_orc$
usersDF.write.format("orc")
.option("orc.bloom.filter.columns", "favorite_color")
@@ -57,6 +57,10 @@ private[avro] class AvroFileFormat extends FileFormat
options: Map[String, String],
files: Seq[FileStatus]): Option[StructType] = {
val conf = spark.sessionState.newHadoopConf()
if (options.contains("ignoreExtension")) {
Member

"ignoreExtension" -> AvroOptions.ignoreExtensionKey

logWarning(s"Option ${AvroOptions.ignoreExtensionKey} is deprecated. Please use the " +
"general data source option pathGlobFilter for filtering file names.")
}
val parsedOptions = new AvroOptions(options, conf)

// User can specify an optional avro json schema.
@@ -59,14 +59,15 @@ class AvroOptions(
* If the option is not set, the Hadoop's config `avro.mapred.ignore.inputs.without.extension`
* is taken into account. If the former one is not set too, file extensions are ignored.
*/
@deprecated("Use the general data source option pathGlobFilter for filtering file names", "3.0")
Member

I am wondering who this deprecation warning is for. Spark users don't use ignoreExtension directly. I do think we should print a warning when we read and detect that AvroFileFormat.IgnoreFilesWithoutExtensionProperty and/or AvroOptions.ignoreExtensionKey are set; otherwise users will never see the deprecation.

Member

It is used only in one case, at schema inference. I would remove this annotation and print the warning in the initialization of AvroOptions. The deprecation warning is printed only during Spark compilation, which is useless for users.

Member

I think we should remove @deprecated.

It would be great if we could put that logic into AvroOptions, e.g.:

    parameters
      .get(AvroOptions.ignoreExtensionKey)
      .map { v =>
        logWarning(...)
        v.toBoolean
      }.getOrElse(!ignoreFilesWithoutExtension)

However, can you make it not show the logs too many times? If we put it there, it seems like it will show the same log multiple times.

Member

If you can find a better way, please go ahead and open a PR (there are also some nits I picked below).

val ignoreExtension: Boolean = {
val ignoreFilesWithoutExtensionByDefault = false
val ignoreFilesWithoutExtension = conf.getBoolean(
AvroFileFormat.IgnoreFilesWithoutExtensionProperty,
ignoreFilesWithoutExtensionByDefault)

parameters
.get("ignoreExtension")
.get(AvroOptions.ignoreExtensionKey)
Member

ignoreExtensionKey -> IGNORE_EXTENSION_KEY to be consistent with other XXXOptions

.map(_.toBoolean)
.getOrElse(!ignoreFilesWithoutExtension)
}
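
A minimal sketch of what the reviewers suggest above — parsing and warning inside AvroOptions so the deprecation is visible at runtime — might look like the following. This assumes AvroOptions mixes in Spark's Logging trait and leaves aside the concern about emitting the same warning repeatedly:

    val ignoreExtension: Boolean = {
      // Fall back to the legacy Hadoop property when the option is absent.
      val ignoreFilesWithoutExtension = conf.getBoolean(
        AvroFileFormat.IgnoreFilesWithoutExtensionProperty, false)
      parameters
        .get(AvroOptions.ignoreExtensionKey)
        .map { v =>
          // Warn at runtime, where users can actually see the deprecation.
          logWarning(s"Option ${AvroOptions.ignoreExtensionKey} is deprecated. Please use the " +
            "general data source option pathGlobFilter for filtering file names.")
          v.toBoolean
        }
        .getOrElse(!ignoreFilesWithoutExtension)
    }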
@@ -93,4 +94,6 @@ object AvroOptions {
.getOrElse(new Configuration())
new AvroOptions(CaseInsensitiveMap(parameters), hadoopConf)
}

val ignoreExtensionKey = "ignoreExtension"
}
12 changes: 12 additions & 0 deletions python/pyspark/sql/readwriter.py
@@ -120,6 +120,9 @@ def option(self, key, value):
* ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps
in the JSON/CSV datasources or partition values.
If it isn't set, it uses the default value, session local timezone.
* ``pathGlobFilter``: an optional glob pattern to only include files with paths matching
Member

Sorry, actually can we move this documentation to each implementation of CSV, Parquet, ORC, text? It will only work with such internal file-based sources.

the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter.
It does not change the behavior of partition discovery.
"""
self._jreader = self._jreader.option(key, to_str(value))
return self
@@ -132,6 +135,9 @@ def options(self, **options):
* ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps
in the JSON/CSV datasources or partition values.
If it isn't set, it uses the default value, session local timezone.
* ``pathGlobFilter``: an optional glob pattern to only include files with paths matching
the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter.
It does not change the behavior of partition discovery.
"""
for k in options:
self._jreader = self._jreader.option(k, to_str(options[k]))
@@ -624,6 +630,9 @@ def option(self, key, value):
* ``timeZone``: sets the string that indicates a timezone to be used to format
timestamps in the JSON/CSV datasources or partition values.
If it isn't set, it uses the default value, session local timezone.
* ``pathGlobFilter``: an optional glob pattern to only include files with paths matching
the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter.
It does not change the behavior of partition discovery.
"""
self._jwrite = self._jwrite.option(key, to_str(value))
return self
@@ -636,6 +645,9 @@ def options(self, **options):
* ``timeZone``: sets the string that indicates a timezone to be used to format
timestamps in the JSON/CSV datasources or partition values.
If it isn't set, it uses the default value, session local timezone.
* ``pathGlobFilter``: an optional glob pattern to only include files with paths matching
the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter.
It does not change the behavior of partition discovery.
"""
for k in options:
self._jwrite = self._jwrite.option(k, to_str(options[k]))
12 changes: 12 additions & 0 deletions python/pyspark/sql/streaming.py
@@ -341,6 +341,9 @@ def option(self, key, value):
* ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps
in the JSON/CSV datasources or partition values.
If it isn't set, it uses the default value, session local timezone.
* ``pathGlobFilter``: an optional glob pattern to only include files with paths matching
the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter.
It does not change the behavior of partition discovery.

.. note:: Evolving.

@@ -357,6 +360,9 @@ def options(self, **options):
* ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps
in the JSON/CSV datasources or partition values.
If it isn't set, it uses the default value, session local timezone.
* ``pathGlobFilter``: an optional glob pattern to only include files with paths matching
the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter.
It does not change the behavior of partition discovery.

.. note:: Evolving.

@@ -769,6 +775,9 @@ def option(self, key, value):
* ``timeZone``: sets the string that indicates a timezone to be used to format
timestamps in the JSON/CSV datasources or partition values.
If it isn't set, it uses the default value, session local timezone.
* ``pathGlobFilter``: an optional glob pattern to only include files with paths matching
the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter.
It does not change the behavior of partition discovery.

.. note:: Evolving.
"""
@@ -783,6 +792,9 @@ def options(self, **options):
* ``timeZone``: sets the string that indicates a timezone to be used to format
timestamps in the JSON/CSV datasources or partition values.
If it isn't set, it uses the default value, session local timezone.
* ``pathGlobFilter``: an optional glob pattern to only include files with paths matching
the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter.
It does not change the behavior of partition discovery.

.. note:: Evolving.
"""
@@ -98,6 +98,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
* <ul>
* <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
* to be used to parse timestamps in the JSON/CSV datasources or partition values.</li>
* <li>`pathGlobFilter`: an optional glob pattern to only include files with paths matching
* the pattern. The syntax follows <code>org.apache.hadoop.fs.GlobFilter</code>.
* It does not change the behavior of partition discovery.</li>
* </ul>
*
* @since 1.4.0
@@ -135,6 +138,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
* <ul>
* <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
* to be used to parse timestamps in the JSON/CSV datasources or partition values.</li>
* <li>`pathGlobFilter`: an optional glob pattern to only include files with paths matching
* the pattern. The syntax follows <code>org.apache.hadoop.fs.GlobFilter</code>.
* It does not change the behavior of partition discovery.</li>
* </ul>
*
* @since 1.4.0
@@ -151,6 +157,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
* <ul>
* <li>`timeZone` (default session local timezone): sets the string that indicates a timezone
* to be used to parse timestamps in the JSON/CSV datasources or partition values.</li>
* <li>`pathGlobFilter`: an optional glob pattern to only include files with paths matching
* the pattern. The syntax follows <code>org.apache.hadoop.fs.GlobFilter</code>.
* It does not change the behavior of partition discovery.</li>
* </ul>
*
* @since 1.4.0
@@ -348,7 +348,8 @@ case class DataSource(
sparkSession.sessionState.newHadoopConf(),
sparkSession.sessionState.conf) =>
val basePath = new Path((caseInsensitiveOptions.get("path").toSeq ++ paths).head)
val fileCatalog = new MetadataLogFileIndex(sparkSession, basePath, userSpecifiedSchema)
val fileCatalog = new MetadataLogFileIndex(sparkSession, basePath,
caseInsensitiveOptions, userSpecifiedSchema)
val dataSchema = userSpecifiedSchema.orElse {
format.inferSchema(
sparkSession,
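
Passing `caseInsensitiveOptions` through to `MetadataLogFileIndex` means the new option also applies when reading back the output directory of a file-sink streaming query. A hedged usage sketch (the path is hypothetical):

    // Sketch: the glob filter now also applies to directories tracked by a
    // streaming metadata log (_spark_metadata), not just plain directories.
    val sinkOutput = spark.read
      .option("pathGlobFilter", "*.parquet")
      .parquet("/path/to/streaming/sink/output")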
@@ -56,6 +56,12 @@ abstract class PartitioningAwareFileIndex(

protected def leafDirToChildrenFiles: Map[Path, Array[FileStatus]]

protected lazy val pathGlobFilter = parameters.get("pathGlobFilter").map(new GlobFilter(_))

protected def matchGlobPattern(file: FileStatus): Boolean = {
pathGlobFilter.forall(_.accept(file.getPath))
}

override def listFiles(
partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionDirectory] = {
def isNonEmptyFile(f: FileStatus): Boolean = {
@@ -69,7 +75,7 @@ abstract class PartitioningAwareFileIndex(
val files: Seq[FileStatus] = leafDirToChildrenFiles.get(path) match {
case Some(existingDir) =>
// Directory has children files in it, return them
existingDir.filter(isNonEmptyFile)
existingDir.filter(f => matchGlobPattern(f) && isNonEmptyFile(f))

case None =>
// Directory does not exist, or has no children files
@@ -89,7 +95,7 @@ abstract class PartitioningAwareFileIndex(
override def sizeInBytes: Long = allFiles().map(_.getLen).sum

def allFiles(): Seq[FileStatus] = {
if (partitionSpec().partitionColumns.isEmpty) {
val files = if (partitionSpec().partitionColumns.isEmpty) {
// For each of the root input paths, get the list of files inside them
rootPaths.flatMap { path =>
// Make the path qualified (consistent with listLeafFiles and bulkListLeafFiles).
@@ -118,6 +124,7 @@ abstract class PartitioningAwareFileIndex(
} else {
leafFiles.values.toSeq
}
files.filter(matchGlobPattern)
}

protected def inferPartitioning(): PartitionSpec = {
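
For context, the `pathGlobFilter` handling added above relies on Hadoop's `org.apache.hadoop.fs.GlobFilter`, which (in the usual Hadoop implementation) matches the glob against the final name component of the path. An illustrative sketch, not part of this diff:

    import org.apache.hadoop.fs.{GlobFilter, Path}

    val filter = new GlobFilter("*.orc")
    filter.accept(new Path("/data/part-00000.orc"))  // true: name matches *.orc
    filter.accept(new Path("/data/_SUCCESS"))        // false: filtered out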