@@ -330,7 +330,9 @@ case class DataSource(
    * is considered as a non-streaming file based data source. Since we know
    * that files already exist, we don't need to check them again.
    */
-  def resolveRelation(checkFilesExist: Boolean = true): BaseRelation = {
+  def resolveRelation(
+      checkFilesExist: Boolean = true,
+      needPartitionInferring: Boolean = true): BaseRelation = {
     val relation = (providingInstance(), userSpecifiedSchema) match {
       // TODO: Throw when too much is given.
       case (dataSource: SchemaRelationProvider, Some(schema)) =>
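For orientation, a hedged sketch of a call site using the new parameter (the dataSource value is assumed for illustration, not taken from this PR):

// Sketch: resolve a relation for files that are known to exist, for a table
// that is known to be non-partitioned, so both checks can be skipped.
val relation: BaseRelation = dataSource.resolveRelation(
  checkFilesExist = false,          // files are known to exist already
  needPartitionInferring = false)   // don't infer partitions from paths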
@@ -391,7 +393,7 @@ case class DataSource(
     } else {
       val globbedPaths = checkAndGlobPathIfNecessary(
         checkEmptyGlobPath = true, checkFilesExist = checkFilesExist)
-      val index = createInMemoryFileIndex(globbedPaths)
+      val index = createInMemoryFileIndex(globbedPaths, needPartitionInferring)
       val (resultDataSchema, resultPartitionSchema) =
         getOrInferFileFormatSchema(format, () => index)
       (index, resultDataSchema, resultPartitionSchema)
@@ -427,7 +429,6 @@ case class DataSource(
         "in the data schema",
         equality)
     }
-
     relation
   }
 
@@ -551,10 +552,12 @@ case class DataSource(
   }
 
   /** Returns an [[InMemoryFileIndex]] that can be used to get partition schema and file list. */
-  private def createInMemoryFileIndex(globbedPaths: Seq[Path]): InMemoryFileIndex = {
+  private def createInMemoryFileIndex(
+      globbedPaths: Seq[Path],
+      needPartitionInferring: Boolean = true): InMemoryFileIndex = {
     val fileStatusCache = FileStatusCache.getOrCreate(sparkSession)
-    new InMemoryFileIndex(
-      sparkSession, globbedPaths, options, userSpecifiedSchema, fileStatusCache)
+    new InMemoryFileIndex(sparkSession, globbedPaths, options, userSpecifiedSchema,
+      fileStatusCache, needPartitionInferring)
   }
 
   /**
@@ -50,7 +50,8 @@ class InMemoryFileIndex(
     rootPathsSpecified: Seq[Path],
     parameters: Map[String, String],
     userSpecifiedSchema: Option[StructType],
-    fileStatusCache: FileStatusCache = NoopCache)
+    fileStatusCache: FileStatusCache = NoopCache,
+    needPartitionInferring: Boolean = true)
   extends PartitioningAwareFileIndex(
     sparkSession, parameters, userSpecifiedSchema, fileStatusCache) {
 
@@ -69,7 +70,11 @@
 
   override def partitionSpec(): PartitionSpec = {
     if (cachedPartitionSpec == null) {
-      cachedPartitionSpec = inferPartitioning()
+      cachedPartitionSpec = if (needPartitionInferring) {
+        inferPartitioning()
+      } else {
+        PartitionSpec.emptySpec
+      }
     }
     logTrace(s"Partition spec: $cachedPartitionSpec")
     cachedPartitionSpec
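A hedged usage sketch of the new constructor flag (the path and tableSchema value are made up for illustration; the parameter names come straight from the diff):

// Hypothetical: index a non-partitioned table's files without interpreting
// partition-like directory names such as dt=20191113.
val index = new InMemoryFileIndex(
  sparkSession,
  rootPathsSpecified = Seq(new Path("/warehouse/some_table")),  // made-up path
  parameters = Map.empty,
  userSpecifiedSchema = Some(tableSchema),  // assumed user-provided schema
  fileStatusCache = NoopCache,
  needPartitionInferring = false)
// With the flag off, partitionSpec() short-circuits to PartitionSpec.emptySpec.
assert(index.partitionSpec() == PartitionSpec.emptySpec)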
@@ -245,7 +245,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
             userSpecifiedSchema = Option(updatedTable.dataSchema),
             bucketSpec = None,
             options = options,
-            className = fileType).resolveRelation(),
+            className = fileType).resolveRelation(needPartitionInferring = false),
           table = updatedTable)
 
         catalogProxy.cacheTable(tableIdentifier, created)
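Background on why inference is turned off at this call site: convertToLogicalRelation verifies that the converted relation's output matches the Hive relation's schema, with an assertion roughly of the following shape (paraphrased sketch, not quoted from this PR). A spuriously inferred dt partition column adds an extra output attribute, which is what made the assertion fail in SPARK-29869.

// Paraphrased sketch of the output-compatibility check; `result` is the
// converted LogicalRelation and `relation` the original Hive relation.
assert(result.output.length == relation.output.length &&
  result.output.zip(relation.output).forall { case (a1, a2) => a1.dataType == a2.dataType })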
@@ -362,29 +362,20 @@ class DataSourceWithHiveMetastoreCatalogSuite
     }
   }
 
-  test("SPARK-29869: HiveMetastoreCatalog#convertToLogicalRelation throws AssertionError") {
+  test("SPARK-29869: fix HiveMetastoreCatalog#convertToLogicalRelation throws AssertionError") {
     withTempPath(dir => {
       val baseDir = s"${dir.getCanonicalFile.toURI.toString}/test"
Review comment (Member): test -> non_partition_table ?

Reply (Author): The location can be set to anywhere; "test" here is fine. "non_partition_table" would push the line over 100 chars :)

-      val partitionDir = s"${dir.getCanonicalFile.toURI.toString}/test/dt=20191113"
-      val file = new Path(partitionDir, "file.parquet")
-      val fs = file.getFileSystem(new Configuration())
-      fs.createNewFile(file)
-      withTable("test") {
-        withSQLConf(HiveUtils.CONVERT_METASTORE_PARQUET.key -> "true",
-          SQLConf.PARALLEL_PARTITION_DISCOVERY_THRESHOLD.key -> "0") {
+      val partitionLikeDir = s"${dir.getCanonicalFile.toURI.toString}/test/dt=20191113"
Review comment (Member): val partitionLikeDir = s"${dir.getCanonicalFile.toURI.toString}/test/dt=20191113" -> val partitionLikeDir = s"$baseDir/dt=20191113"?

+      spark.range(3).selectExpr("id").write.parquet(partitionLikeDir)
+      withTable("non_partition_table") {
+        withSQLConf(HiveUtils.CONVERT_METASTORE_PARQUET.key -> "true") {
           spark.sql(
             s"""
-               |CREATE EXTERNAL TABLE `test`(key string)
-               |ROW FORMAT SERDE
-               |  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
-               |STORED AS INPUTFORMAT
-               |  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
-               |OUTPUTFORMAT
-               |  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
-               |LOCATION '$baseDir'
+               |CREATE TABLE non_partition_table (id bigint)
Review comment (Contributor): Isn't it a malformed table? Does Hive ignore the directories for non-partitioned tables?

Reply (Author): Hive seems to return nothing when querying this table (1.2.1):

hive> select * from xxxxx.xxxx;
OK
Time taken: 25.301 seconds

But there is no assertion error.

Reply (Author, LantaoJin, Nov 14, 2019): Maybe you are right. Actually, the table LOCATION is /path/tablename/dt=yyyymmdd, but its data file paths are /path/tablename/dt=yyyymmdd/dt=yyyymmdd/xxx.parquet. I guess Hive does not recursively look up the data, so it returns an empty result rather than an error. I also found that when recursive lookup is enabled via .option("recursiveFileLookup", true), inferPartitioning is disabled, so dt=yyyymmdd won't be treated as a partitionSpec.

So should I revert the code changes and only keep the detailed assertion message? Or throw an exception instead of the assertion, catch it, and fall back to not using the built-in Parquet reader?
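A hedged sketch of the recursiveFileLookup behavior the author describes (the path is the author's placeholder; assumes a Spark version that supports this read option):

// With recursiveFileLookup, nested files are read but partition inference is
// disabled, so dt=yyyymmdd stays a plain directory, not a `dt` column.
val df = spark.read
  .option("recursiveFileLookup", "true")
  .parquet("/path/tablename/dt=yyyymmdd")
assert(!df.columns.contains("dt"))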

+               |STORED AS PARQUET LOCATION '$baseDir'
             |""".stripMargin)
-        val e = intercept[AssertionError](spark.sql("select * from test")).getMessage
-        assert(e.contains("assertion failed"))
+          assert(spark.sql("select * from non_partition_table").collect() ===
+            Array(Row(0), Row(1), Row(2)))
Review comment (Member): assert(spark.sql("select * from non_partition_table").collect() === Array(Row(0), Row(1), Row(2))) -> checkAnswer(spark.table("non_partition_table"), Row(0, 1, 2))?

Reply (Author): OK. I'd like to wait for more comments and then update everything in one pass.
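For reference, the reviewer's suggested idiom, adjusted so the expectation is one Row per result row (Row(0, 1, 2) would denote a single three-column row); a sketch, not the committed code:

// QueryTest.checkAnswer compares the DataFrame against the expected rows
// regardless of ordering; spark.range(3) yields three single-column rows.
checkAnswer(spark.table("non_partition_table"), Seq(Row(0), Row(1), Row(2)))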

         }
       }
     })