[SPARK-16552] [SQL] Store the Inferred Schemas into External Catalog Tables when Creating Tables #14207
Changes from 3 commits
@@ -52,7 +52,7 @@ case class CreateDataSourceTableCommand(
     userSpecifiedSchema: Option[StructType],
     provider: String,
     options: Map[String, String],
-    partitionColumns: Array[String],
+    userSpecifiedPartitionColumns: Array[String],
     bucketSpec: Option[BucketSpec],
     ignoreIfExists: Boolean,
     managedIfNoPath: Boolean)

@@ -95,17 +95,37 @@ case class CreateDataSourceTableCommand(
     }

     // Create the relation to validate the arguments before writing the metadata to the metastore.
-    DataSource(
-      sparkSession = sparkSession,
-      userSpecifiedSchema = userSpecifiedSchema,
-      className = provider,
-      bucketSpec = None,
-      options = optionsWithPath).resolveRelation(checkPathExist = false)
+    val dataSource: HadoopFsRelation =
+      DataSource(
+        sparkSession = sparkSession,
+        userSpecifiedSchema = userSpecifiedSchema,
+        className = provider,
+        bucketSpec = None,
+        options = optionsWithPath)
+        .resolveRelation(checkPathExist = false).asInstanceOf[HadoopFsRelation]
+
+    val partitionColumns =
+      if (userSpecifiedSchema.isEmpty && userSpecifiedPartitionColumns.length > 0) {
+        // The table does not have a specified schema, which means that the schema will be inferred
+        // when we load the table. So, we are not expecting partition columns and we will discover
+        // partitions when we load the table. However, if there are specified partition columns,
+        // we simply ignore them and provide a warning message.
+        logWarning(
+          s"The schema and partitions of table $tableIdent will be inferred when it is loaded. " +
+            s"Specified partition columns (${userSpecifiedPartitionColumns.mkString(",")}) will " +
+            "be ignored.")
+        dataSource.partitionSchema.fieldNames
+      } else {
+        userSpecifiedPartitionColumns
+      }
+
+    val schemaType = if (userSpecifiedSchema.isEmpty) SchemaType.INFERRED else SchemaType.USER
+
     CreateDataSourceTableUtils.createDataSourceTable(
       sparkSession = sparkSession,
       tableIdent = tableIdent,
-      userSpecifiedSchema = userSpecifiedSchema,
+      schema = dataSource.schema,

Contributor: Seems we should still use the user-specified schema, right?

Contributor: I think from the code, it is not very clear that

Member (Author): Here, actually, after re-checking the code, I found the schema might be adjusted a little even if users specify the schema. For example, the nullability could be changed: spark/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala, line 407 in 64529b1. I think we should make such a change, but maybe we should test and log it?

+      schemaType = schemaType,
       partitionColumns = partitionColumns,
       bucketSpec = bucketSpec,
       provider = provider,

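Aside (not part of the diff): a minimal, hypothetical example of the case handled above, assuming a SparkSession named spark. The table is created without a user-specified schema, so the schema and any partition columns are inferred from the files under the path and, with this patch, persisted into the external catalog. The table name and path are made up.

    // Hypothetical table name and path; no schema is given, so it is inferred
    // from the Parquet files and, with this change, stored as table properties.
    spark.sql(
      """
        |CREATE TABLE events
        |USING parquet
        |OPTIONS (path '/data/events')
      """.stripMargin)
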
@@ -256,7 +276,8 @@ case class CreateDataSourceTableAsSelectCommand(
         CreateDataSourceTableUtils.createDataSourceTable(
           sparkSession = sparkSession,
           tableIdent = tableIdent,
-          userSpecifiedSchema = Some(result.schema),
+          schema = result.schema,
+          schemaType = SchemaType.USER,
           partitionColumns = partitionColumns,
           bucketSpec = bucketSpec,
           provider = provider,

@@ -270,6 +291,11 @@
   }
 }

+case class SchemaType private(name: String)
+
+object SchemaType {
+  val USER = new SchemaType("USER")
+  val INFERRED = new SchemaType("INFERRED")
+}

 object CreateDataSourceTableUtils extends Logging {

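Aside (not part of the diff): the value written under DATASOURCE_SCHEMA_TYPE is just the name of one of these two singletons, so reading it back amounts to matching on the stored string. A minimal sketch with a hypothetical schemaTypeFromString helper that the PR itself does not define:

    // Hypothetical helper: map the stored property value back to a SchemaType.
    // Only USER and INFERRED exist, matching the singletons defined above.
    def schemaTypeFromString(name: String): SchemaType = name match {
      case "USER" => SchemaType.USER
      case "INFERRED" => SchemaType.INFERRED
      case other => throw new IllegalArgumentException(s"Unknown schema type: $other")
    }
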
@@ -279,6 +305,7 @@
   val DATASOURCE_OUTPUTPATH = DATASOURCE_PREFIX + "output.path"
   val DATASOURCE_SCHEMA = DATASOURCE_PREFIX + "schema"
   val DATASOURCE_SCHEMA_PREFIX = DATASOURCE_SCHEMA + "."
+  val DATASOURCE_SCHEMA_TYPE = DATASOURCE_SCHEMA_PREFIX + "type"
   val DATASOURCE_SCHEMA_NUMPARTS = DATASOURCE_SCHEMA_PREFIX + "numParts"
   val DATASOURCE_SCHEMA_NUMPARTCOLS = DATASOURCE_SCHEMA_PREFIX + "numPartCols"
   val DATASOURCE_SCHEMA_NUMSORTCOLS = DATASOURCE_SCHEMA_PREFIX + "numSortCols"

@@ -303,10 +330,40 @@
     matcher.matches()
   }

+  /**
+   * Saves the schema (including partition info) into the table properties.
+   * Overwrites the schema, if already existed.
+   */
+  def saveSchema(
+      sparkSession: SparkSession,
+      schema: StructType,
+      partitionColumns: Array[String],
+      tableProperties: mutable.HashMap[String, String]): Unit = {
+    // Serialized JSON schema string may be too long to be stored into a single
+    // metastore SerDe property. In this case, we split the JSON string and store each part as
+    // a separate table property.
+    val threshold = sparkSession.sessionState.conf.schemaStringLengthThreshold
+    val schemaJsonString = schema.json
+    // Split the JSON string.
+    val parts = schemaJsonString.grouped(threshold).toSeq
+    tableProperties.put(DATASOURCE_SCHEMA_NUMPARTS, parts.size.toString)
+    parts.zipWithIndex.foreach { case (part, index) =>
+      tableProperties.put(s"$DATASOURCE_SCHEMA_PART_PREFIX$index", part)
+    }
+
+    if (partitionColumns.length > 0) {
+      tableProperties.put(DATASOURCE_SCHEMA_NUMPARTCOLS, partitionColumns.length.toString)
+      partitionColumns.zipWithIndex.foreach { case (partCol, index) =>
+        tableProperties.put(s"$DATASOURCE_SCHEMA_PARTCOL_PREFIX$index", partCol)
+      }
+    }
+  }
+
   def createDataSourceTable(
       sparkSession: SparkSession,
       tableIdent: TableIdentifier,
-      userSpecifiedSchema: Option[StructType],
+      schema: StructType,
+      schemaType: SchemaType,
       partitionColumns: Array[String],
       bucketSpec: Option[BucketSpec],
       provider: String,

@@ -315,28 +372,10 @@
     val tableProperties = new mutable.HashMap[String, String]
     tableProperties.put(DATASOURCE_PROVIDER, provider)

-    // Saves optional user specified schema. Serialized JSON schema string may be too long to be
-    // stored into a single metastore SerDe property. In this case, we split the JSON string and
-    // store each part as a separate SerDe property.
-    userSpecifiedSchema.foreach { schema =>
-      val threshold = sparkSession.sessionState.conf.schemaStringLengthThreshold
-      val schemaJsonString = schema.json
-      // Split the JSON string.
-      val parts = schemaJsonString.grouped(threshold).toSeq
-      tableProperties.put(DATASOURCE_SCHEMA_NUMPARTS, parts.size.toString)
-      parts.zipWithIndex.foreach { case (part, index) =>
-        tableProperties.put(s"$DATASOURCE_SCHEMA_PART_PREFIX$index", part)
-      }
-    }
+    tableProperties.put(DATASOURCE_SCHEMA_TYPE, schemaType.name)
+    saveSchema(sparkSession, schema, partitionColumns, tableProperties)

-    if (userSpecifiedSchema.isDefined && partitionColumns.length > 0) {
-      tableProperties.put(DATASOURCE_SCHEMA_NUMPARTCOLS, partitionColumns.length.toString)
-      partitionColumns.zipWithIndex.foreach { case (partCol, index) =>
-        tableProperties.put(s"$DATASOURCE_SCHEMA_PARTCOL_PREFIX$index", partCol)
-      }
-    }
-
-    if (userSpecifiedSchema.isDefined && bucketSpec.isDefined) {
+    if (bucketSpec.isDefined) {
       val BucketSpec(numBuckets, bucketColumnNames, sortColumnNames) = bucketSpec.get

       tableProperties.put(DATASOURCE_SCHEMA_NUMBUCKETS, numBuckets.toString)

@@ -353,16 +392,6 @@
       }
     }

-    if (userSpecifiedSchema.isEmpty && partitionColumns.length > 0) {
-      // The table does not have a specified schema, which means that the schema will be inferred
-      // when we load the table. So, we are not expecting partition columns and we will discover
-      // partitions when we load the table. However, if there are specified partition columns,
-      // we simply ignore them and provide a warning message.
-      logWarning(
-        s"The schema and partitions of table $tableIdent will be inferred when it is loaded. " +
-          s"Specified partition columns (${partitionColumns.mkString(",")}) will be ignored.")
-    }
-
     val tableType = if (isExternal) {
       tableProperties.put("EXTERNAL", "TRUE")
       CatalogTableType.EXTERNAL

@@ -375,7 +404,7 @@
       val dataSource =
         DataSource(
           sparkSession,
-          userSpecifiedSchema = userSpecifiedSchema,
+          userSpecifiedSchema = Some(schema),
           partitionColumns = partitionColumns,
           bucketSpec = bucketSpec,
           className = provider,

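Aside (not part of the diff): saveSchema above splits the schema's JSON representation across numbered table properties because a single metastore property value may be too long. Below is a minimal sketch of the inverse operation, reading the schema back from a plain property map; the literal keys are an assumption that DATASOURCE_SCHEMA_NUMPARTS and DATASOURCE_SCHEMA_PART_PREFIX resolve to "spark.sql.sources.schema.numParts" and "spark.sql.sources.schema.part.".

    import org.apache.spark.sql.types.{DataType, StructType}

    // Reassemble a schema that saveSchema split into numbered parts.
    // The property keys are assumed values of the constants used in the diff above.
    def readSchemaFromProperties(props: Map[String, String]): Option[StructType] = {
      props.get("spark.sql.sources.schema.numParts").map { numParts =>
        val json = (0 until numParts.toInt)
          .map(i => props(s"spark.sql.sources.schema.part.$i"))
          .mkString
        DataType.fromJson(json).asInstanceOf[StructType]
      }
    }
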
@@ -487,6 +487,10 @@
     isDatasourceTable(table.properties)
   }

+  def isSchemaInferred(table: CatalogTable): Boolean = {
+    table.properties.get(DATASOURCE_SCHEMA_TYPE) == Option(SchemaType.INFERRED.name)
+  }
+
   /**
    * If the command ALTER VIEW is to alter a table or ALTER TABLE is to alter a view,
    * issue an exception [[AnalysisException]].

@@ -18,6 +18,7 @@
 package org.apache.spark.sql.internal

 import scala.collection.JavaConverters._
+import scala.collection.mutable
 import scala.reflect.runtime.universe.TypeTag

 import org.apache.spark.annotation.Experimental

@@ -27,7 +28,8 @@ import org.apache.spark.sql.catalyst.{DefinedByConstructorParams, TableIdentifie
 import org.apache.spark.sql.catalyst.catalog.SessionCatalog
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
-import org.apache.spark.sql.execution.datasources.CreateTableUsing
+import org.apache.spark.sql.execution.command.{CreateDataSourceTableUtils, DDLUtils}
+import org.apache.spark.sql.execution.datasources.{CreateTableUsing, DataSource, HadoopFsRelation}
 import org.apache.spark.sql.types.StructType

@@ -350,6 +352,44 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
     sparkSession.sharedState.cacheManager.lookupCachedData(qName).nonEmpty
   }

+  /**
+   * Refresh the inferred schema stored in the external catalog for data source tables.
+   */
+  private def refreshInferredSchema(tableIdent: TableIdentifier): Unit = {
+    val table = sessionCatalog.getTableMetadataOption(tableIdent)
+    table.foreach { tableDesc =>
+      if (DDLUtils.isDatasourceTable(tableDesc) && DDLUtils.isSchemaInferred(tableDesc)) {
+        val partitionColumns = DDLUtils.getPartitionColumnsFromTableProperties(tableDesc)
+        val bucketSpec = DDLUtils.getBucketSpecFromTableProperties(tableDesc)
+        val dataSource =
+          DataSource(
+            sparkSession,
+            userSpecifiedSchema = None,
+            partitionColumns = partitionColumns,
+            bucketSpec = bucketSpec,
+            className = tableDesc.properties(CreateDataSourceTableUtils.DATASOURCE_PROVIDER),
+            options = tableDesc.storage.serdeProperties)
+            .resolveRelation().asInstanceOf[HadoopFsRelation]
+
+        val schemaProperties = new mutable.HashMap[String, String]
+        CreateDataSourceTableUtils.saveSchema(
+          sparkSession, dataSource.schema, dataSource.partitionSchema.fieldNames, schemaProperties)
+
+        val tablePropertiesWithoutSchema = tableDesc.properties.filterKeys { k =>
+          // Keep the properties that are not for schema or partition columns
+          k != CreateDataSourceTableUtils.DATASOURCE_SCHEMA_NUMPARTS &&
+            !k.startsWith(CreateDataSourceTableUtils.DATASOURCE_SCHEMA_PART_PREFIX) &&
+            k != CreateDataSourceTableUtils.DATASOURCE_SCHEMA_NUMPARTCOLS &&
+            !k.startsWith(CreateDataSourceTableUtils.DATASOURCE_SCHEMA_PARTCOL_PREFIX) }
+
+        val newTable = tableDesc.copy(properties = tablePropertiesWithoutSchema ++ schemaProperties)
+
+        // Alter the schema-related table properties that are stored in external catalog.
+        sessionCatalog.alterTable(newTable)
+      }
+    }
+  }
+
   /**
    * Refresh the cache entry for a table, if any. For Hive metastore table, the metadata
    * is refreshed.

@@ -359,6 +399,13 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
    */
  override def refreshTable(tableName: String): Unit = {
    val tableIdent = sparkSession.sessionState.sqlParser.parseTableIdentifier(tableName)
+    // Refresh the schema in external catalog, if it is a data source table whose schema is inferred
+    // at runtime. For user-specified schema, we do not infer and update the schema.
+    // TODO: Support column-related ALTER TABLE DDL commands, and then users can update
+    // the user-specified schema.
+    refreshInferredSchema(tableIdent)
+    // Temp tables: refresh (or invalidate) any metadata/data cached in the plan recursively.
+    // Non-temp tables: refresh the metadata cache.
    sessionCatalog.refreshTable(tableIdent)

    // If this table is cached as an InMemoryRelation, drop the original

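A usage sketch of the behavior as implemented at this commit (the review comments below ask for this to change), assuming a SparkSession named spark: after new files land under an inferred-schema table's path, a refresh re-infers the schema and writes it back to the external catalog. The table name is hypothetical.

    // Re-infers and persists the schema of the data source table `events`
    // (only if its stored schema type is INFERRED, per DDLUtils.isSchemaInferred),
    // then refreshes the cached metadata as before.
    spark.catalog.refreshTable("events")
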
cc @rxin, I'm thinking of what's the main reason to allow inferring the table schema at run time. IIRC, it's mainly because we wanna save some typing when creating an external data source table by SQL string, which usually has a very long schema, e.g. json files.

If this is true, then the table schema is not supposed to change. If users do wanna change it, I'd argue that it's a different table; users should drop this table and create a new one. Then we don't need to make refresh table support schema changing, and thus don't need to store the DATASOURCE_SCHEMA_ISINFERRED flag.
refreshTable shouldn't run schema inference. Only run schema inference when creating the table.
And don't make this a config flag. Just run schema inference when creating the table. For managed tables, store the schema explicitly. Users must explicitly change it.
@rxin @cloud-fan I see. Will make a change
FYI, this will change the existing external behavior.
Yes, unfortunately I found out about this one too late. I will add it to the release notes for 2.0 that this change will come.
Thanks!