Rework the patch.

Marcelo Vanzin · Marcelo Vanzin · commit cc7cd955e262 · 2017-08-03T11:50:31.000-07:00
- Use the same code to translate between Spark and Hive tables when
  creating or altering the table.
- Fix the test so that it doesn't try to create a new SparkSession,
  which conflicts with TestHiveSingleton.
- Use Hive 2.1's EnvironmentContext to disable auto updating of stats
  for data source tables.
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -238,9 +238,9 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
     }
 
     if (DDLUtils.isDatasourceTable(tableDefinition)) {
-      createDataSourceTable(
-        tableDefinition.withNewStorage(locationUri = tableLocation),
-        ignoreIfExists)
+      saveDataSourceTable(tableDefinition.withNewStorage(locationUri = tableLocation)) { table =>
+        saveTableIntoHive(table, ignoreIfExists)
+      }
     } else {
       val tableWithDataSourceProps = tableDefinition.copy(
         // We can't leave `locationUri` empty and count on Hive metastore to set a default table
@@ -257,7 +257,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
     }
   }
 
-  private def createDataSourceTable(table: CatalogTable, ignoreIfExists: Boolean): Unit = {
+  private def saveDataSourceTable(table: CatalogTable)(saveFn: CatalogTable => Unit): Unit = {
     // data source table always have a provider, it's guaranteed by `DDLUtils.isDatasourceTable`.
     val provider = table.provider.get
 
@@ -363,19 +363,19 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
         // specific way.
         try {
           logInfo(message)
-          saveTableIntoHive(table, ignoreIfExists)
+          saveFn(table)
         } catch {
           case NonFatal(e) =>
             val warningMessage =
               s"Could not persist ${table.identifier.quotedString} in a Hive " +
                 "compatible way. Persisting it into Hive metastore in Spark SQL specific format."
             logWarning(warningMessage, e)
-            saveTableIntoHive(newSparkSQLSpecificMetastoreTable(), ignoreIfExists)
+            saveFn(newSparkSQLSpecificMetastoreTable())
         }
 
       case (None, message) =>
         logWarning(message)
-        saveTableIntoHive(newSparkSQLSpecificMetastoreTable(), ignoreIfExists)
+        saveFn(newSparkSQLSpecificMetastoreTable())
     }
   }
 
@@ -610,30 +610,14 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
 
   override def alterTableSchema(db: String, table: String, schema: StructType): Unit = withClient {
     requireTableExists(db, table)
-    val rawTable = getRawTable(db, table)
-    val withNewSchema = rawTable.copy(schema = schema)
-    verifyColumnNames(withNewSchema)
-    // Add table metadata such as table schema, partition columns, etc. to table properties.
-    val updatedTable = withNewSchema.copy(
-      properties = withNewSchema.properties ++ tableMetaToTableProps(withNewSchema))
-
-    // If it's a data source table, make sure the original schema is left unchanged; the
-    // actual schema is recorded as a table property.
-    val tableToStore = if (DDLUtils.isDatasourceTable(updatedTable)) {
-      updatedTable.copy(schema = rawTable.schema)
+    val updatedTable = getTable(db, table).copy(schema = schema)
+    verifyColumnNames(updatedTable)
+    if (DDLUtils.isDatasourceTable(updatedTable)) {
+      saveDataSourceTable(updatedTable) { table =>
+        client.alterTable(table)
+      }
     } else {
-      updatedTable
-    }
-
-    try {
-      client.alterTable(tableToStore)
-    } catch {
-      case NonFatal(e) =>
-        val warningMessage =
-          s"Could not alter schema of table  ${rawTable.identifier.quotedString} in a Hive " +
-            "compatible way. Updating Hive metastore in Spark SQL specific format."
-        logWarning(warningMessage, e)
-        client.alterTable(updatedTable.copy(schema = tableToStore.partitionSchema))
+      client.alterTable(updatedTable)
     }
   }
 
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -416,9 +416,6 @@ private[hive] class HiveClientImpl(
 
       val properties = Option(h.getParameters).map(_.asScala.toMap).getOrElse(Map())
 
-      val provider = properties.get(HiveExternalCatalog.DATASOURCE_PROVIDER)
-        .orElse(Some(DDLUtils.HIVE_PROVIDER))
-
       // Hive-generated Statistics are also recorded in ignoredProperties
       val ignoredProperties = scala.collection.mutable.Map.empty[String, String]
       for (key <- HiveStatisticsProperties; value <- properties.get(key)) {
@@ -472,7 +469,6 @@ private[hive] class HiveClientImpl(
             throw new AnalysisException("Hive index table is not supported.")
         },
         schema = schema,
-        provider = provider,
         partitionColumnNames = partCols.map(_.name),
         // If the table is written by Spark, we will put bucketing information in table properties,
         // and will always overwrite the bucket spec in hive metastore by the bucketing information
@@ -533,7 +529,7 @@ private[hive] class HiveClientImpl(
       table.copy(properties = table.ignoredProperties ++ table.properties), Some(userName))
     // Do not use `table.qualifiedName` here because this may be a rename
     val qualifiedTableName = s"${table.database}.$tableName"
-    shim.alterTable(client, qualifiedTableName, hiveTable)
+    shim.alterTable(client, qualifiedTableName, hiveTable, table.storage.locationUri.isDefined)
   }
 
   override def createPartitions(
@@ -612,8 +608,10 @@ private[hive] class HiveClientImpl(
       db: String,
       table: String,
       newParts: Seq[CatalogTablePartition]): Unit = withHiveState {
-    val hiveTable = toHiveTable(getTable(db, table), Some(userName))
-    shim.alterPartitions(client, table, newParts.map { p => toHivePartition(p, hiveTable) }.asJava)
+    val sparkTable = getTable(db, table)
+    val hiveTable = toHiveTable(sparkTable, Some(userName))
+    shim.alterPartitions(client, table, newParts.map { p => toHivePartition(p, hiveTable) }.asJava,
+      sparkTable.storage.locationUri.isDefined)
   }
 
   /**
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.hive.client
 import java.lang.{Boolean => JBoolean, Integer => JInteger, Long => JLong}
 import java.lang.reflect.{InvocationTargetException, Method, Modifier}
 import java.net.URI
-import java.util.{ArrayList => JArrayList, List => JList, Locale, Map => JMap, Set => JSet}
+import java.util.{ArrayList => JArrayList, HashMap => JHashMap, List => JList, Locale, Map => JMap, Set => JSet}
 import java.util.concurrent.TimeUnit
 
 import scala.collection.JavaConverters._
@@ -86,9 +86,13 @@ private[client] sealed abstract class Shim {
 
   def getMetastoreClientConnectRetryDelayMillis(conf: HiveConf): Long
 
-  def alterTable(hive: Hive, tableName: String, table: Table): Unit
+  def alterTable(hive: Hive, tableName: String, table: Table, allowGatherStats: Boolean): Unit
 
-  def alterPartitions(hive: Hive, tableName: String, newParts: JList[Partition]): Unit
+  def alterPartitions(
+      hive: Hive,
+      tableName: String,
+      newParts: JList[Partition],
+      allowGatherStats: Boolean): Unit
 
   def createPartitions(
       hive: Hive,
@@ -397,11 +401,19 @@ private[client] class Shim_v0_12 extends Shim with Logging {
     hive.dropTable(dbName, tableName, deleteData, ignoreIfNotExists)
   }
 
-  override def alterTable(hive: Hive, tableName: String, table: Table): Unit = {
+  override def alterTable(
+      hive: Hive,
+      tableName: String,
+      table: Table,
+      allowGatherStats: Boolean): Unit = {
     alterTableMethod.invoke(hive, tableName, table)
   }
 
-  override def alterPartitions(hive: Hive, tableName: String, newParts: JList[Partition]): Unit = {
+  override def alterPartitions(
+      hive: Hive,
+      tableName: String,
+      newParts: JList[Partition],
+      allowGatherStats: Boolean): Unit = {
     alterPartitionsMethod.invoke(hive, tableName, newParts)
   }
 
@@ -1008,9 +1020,6 @@ private[client] class Shim_v2_1 extends Shim_v2_0 {
 
   // true if there is any following stats task
   protected lazy val hasFollowingStatsTask = JBoolean.FALSE
-  // TODO: Now, always set environmentContext to null. In the future, we should avoid setting
-  // hive-generated stats to -1 when altering tables by using environmentContext. See Hive-12730
-  protected lazy val environmentContextInAlterTable = null
 
   private lazy val loadPartitionMethod =
     findMethod(
@@ -1102,11 +1111,35 @@ private[client] class Shim_v2_1 extends Shim_v2_0 {
       hasFollowingStatsTask, AcidUtils.Operation.NOT_ACID)
   }
 
-  override def alterTable(hive: Hive, tableName: String, table: Table): Unit = {
-    alterTableMethod.invoke(hive, tableName, table, environmentContextInAlterTable)
+  override def alterTable(
+      hive: Hive,
+      tableName: String,
+      table: Table,
+      allowGatherStats: Boolean): Unit = {
+    alterTableMethod.invoke(hive, tableName, table, createEnvironmentContext(allowGatherStats))
+  }
+
+  override def alterPartitions(
+      hive: Hive,
+      tableName: String,
+      newParts: JList[Partition],
+      allowGatherStats: Boolean): Unit = {
+    alterPartitionsMethod.invoke(hive, tableName, newParts,
+      createEnvironmentContext(allowGatherStats))
   }
 
-  override def alterPartitions(hive: Hive, tableName: String, newParts: JList[Partition]): Unit = {
-    alterPartitionsMethod.invoke(hive, tableName, newParts, environmentContextInAlterTable)
+  // TODO: In the future, we should avoid setting hive-generated stats to -1 when altering tables.
+  // See HIVE-12730.
+  private def createEnvironmentContext(allowGatherStats: Boolean): EnvironmentContext = {
+    if (!allowGatherStats) {
+      val properties = new JHashMap[String, String]()
+      properties.put("DO_NOT_UPDATE_STATS", "true")
+
+      val ctx = new EnvironmentContext()
+      ctx.setProperties(properties)
+      ctx
+    } else {
+      null
+    }
   }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
@@ -22,10 +22,11 @@ import java.net.URI
 
 import scala.language.existentials
 
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach}
 
-import org.apache.spark.{SparkException, SparkFunSuite}
+import org.apache.spark.{SparkConf, SparkException, SparkFunSuite}
 import org.apache.spark.launcher.SparkLauncher
 import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SaveMode, SparkSession}
 import org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, TableAlreadyExistsException}
@@ -39,6 +40,7 @@ import org.apache.spark.sql.internal.{HiveSerDe, SQLConf}
 import org.apache.spark.sql.internal.StaticSQLConf._
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
+import org.apache.spark.tags.ExtendedHiveTest
 import org.apache.spark.util.Utils
 
 // TODO(gatorsmile): combine HiveCatalogedDDLSuite and HiveDDLSuite
@@ -2004,41 +2006,56 @@ class HiveDDLSuite
  * A separate set of DDL tests that uses Hive 2.1 libraries, which behave a little differently
  * from the built-in ones.
  */
-class HiveDDLSuite_2_1 extends SparkFunSuite with BeforeAndAfterEach with BeforeAndAfterAll {
+@ExtendedHiveTest
+class Hive_2_1_DDLSuite extends SparkFunSuite with TestHiveSingleton with BeforeAndAfterEach
+  with BeforeAndAfterAll {
 
-  private val spark = {
+  // Create a custom HiveExternalCatalog instance with the desired configuration. We cannot
+  // use SparkSession here since there's already an active one managed by the TestHive object.
+  private var catalog = {
     val warehouse = Utils.createTempDir()
     val metastore = Utils.createTempDir()
     metastore.delete()
-    SparkSession.builder()
-      .config(SparkLauncher.SPARK_MASTER, "local")
-      .config(WAREHOUSE_PATH.key, warehouse.toURI().toString())
-      .config(CATALOG_IMPLEMENTATION.key, "hive")
-      .config(HiveUtils.HIVE_METASTORE_VERSION.key, "2.1")
-      .config(HiveUtils.HIVE_METASTORE_JARS.key, "maven")
-      .config("spark.hadoop.javax.jdo.option.ConnectionURL",
-        s"jdbc:derby:;databaseName=${metastore.getAbsolutePath()};create=true")
-      // These options are needed since the defaults in Hive 2.1 cause exceptions with an
-      // empty metastore db.
-      .config("spark.hadoop.datanucleus.schema.autoCreateAll", "true")
-      .config("spark.hadoop.hive.metastore.schema.verification", "false")
-      .getOrCreate()
+    val sparkConf = new SparkConf()
+      .set(SparkLauncher.SPARK_MASTER, "local")
+      .set(WAREHOUSE_PATH.key, warehouse.toURI().toString())
+      .set(CATALOG_IMPLEMENTATION.key, "hive")
+      .set(HiveUtils.HIVE_METASTORE_VERSION.key, "2.1")
+      .set(HiveUtils.HIVE_METASTORE_JARS.key, "maven")
+
+    val hadoopConf = new Configuration()
+    hadoopConf.set("hive.metastore.warehouse.dir", warehouse.toURI().toString())
+    hadoopConf.set("javax.jdo.option.ConnectionURL",
+      s"jdbc:derby:;databaseName=${metastore.getAbsolutePath()};create=true")
+    // These options are needed since the defaults in Hive 2.1 cause exceptions with an
+    // empty metastore db.
+    hadoopConf.set("datanucleus.schema.autoCreateAll", "true")
+    hadoopConf.set("hive.metastore.schema.verification", "false")
+
+    new HiveExternalCatalog(sparkConf, hadoopConf)
   }
 
   override def afterEach: Unit = {
+    catalog.listTables("default").foreach { t =>
+      catalog.dropTable("default", t, true, false)
+    }
     spark.sessionState.catalog.reset()
   }
 
   override def afterAll(): Unit = {
-    spark.close()
+    catalog = null
   }
 
   test("SPARK-21617: ALTER TABLE..ADD COLUMNS for DataSource tables") {
     spark.sql("CREATE TABLE t1 (c1 int) USING json")
-    spark.sql("ALTER TABLE t1 ADD COLUMNS (c2 int)")
+    val oldTable = spark.sessionState.catalog.externalCatalog.getTable("default", "t1")
+    catalog.createTable(oldTable, true)
+
+    val newSchema = StructType(oldTable.schema.fields ++ Array(StructField("c2", IntegerType)))
+    catalog.alterTableSchema("default", "t1", newSchema)
 
-    val df = spark.table("t1")
-    assert(df.schema.fieldNames === Array("c1", "c2"))
+    val updatedTable = catalog.getTable("default", "t1")
+    assert(updatedTable.schema.fieldNames === Array("c1", "c2"))
   }
 
 }