13 changes: 13 additions & 0 deletions sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala
@@ -211,4 +211,17 @@ abstract class Catalog {
*/
def clearCache(): Unit

/**
* Invalidate and refresh all the cached metadata of the given table. For performance reasons,
* Spark SQL or the external data source library it uses might cache certain metadata about a
* table, such as the location of blocks. When those change outside of Spark SQL, users should
* call this function to invalidate the cache.
*
* If this table is cached as an InMemoryRelation, drop the original cached version and make the
* new version cached lazily.
*
* @since 2.0.0
*/
def refreshTable(tableName: String): Unit

}
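
For context, a minimal usage sketch of the new API, assuming a SparkSession named spark; the table name "my_table" is hypothetical:

// A table backed by files that were modified outside of Spark SQL.
spark.catalog.refreshTable("my_table")
// Subsequent queries see the refreshed metadata; if the table was cached,
// the cache is re-populated lazily on the next action.
spark.table("my_table").show()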
@@ -126,24 +126,9 @@ case class RefreshTable(tableIdent: TableIdentifier)
extends RunnableCommand {

override def run(sparkSession: SparkSession): Seq[Row] = {
// Refresh the given table's metadata first.
sparkSession.sessionState.catalog.refreshTable(tableIdent)

// If this table is cached as a InMemoryColumnarRelation, drop the original
// cached version and make the new version cached lazily.
val logicalPlan = sparkSession.sessionState.catalog.lookupRelation(tableIdent)
// Use lookupCachedData directly since RefreshTable also takes databaseName.
val isCached = sparkSession.cacheManager.lookupCachedData(logicalPlan).nonEmpty
if (isCached) {
// Create a data frame to represent the table.
// TODO: Use uncacheTable once it supports database name.
val df = Dataset.ofRows(sparkSession, logicalPlan)
// Uncache the logicalPlan.
sparkSession.cacheManager.tryUncacheQuery(df, blocking = true)
// Cache it again.
sparkSession.cacheManager.cacheQuery(df, Some(tableIdent.table))
}

// Refresh the given table's metadata. If this table is cached as an InMemoryRelation,
// drop the original cached version and make the new version cached lazily.
sparkSession.catalog.refreshTable(tableIdent.quotedString)
Seq.empty[Row]
}
}
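
A minimal sketch of the two entry points this change aligns, assuming a SparkSession named spark and an existing table "t" (the table name is hypothetical):

spark.sql("REFRESH TABLE t")       // the RefreshTable command shown above
spark.catalog.refreshTable("t")    // the Catalog API it now delegates to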
@@ -345,6 +345,32 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
sparkSession.cacheManager.lookupCachedData(qName).nonEmpty
}

/**
* Refresh the cache entry for a metastore table, if any.
*
* @group cachemgmt
* @since 2.0.0
*/
override def refreshTable(tableName: String): Unit = {
val tableIdent = sparkSession.sessionState.sqlParser.parseTableIdentifier(tableName)
sessionCatalog.refreshTable(tableIdent)

// If this table is cached as an InMemoryRelation, drop the original
// cached version and make the new version cached lazily.
val logicalPlan = sparkSession.sessionState.catalog.lookupRelation(tableIdent)
// Use lookupCachedData directly since RefreshTable also takes databaseName.
val isCached = sparkSession.cacheManager.lookupCachedData(logicalPlan).nonEmpty
if (isCached) {
// Create a data frame to represent the table.
// TODO: Use uncacheTable once it supports database name.
val df = Dataset.ofRows(sparkSession, logicalPlan)
// Uncache the logicalPlan.
sparkSession.cacheManager.tryUncacheQuery(df, blocking = true)
// Cache it again.
sparkSession.cacheManager.cacheQuery(df, Some(tableIdent.table))
}
}

}
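
A short sketch of the caching behavior this implements, assuming a SparkSession named spark; the table name "t" is hypothetical:

spark.catalog.cacheTable("t")
// ... the table's underlying files change outside of Spark SQL ...
spark.catalog.refreshTable("t")
spark.catalog.isCached("t")   // still true: the cache entry is kept
spark.table("t").count()      // the cache is only rebuilt lazily, on the next action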


@@ -163,6 +163,9 @@ private[sql] class SessionState(sparkSession: SparkSession) {
def executePlan(plan: LogicalPlan): QueryExecution = new QueryExecution(sparkSession, plan)

def refreshTable(tableName: String): Unit = {
// Different from SparkSession.catalog.refreshTable, this API only refreshes the metadata.
// It does not reload the cached data. That means, if this table is cached as
// an InMemoryRelation, we do not refresh the cached data.
Comment (Member Author): SharedState can refresh the cached table data. In SessionState, we can only refresh the metadata. Thus, this refreshTable API only refreshes the metadata.

Comment (Member Author): Let me know if we need to remove the refreshTable API in SessionState. So far, it is not used by any test case. Thanks!

Comment (Contributor, @andrewor14, May 19, 2016): @gatorsmile this is super confusing, the fact that spark.catalog.refreshTable and spark.sessionState.refreshTable do different things. Should we just rename this to invalidateTable, along with HiveMetastoreCatalog.refreshTable?

catalog.refreshTable(sqlParser.parseTableIdentifier(tableName))
}
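
To make the difference concrete, a sketch contrasting the two entry points; the table name "t" is hypothetical, and since SessionState is private[sql] the first call only works from code inside the org.apache.spark.sql package:

spark.sessionState.refreshTable("t")   // metadata only; any cached data is left untouched
spark.catalog.refreshTable("t")        // metadata, plus lazy re-caching of an InMemoryRelation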

@@ -622,7 +622,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
.mode(SaveMode.Append)
.saveAsTable("arrayInParquet")

sessionState.refreshTable("arrayInParquet")
sparkSession.catalog.refreshTable("arrayInParquet")
Comment (Member): As we don't call refreshTable through SessionState, do we still need to keep SessionState.refreshTable?

Comment (Member Author): Actually, I also want to remove invalidateTable, which duplicates refreshTable under a different name.

Comment (Contributor): Actually, invalidateTable and refreshTable do have different meanings. The current implementation of HiveMetastoreCatalog.refreshTable is HiveMetastoreCatalog.invalidateTable (and then we retrieve the new metadata lazily). But that does not mean refreshTable and invalidateTable have the same semantics. Whether we should remove invalidateTable or refreshTable should be discussed in a different thread.

Comment (Member Author): Got it. Thanks!


checkAnswer(
sql("SELECT a FROM arrayInParquet"),
@@ -681,7 +681,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
.mode(SaveMode.Append)
.saveAsTable("mapInParquet")

sessionState.refreshTable("mapInParquet")
sparkSession.catalog.refreshTable("mapInParquet")

checkAnswer(
sql("SELECT a FROM mapInParquet"),
@@ -217,7 +217,7 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils with TestHiveSingle

df.write.parquet(s"$path/p=2")
sql("ALTER TABLE t ADD PARTITION (p=2)")
hiveContext.sessionState.refreshTable("t")
spark.catalog.refreshTable("t")
checkAnswer(
spark.table("t"),
df.withColumn("p", lit(1)).union(df.withColumn("p", lit(2))))
@@ -249,7 +249,7 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils with TestHiveSingle

df.write.parquet(s"$path/p=2")
sql(s"ALTER TABLE $db.t ADD PARTITION (p=2)")
hiveContext.sessionState.refreshTable(s"$db.t")
spark.catalog.refreshTable(s"$db.t")
checkAnswer(
spark.table(s"$db.t"),
df.withColumn("p", lit(1)).union(df.withColumn("p", lit(2))))
7 changes: 7 additions & 0 deletions sql/hivecontext-compatibility/pom.xml
@@ -48,6 +48,13 @@
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<type>test-jar</type>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
</dependencies>

<build>
@@ -58,4 +58,16 @@ class HiveContext private[hive](
sparkSession.sharedState.asInstanceOf[HiveSharedState]
}

/**
* Invalidate and refresh all the cached metadata of the given table. For performance reasons,
* Spark SQL or the external data source library it uses might cache certain metadata about a
* table, such as the location of blocks. When those change outside of Spark SQL, users should
* call this function to invalidate the cache.
*
* @since 1.3.0
*/
def refreshTable(tableName: String): Unit = {
Comment (Contributor): If invalidateTable has a different meaning from refreshTable, should we also add it to HiveContext? cc @yhuai

Comment (Contributor): This class exists for compatibility purposes. Let's leave it as is.

Comment (Member): +1
sparkSession.catalog.refreshTable(tableName)
}

}
@@ -20,12 +20,17 @@ package org.apache.spark.sql.hive
import org.scalatest.BeforeAndAfterEach

import org.apache.spark.{SparkContext, SparkFunSuite}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.util.Utils


class HiveContextCompatibilitySuite extends SparkFunSuite with BeforeAndAfterEach {
class HiveContextCompatibilitySuite extends SparkFunSuite with BeforeAndAfterEach
with SQLTestUtils {

private var sc: SparkContext = null
private var hc: HiveContext = null
protected var spark: SparkSession = _

override def beforeAll(): Unit = {
super.beforeAll()
@@ -34,6 +39,7 @@ class HiveContextCompatibilitySuite extends SparkFunSuite with BeforeAndAfterEac
sc.hadoopConfiguration.set(k, v)
}
hc = new HiveContext(sc)
spark = hc.sparkSession
}

override def afterEach(): Unit = {
@@ -99,4 +105,41 @@ class HiveContextCompatibilitySuite extends SparkFunSuite with BeforeAndAfterEac
assert(databases3.toSeq == Seq("default"))
}

test("check change after refresh") {
Comment (Contributor, @cloud-fan, May 18, 2016): Do we already have a test for refreshTable / the RefreshTable command?

Comment (Member Author, @gatorsmile, May 18, 2016): The test cases modified by this PR are used to verify the refreshTable APIs (in MultiDatabaseSuite.scala and MetastoreDataSourcesSuite.scala). We also have test cases that verify the corresponding SQL interface, which calls the RefreshTable command. For example:

test("REFRESH TABLE also needs to recache the data (data source tables)") {
val tempPath: File = Utils.createTempDir()
tempPath.delete()
table("src").write.mode(SaveMode.Overwrite).parquet(tempPath.toString)
sql("DROP TABLE IF EXISTS refreshTable")
sparkSession.catalog.createExternalTable("refreshTable", tempPath.toString, "parquet")
checkAnswer(
table("refreshTable"),
table("src").collect())
// Cache the table.
sql("CACHE TABLE refreshTable")
assertCached(table("refreshTable"))
// Append new data.
table("src").write.mode(SaveMode.Append).parquet(tempPath.toString)
// We are still using the old data.
assertCached(table("refreshTable"))
checkAnswer(
table("refreshTable"),
table("src").collect())
// Refresh the table.
sql("REFRESH TABLE refreshTable")
// We are using the new data.
assertCached(table("refreshTable"))
checkAnswer(
table("refreshTable"),
table("src").union(table("src")).collect())
// Drop the table and create it again.
sql("DROP TABLE refreshTable")
sparkSession.catalog.createExternalTable("refreshTable", tempPath.toString, "parquet")
// It is not cached.
assert(!isCached("refreshTable"), "refreshTable should not be cached.")
// Refresh the table. REFRESH TABLE command should not make a uncached
// table cached.
sql("REFRESH TABLE refreshTable")
checkAnswer(
table("refreshTable"),
table("src").union(table("src")).collect())
// It is not cached.
assert(!isCached("refreshTable"), "refreshTable should not be cached.")
sql("DROP TABLE refreshTable")
Utils.deleteRecursively(tempPath)
}

Now, to test the pure HiveContext, the only way we can do it is to add a test case in sql/hivecontext-compatibility.

Not sure if this answers your question. Let me know if you have any concerns.

Comment (Contributor): They share the same implementation, and I think we don't need to test all of them. cc @yhuai

Comment (Member Author): I see. The refreshTable API is in HiveContext. I think we can just do a dummy call to verify that the API still exists, without checking the functionality. Does that sound good to you? @cloud-fan @yhuai

Comment (Contributor): Looks like the RefreshTable command is actually doing more work. I think we need to make RefreshTable and sparkSession.catalog.refreshTable have the same behavior. Can you make that change?

Comment (Member Author): Sure, will do it soon. Then the new behavior will be different from how Spark 1.6 behaves. However, I think we should keep the two interfaces (SQL and API) consistent.

Comment (Member Author): Done. Please review the latest code changes. Thanks!

val _hc = hc
import _hc.implicits._

withTempPath { tempDir =>
Comment (Contributor): Do we still need this test?

Comment (Member Author): Sure, let me remove it. : )

withTable("jsonTable") {
(("a", "b") :: Nil).toDF().toJSON.rdd.saveAsTextFile(tempDir.getCanonicalPath)

hc.sql(
s"""
|CREATE TABLE jsonTable
|USING org.apache.spark.sql.json
|OPTIONS (
| path '${tempDir.getCanonicalPath}'
|)
""".stripMargin)

assert(
hc.sql("SELECT * FROM jsonTable").collect().toSeq == Row("a", "b") :: Nil)

Utils.deleteRecursively(tempDir)
(("a1", "b1", "c1") :: Nil).toDF().toJSON.rdd.saveAsTextFile(tempDir.getCanonicalPath)

// Schema is cached so the new column does not show. The updated values in existing columns
// will show.
assert(
hc.sql("SELECT * FROM jsonTable").collect().toSeq == Row("a1", "b1") :: Nil)

hc.refreshTable("jsonTable")

// Check that the refresh worked
assert(
hc.sql("SELECT * FROM jsonTable").collect().toSeq == Row("a1", "b1", "c1") :: Nil)
}
}
}

}