apache · jzhuge · May 16, 2019 · Jul 31, 2019 · Aug 2, 2019 · Aug 14, 2019
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala
@@ -79,12 +79,13 @@ class CacheManager extends Logging {
       logWarning("Asked to cache already cached data.")
     } else {
       val sparkSession = query.sparkSession
+      val qe = sparkSession.sessionState.executePlan(planToCache)
       val inMemoryRelation = InMemoryRelation(
         sparkSession.sessionState.conf.useCompression,
         sparkSession.sessionState.conf.columnBatchSize, storageLevel,
-        sparkSession.sessionState.executePlan(planToCache).executedPlan,
+        qe.executedPlan,
         tableName,
-        planToCache)
+        optimizedPlan = qe.optimizedPlan)
       this.synchronized {
         if (lookupCachedData(planToCache).nonEmpty) {
           logWarning("Data has already been cached.")
@@ -184,10 +185,10 @@ class CacheManager extends Logging {
     }
     needToRecache.map { cd =>
       cd.cachedRepresentation.cacheBuilder.clearCache()
-      val plan = spark.sessionState.executePlan(cd.plan).executedPlan
+      val qe = spark.sessionState.executePlan(cd.plan)
       val newCache = InMemoryRelation(
-        cacheBuilder = cd.cachedRepresentation.cacheBuilder.copy(cachedPlan = plan),
-        logicalPlan = cd.plan)
+        cacheBuilder = cd.cachedRepresentation.cacheBuilder.copy(cachedPlan = qe.executedPlan),
+        optimizedPlan = qe.optimizedPlan)
       val recomputedPlan = cd.copy(cachedRepresentation = newCache)
       this.synchronized {
         if (lookupCachedData(recomputedPlan.plan).nonEmpty) {

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala
@@ -146,17 +146,17 @@ object InMemoryRelation {
       storageLevel: StorageLevel,
       child: SparkPlan,
       tableName: Option[String],
-      logicalPlan: LogicalPlan): InMemoryRelation = {
+      optimizedPlan: LogicalPlan): InMemoryRelation = {
     val cacheBuilder = CachedRDDBuilder(useCompression, batchSize, storageLevel, child, tableName)
-    val relation = new InMemoryRelation(child.output, cacheBuilder, logicalPlan.outputOrdering)
-    relation.statsOfPlanToCache = logicalPlan.stats
+    val relation = new InMemoryRelation(child.output, cacheBuilder, optimizedPlan.outputOrdering)
+    relation.statsOfPlanToCache = optimizedPlan.stats
     relation
   }
 
-  def apply(cacheBuilder: CachedRDDBuilder, logicalPlan: LogicalPlan): InMemoryRelation = {
+  def apply(cacheBuilder: CachedRDDBuilder, optimizedPlan: LogicalPlan): InMemoryRelation = {
     val relation = new InMemoryRelation(
-      cacheBuilder.cachedPlan.output, cacheBuilder, logicalPlan.outputOrdering)
-    relation.statsOfPlanToCache = logicalPlan.stats
+      cacheBuilder.cachedPlan.output, cacheBuilder, optimizedPlan.outputOrdering)
+    relation.statsOfPlanToCache = optimizedPlan.stats
     relation
   }
 

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala
@@ -246,4 +246,24 @@ class DatasetCacheSuite extends QueryTest with SharedSQLContext with TimeLimits
     assert(df2LimitInnerPlan.isDefined &&
       df2LimitInnerPlan.get.find(_.isInstanceOf[InMemoryTableScanExec]).isEmpty)
   }
+
+  test("SPARK-27739 Save stats from optimized plan") {
+    withTable("a") {
+      spark.range(4)
+        .selectExpr("id", "id % 2 AS p")
+        .write
+        .partitionBy("p")
+        .saveAsTable("a")
+
+      val df = sql("SELECT * FROM a WHERE p = 0")
+      df.cache()
+      df.count()
+      df.queryExecution.withCachedData match {
+        case i: InMemoryRelation =>
+          // Optimized plan has non-default size in bytes
+          assert(i.statsOfPlanToCache.sizeInBytes !==
+            df.sparkSession.sessionState.conf.defaultSizeInBytes)
+      }
+    }
+  }
 }