
Commit 7b1fade

[SPARK-54157][SQL] Fix refresh of DSv2 tables between Dataset executions
1 parent 8c76795 commit 7b1fade

10 files changed: +557 -7 lines changed

common/utils/src/main/resources/error/error-conditions.json

Lines changed: 25 additions & 0 deletions
@@ -2165,6 +2165,31 @@
     ],
     "sqlState" : "42000"
   },
+  "INCOMPATIBLE_TABLE_CHANGE_AFTER_ANALYSIS" : {
+    "message" : [
+      "Detected incompatible changes to table <tableName> after DataFrame/Dataset has been resolved and analyzed, meaning the underlying plan is out of sync. Please, re-create DataFrame/Dataset before attempting to execute the query again."
+    ],
+    "subClass" : {
+      "COLUMNS_MISMATCH" : {
+        "message" : [
+          "Data columns have changed:",
+          "<errors>"
+        ]
+      },
+      "METADATA_COLUMNS_MISMATCH" : {
+        "message" : [
+          "Metadata columns have changed:",
+          "<errors>"
+        ]
+      },
+      "TABLE_ID_MISMATCH" : {
+        "message" : [
+          "Table ID has changed from <capturedTableId> to <detectedTableId>."
+        ]
+      }
+    },
+    "sqlState" : "51024"
+  },
   "INCOMPATIBLE_VIEW_SCHEMA_CHANGE" : {
     "message" : [
       "The SQL query of view <viewName> has an incompatible schema change and column <colName> cannot be resolved. Expected <expectedNum> columns named <colName> but got <actualCols>.",

sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/Table.java

Lines changed: 9 additions & 0 deletions
@@ -50,6 +50,15 @@ public interface Table {
    */
   String name();
 
+  /**
+   * An ID of the table that can be used to reliably check if two table objects refer to the same
+   * metastore entity. If a table is dropped and recreated again with the same name, the new table ID
+   * must be different. This method must return null if connectors don't support the notion of table ID.
+   */
+  default String id() {
+    return null;
+  }
+
   /**
    * Returns the schema of this table. If the table is not readable and doesn't have a schema, an
    * empty schema can be returned here.
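
For connectors that do track table identity, a minimal Scala sketch of this contract could look as follows; ExampleTable and its constructor are illustrative, not part of this commit or of the Spark API.

import java.util.Collections

import org.apache.spark.sql.connector.catalog.{Table, TableCapability}
import org.apache.spark.sql.types.StructType

// Hypothetical connector table. A real connector would return the ID minted when the
// metastore entry was created, so the same entity always reports the same ID and a
// drop-and-recreate under the same name reports a different one.
class ExampleTable(tableName: String, tableSchema: StructType, tableId: String) extends Table {
  override def name(): String = tableName
  override def id(): String = tableId
  override def schema(): StructType = tableSchema
  override def capabilities(): java.util.Set[TableCapability] =
    Collections.emptySet[TableCapability]()
}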
sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/V2TableUtil.scala

Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.connector.catalog
+
+import java.util.Locale
+
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.SQLConfHelper
+import org.apache.spark.sql.catalyst.expressions.AttributeReference
+import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
+import org.apache.spark.sql.types.DataType
+import org.apache.spark.util.ArrayImplicits._
+
+private[sql] object V2TableUtil extends SQLConfHelper {
+
+  def validateCapturedColumns(table: Table, relation: DataSourceV2Relation): Seq[String] = {
+    validateCapturedColumns(table, relation.table.columns.toImmutableArraySeq)
+  }
+
+  def validateCapturedColumns(table: Table, originCols: Seq[Column]): Seq[String] = {
+    val errors = mutable.ArrayBuffer[String]()
+
+    val colsByNormalizedName = indexByNormalizedName(table.columns.toImmutableArraySeq)
+    val originColsByNormalizedName = indexByNormalizedName(originCols)
+
+    originColsByNormalizedName.foreach { case (normalizedName, originCol) =>
+      colsByNormalizedName.get(normalizedName) match {
+        case Some(col) =>
+          if (originCol.dataType != col.dataType || originCol.nullable != col.nullable) {
+            val oldType = formatType(originCol.dataType, originCol.nullable)
+            val newType = formatType(col.dataType, col.nullable)
+            errors += s"`${originCol.name}` type has changed from $oldType to $newType"
+          }
+        case None =>
+          errors += s"${formatColumn(originCol)} is missing"
+      }
+    }
+
+    colsByNormalizedName.foreach { case (normalizedName, col) =>
+      if (!originColsByNormalizedName.contains(normalizedName)) {
+        errors += s"${formatColumn(col)} has been added"
+      }
+    }
+
+    errors.toSeq
+  }
+
+  def validateCapturedMetadataColumns(
+      table: Table,
+      metaAttrs: Seq[AttributeReference]): Seq[String] = {
+    val errors = mutable.ArrayBuffer[String]()
+    val metaColsByNormalizedName = metadataColumnsByNormalizedName(table)
+
+    metaAttrs.foreach { metaAttr =>
+      val normalizedName = normalize(metaAttr.name)
+      metaColsByNormalizedName.get(normalizedName) match {
+        case Some(metaCol) =>
+          if (metaAttr.dataType != metaCol.dataType || metaAttr.nullable != metaCol.isNullable) {
+            val oldType = formatType(metaAttr.dataType, metaAttr.nullable)
+            val newType = formatType(metaCol.dataType, metaCol.isNullable)
+            errors += s"`${metaAttr.name}` type has changed from $oldType to $newType"
+          }
+        case None =>
+          errors += s"${formatAttr(metaAttr)} is missing"
+      }
+    }
+
+    errors.toSeq
+  }
+
+  private def metadataColumnsByNormalizedName(table: Table): Map[String, MetadataColumn] = {
+    table match {
+      case hasMeta: SupportsMetadataColumns =>
+        hasMeta.metadataColumns.map(col => normalize(col.name) -> col).toMap
+      case _ =>
+        Map.empty
+    }
+  }
+
+  private def formatColumn(col: Column): String = {
+    s"`${col.name}` ${formatType(col.dataType, col.nullable)}"
+  }
+
+  private def formatAttr(attr: AttributeReference): String = {
+    s"`${attr.name}` ${formatType(attr.dataType, attr.nullable)}"
+  }
+
+  private def formatType(dataType: DataType, nullable: Boolean): String = {
+    if (nullable) dataType.sql else s"${dataType.sql} NOT NULL"
+  }
+
+  private def indexByNormalizedName(cols: Seq[Column]): Map[String, Column] = {
+    cols.map(col => normalize(col.name) -> col).toMap
+  }
+
+  private def normalize(name: String): String = {
+    if (conf.caseSensitiveAnalysis) name else name.toLowerCase(Locale.ROOT)
+  }
+}
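
As a hedged sketch of how code inside Spark might consume these helpers together with the error builders added to QueryCompilationErrors below; the wrapper method is hypothetical, and how the fresh table is loaded is left out.

import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.connector.catalog.{Table, V2TableUtil}
import org.apache.spark.sql.errors.QueryCompilationErrors
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation

// Hypothetical validation step: compare a freshly loaded table against the columns and
// metadata columns captured at analysis time, and fail fast on any mismatch.
def assertCompatible(
    freshTable: Table,
    capturedRelation: DataSourceV2Relation,
    capturedMetaAttrs: Seq[AttributeReference]): Unit = {
  val columnErrors = V2TableUtil.validateCapturedColumns(freshTable, capturedRelation)
  if (columnErrors.nonEmpty) {
    throw QueryCompilationErrors.columnsChangedAfterAnalysis(freshTable.name(), columnErrors)
  }
  val metaErrors = V2TableUtil.validateCapturedMetadataColumns(freshTable, capturedMetaAttrs)
  if (metaErrors.nonEmpty) {
    throw QueryCompilationErrors.metadataColumnsChangedAfterAnalysis(freshTable.name(), metaErrors)
  }
}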

sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala

Lines changed: 32 additions & 0 deletions
@@ -2113,6 +2113,38 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat
     }
   }
 
+  def tableIdChangedAfterAnalysis(
+      tableName: String,
+      capturedTableId: String,
+      detectedTableId: String): Throwable = {
+    new AnalysisException(
+      errorClass = "INCOMPATIBLE_TABLE_CHANGE_AFTER_ANALYSIS.TABLE_ID_MISMATCH",
+      messageParameters = Map(
+        "tableName" -> toSQLId(tableName),
+        "capturedTableId" -> capturedTableId,
+        "detectedTableId" -> detectedTableId))
+  }
+
+  def columnsChangedAfterAnalysis(
+      tableName: String,
+      errors: Seq[String]): Throwable = {
+    new AnalysisException(
+      errorClass = "INCOMPATIBLE_TABLE_CHANGE_AFTER_ANALYSIS.COLUMNS_MISMATCH",
+      messageParameters = Map(
+        "tableName" -> toSQLId(tableName),
+        "errors" -> errors.mkString("\n- ", "\n- ", "")))
+  }
+
+  def metadataColumnsChangedAfterAnalysis(
+      tableName: String,
+      errors: Seq[String]): Throwable = {
+    new AnalysisException(
+      errorClass = "INCOMPATIBLE_TABLE_CHANGE_AFTER_ANALYSIS.METADATA_COLUMNS_MISMATCH",
+      messageParameters = Map(
+        "tableName" -> toSQLId(tableName),
+        "errors" -> errors.mkString("\n- ", "\n- ", "")))
+  }
+
   def numberOfPartitionsNotAllowedWithUnspecifiedDistributionError(): Throwable = {
     new AnalysisException(
       errorClass = "INVALID_WRITE_DISTRIBUTION.PARTITION_NUM_WITH_UNSPECIFIED_DISTRIBUTION",

sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala

Lines changed: 4 additions & 3 deletions
@@ -24,7 +24,8 @@ import org.apache.spark.sql.catalyst.plans.QueryPlan
 import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, ExposesMetadataColumns, Histogram, HistogramBin, LeafNode, LogicalPlan, Statistics}
 import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes
 import org.apache.spark.sql.catalyst.util.{quoteIfNeeded, truncatedString, CharVarcharUtils}
-import org.apache.spark.sql.connector.catalog.{CatalogPlugin, FunctionCatalog, Identifier, SupportsMetadataColumns, Table, TableCapability}
+import org.apache.spark.sql.connector.catalog.{CatalogPlugin, FunctionCatalog, Identifier, SupportsMetadataColumns, Table, TableCapability, TableCatalog}
+import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.CatalogHelper
 import org.apache.spark.sql.connector.read.{Scan, Statistics => V2Statistics, SupportsReportStatistics}
 import org.apache.spark.sql.connector.read.streaming.{Offset, SparkDataStream}
 import org.apache.spark.sql.util.CaseInsensitiveStringMap
@@ -259,10 +260,10 @@
 }
 
 object ExtractV2CatalogAndIdentifier {
-  def unapply(relation: DataSourceV2Relation): Option[(CatalogPlugin, Identifier)] = {
+  def unapply(relation: DataSourceV2Relation): Option[(TableCatalog, Identifier)] = {
     relation match {
       case DataSourceV2Relation(_, _, Some(catalog), Some(identifier), _, _) =>
-        Some((catalog, identifier))
+        Some((catalog.asTableCatalog, identifier))
       case _ =>
         None
     }
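
The extractor now yields a TableCatalog rather than a CatalogPlugin because the refresh path needs to re-load the table by identifier. A minimal sketch of such a caller; the helper itself is hypothetical:

import org.apache.spark.sql.connector.catalog.Table
import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, ExtractV2CatalogAndIdentifier}

// Hypothetical helper: load the current state of the table behind a captured relation,
// when the relation still knows its catalog and identifier.
def reloadTable(relation: DataSourceV2Relation): Option[Table] = relation match {
  case ExtractV2CatalogAndIdentifier(catalog, ident) =>
    // TableCatalog exposes loadTable(Identifier); CatalogPlugin alone does not.
    Some(catalog.loadTable(ident))
  case _ =>
    None
}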

sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTable.scala

Lines changed: 15 additions & 2 deletions
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.connector.catalog
 
 import java.util
+import java.util.{Objects, UUID}
 
 import org.apache.spark.sql.connector.catalog.constraints.Constraint
 import org.apache.spark.sql.connector.distributions.{Distribution, Distributions}
@@ -42,7 +43,8 @@ class InMemoryTable(
     numPartitions: Option[Int] = None,
     advisoryPartitionSize: Option[Long] = None,
     isDistributionStrictlyRequired: Boolean = true,
-    override val numRowsPerSplit: Int = Int.MaxValue)
+    override val numRowsPerSplit: Int = Int.MaxValue,
+    override val id: String = UUID.randomUUID().toString)
   extends InMemoryBaseTable(name, columns, partitioning, properties, constraints, distribution,
     ordering, numPartitions, advisoryPartitionSize, isDistributionStrictlyRequired,
     numRowsPerSplit) with SupportsDelete {
@@ -137,7 +139,8 @@
       numPartitions,
       advisoryPartitionSize,
       isDistributionStrictlyRequired,
-      numRowsPerSplit)
+      numRowsPerSplit,
+      id)
 
     dataMap.synchronized {
       dataMap.foreach { case (key, splits) =>
@@ -160,6 +163,16 @@
     copiedTable
   }
 
+  override def equals(other: Any): Boolean = other match {
+    case that: InMemoryTable =>
+      this.id == that.id && this.currentVersion() == that.currentVersion()
+    case _ => false
+  }
+
+  override def hashCode(): Int = {
+    Objects.hash(id, currentVersion())
+  }
+
   class InMemoryWriterBuilderWithOverWrite(override val info: LogicalWriteInfo)
     extends InMemoryWriterBuilder(info) with SupportsOverwrite {
 

sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryTableCatalog.scala

Lines changed: 17 additions & 1 deletion
@@ -57,7 +57,22 @@ class BasicInMemoryTableCatalog extends TableCatalog {
     tables.keySet.asScala.filter(_.namespace.sameElements(namespace)).toArray
   }
 
+  // load table for scans
   override def loadTable(ident: Identifier): Table = {
+    Option(tables.get(ident)) match {
+      case Some(table: InMemoryTable) =>
+        table.copy() // copy to validate logical table equality
+      case Some(table) =>
+        table
+      case _ =>
+        throw new NoSuchTableException(ident.asMultipartIdentifier)
+    }
+  }
+
+  // load table for writes
+  override def loadTable(
+      ident: Identifier,
+      writePrivileges: util.Set[TableWritePrivilege]): Table = {
     Option(tables.get(ident)) match {
       case Some(table) =>
         table
@@ -169,7 +184,8 @@
       columns = CatalogV2Util.structTypeToV2Columns(schema),
       partitioning = finalPartitioning,
       properties = properties,
-      constraints = constraints)
+      constraints = constraints,
+      id = table.id)
       .alterTableWithData(table.data, schema)
     newTable.setCurrentVersion(currentVersion)
     changes.foreach {
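
A test-style sketch of the equality semantics introduced by the last two files, assuming the in-memory test catalog is on the classpath; the catalog, identifier, and column names are illustrative:

import java.util.Collections

import org.apache.spark.sql.connector.catalog.{Column, Identifier, InMemoryTableCatalog}
import org.apache.spark.sql.connector.expressions.Transform
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.util.CaseInsensitiveStringMap

val catalog = new InMemoryTableCatalog
catalog.initialize("testcat", CaseInsensitiveStringMap.empty())

val ident = Identifier.of(Array("ns"), "t")
val columns = Array(Column.create("i", IntegerType))
val noPartitions = Array.empty[Transform]
catalog.createTable(ident, columns, noPartitions, Collections.emptyMap[String, String])

// loadTable now returns a copy, so two loads are distinct objects that still compare
// equal: same table ID, same current version.
assert(catalog.loadTable(ident) == catalog.loadTable(ident))

// Dropping and recreating under the same name mints a fresh ID, so a table object
// captured before the drop no longer equals what loadTable returns afterwards.
val captured = catalog.loadTable(ident)
catalog.dropTable(ident)
catalog.createTable(ident, columns, noPartitions, Collections.emptyMap[String, String])
assert(captured != catalog.loadTable(ident))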

sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala

Lines changed: 9 additions & 1 deletion
@@ -41,6 +41,7 @@ import org.apache.spark.sql.catalyst.util.StringUtils.PlanStringConcat
 import org.apache.spark.sql.catalyst.util.truncatedString
 import org.apache.spark.sql.classic.SparkSession
 import org.apache.spark.sql.execution.adaptive.{AdaptiveExecutionContext, InsertAdaptiveSparkPlan}
+import org.apache.spark.sql.execution.analysis.RefreshTableVersions
 import org.apache.spark.sql.execution.bucketing.{CoalesceBucketsInJoin, DisableUnnecessaryBucketedScan}
 import org.apache.spark.sql.execution.dynamicpruning.PlanDynamicPruningFilters
 import org.apache.spark.sql.execution.exchange.EnsureRequirements
@@ -203,8 +204,15 @@
     }
   }
 
+  // refresh table versions before looking up cache
+  private val lazyTableVersionsPinned = LazyTry {
+    RefreshTableVersions(commandExecuted)
+  }
+
+  private[sql] def tableVersionsPinned: LogicalPlan = lazyTableVersionsPinned.get
+
   private val lazyNormalized = LazyTry {
-    QueryExecution.normalize(sparkSession, commandExecuted, Some(tracker))
+    QueryExecution.normalize(sparkSession, tableVersionsPinned, Some(tracker))
   }
 
   // The plan that has been normalized by custom rules, so that it's more likely to hit cache.
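
Taken together, the intended behavior between executions of the same Dataset is roughly the following, assuming a SparkSession `spark` with a DSv2 catalog registered as `testcat`; the table and column names are made up:

// Analysis captures testcat.ns.t at some version.
val df = spark.table("testcat.ns.t")
df.collect()

// Compatible change: a new version of the same table. Before execution, QueryExecution
// now pins fresh table versions via RefreshTableVersions, so the re-run sees the new data.
spark.sql("INSERT INTO testcat.ns.t VALUES (1)")
df.collect()

// Incompatible change: the captured plan no longer matches the table, so re-execution
// fails with INCOMPATIBLE_TABLE_CHANGE_AFTER_ANALYSIS instead of silently using a stale plan.
spark.sql("ALTER TABLE testcat.ns.t DROP COLUMN i")
df.collect()  // throws AnalysisException (COLUMNS_MISMATCH)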
