apache · karenfeng · Mar 31, 2021 · Mar 31, 2021 · Apr 1, 2021 · Apr 1, 2021
diff --git a/...alyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala b/...alyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala
@@ -61,4 +61,6 @@ case class EventTimeWatermark(
       a
     }
   }
+
+  override val metadataOutput: Seq[Attribute] = child.metadataOutput
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -34,8 +34,11 @@ abstract class LogicalPlan
   with QueryPlanConstraints
   with Logging {
 
-  /** Metadata fields that can be projected from this node */
-  def metadataOutput: Seq[Attribute] = children.flatMap(_.metadataOutput)
+  /**
+   * Metadata fields that can be projected from this node.
+   * Should be non-empty if the plan propagates its children's output.
+   */
+  def metadataOutput: Seq[Attribute] = Nil
 
   /** Returns true if this subtree has data from a streaming data source. */
   def isStreaming: Boolean = children.exists(_.isStreaming)

diff --git a/...st/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/...st/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
@@ -31,7 +31,7 @@ import org.apache.spark.sql.types._
 import org.apache.spark.util.random.RandomSampler
 
 /**
- * When planning take() or collect() operations, this special node that is inserted at the top of
+ * When planning take() or collect() operations, this special node is inserted at the top of
  * the logical plan before invoking the query planner.
  *
  * Rules can pattern-match on this node in order to apply transformations that only take effect
@@ -40,6 +40,7 @@ import org.apache.spark.util.random.RandomSampler
 case class ReturnAnswer(child: LogicalPlan) extends UnaryNode {
   override def maxRows: Option[Long] = child.maxRows
   override def output: Seq[Attribute] = child.output
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
 }
 
 /**
@@ -51,6 +52,7 @@ case class ReturnAnswer(child: LogicalPlan) extends UnaryNode {
  */
 case class Subquery(child: LogicalPlan, correlated: Boolean) extends OrderPreservingUnaryNode {
   override def output: Seq[Attribute] = child.output
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
 }
 
 object Subquery {
@@ -134,11 +136,13 @@ case class Generate(
   }
 
   def output: Seq[Attribute] = requiredChildOutput ++ qualifiedGeneratorOutput
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
 }
 
 case class Filter(condition: Expression, child: LogicalPlan)
   extends OrderPreservingUnaryNode with PredicateHelper {
   override def output: Seq[Attribute] = child.output
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
 
   override def maxRows: Option[Long] = child.maxRows
 
@@ -364,6 +368,17 @@ case class Join(
     }
   }
 
+  override def metadataOutput: Seq[Attribute] = {
+    joinType match {
+      case ExistenceJoin(_) =>
+        left.metadataOutput
+      case LeftExistence(_) =>
+        left.metadataOutput
+      case _ =>
+        children.flatMap(_.metadataOutput)
+    }
+  }
+
   override protected lazy val validConstraints: ExpressionSet = {
     joinType match {
       case _: InnerLike if condition.isDefined =>
@@ -520,6 +535,8 @@ object View {
 case class With(child: LogicalPlan, cteRelations: Seq[(String, SubqueryAlias)]) extends UnaryNode {
   override def output: Seq[Attribute] = child.output
 
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
+
   override def simpleString(maxFields: Int): String = {
     val cteAliases = truncatedString(cteRelations.map(_._1), "[", ", ", "]", maxFields)
     s"CTE $cteAliases"
@@ -532,6 +549,7 @@ case class WithWindowDefinition(
     windowDefinitions: Map[String, WindowSpecDefinition],
     child: LogicalPlan) extends UnaryNode {
   override def output: Seq[Attribute] = child.output
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
 }
 
 /**
@@ -545,6 +563,7 @@ case class Sort(
     global: Boolean,
     child: LogicalPlan) extends UnaryNode {
   override def output: Seq[Attribute] = child.output
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
   override def maxRows: Option[Long] = child.maxRows
   override def outputOrdering: Seq[SortOrder] = order
 }
@@ -669,6 +688,7 @@ case class Window(
   override def maxRows: Option[Long] = child.maxRows
   override def output: Seq[Attribute] =
     child.output ++ windowExpressions.map(_.toAttribute)
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
 
   override def producedAttributes: AttributeSet = windowOutputSet
 
@@ -861,6 +881,7 @@ object Limit {
  */
 case class GlobalLimit(limitExpr: Expression, child: LogicalPlan) extends OrderPreservingUnaryNode {
   override def output: Seq[Attribute] = child.output
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
   override def maxRows: Option[Long] = {
     limitExpr match {
       case IntegerLiteral(limit) => Some(limit)
@@ -877,6 +898,7 @@ case class GlobalLimit(limitExpr: Expression, child: LogicalPlan) extends OrderP
  */
 case class LocalLimit(limitExpr: Expression, child: LogicalPlan) extends OrderPreservingUnaryNode {
   override def output: Seq[Attribute] = child.output
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
 
   override def maxRowsPerPartition: Option[Long] = {
     limitExpr match {
@@ -898,6 +920,7 @@ case class LocalLimit(limitExpr: Expression, child: LogicalPlan) extends OrderPr
  */
 case class Tail(limitExpr: Expression, child: LogicalPlan) extends OrderPreservingUnaryNode {
   override def output: Seq[Attribute] = child.output
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
   override def maxRows: Option[Long] = {
     limitExpr match {
       case IntegerLiteral(limit) => Some(limit)
@@ -924,11 +947,6 @@ case class SubqueryAlias(
     child.output.map(_.withQualifier(qualifierList))
   }
 
-  override def metadataOutput: Seq[Attribute] = {
-    val qualifierList = identifier.qualifier :+ alias
-    child.metadataOutput.map(_.withQualifier(qualifierList))
-  }
-
   override def doCanonicalize(): LogicalPlan = child.canonicalized
 }
 
@@ -983,6 +1001,7 @@ case class Sample(
 
   override def maxRows: Option[Long] = child.maxRows
   override def output: Seq[Attribute] = child.output
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
 }
 
 /**
@@ -991,6 +1010,7 @@ case class Sample(
 case class Distinct(child: LogicalPlan) extends UnaryNode {
   override def maxRows: Option[Long] = child.maxRows
   override def output: Seq[Attribute] = child.output
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
 }
 
 /**
@@ -1001,6 +1021,7 @@ abstract class RepartitionOperation extends UnaryNode {
   def numPartitions: Int
   override final def maxRows: Option[Long] = child.maxRows
   override def output: Seq[Attribute] = child.output
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
   def partitioning: Partitioning
 }
 
@@ -1095,6 +1116,7 @@ case class Deduplicate(
     child: LogicalPlan) extends UnaryNode {
   override def maxRows: Option[Long] = child.maxRows
   override def output: Seq[Attribute] = child.output
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
 }
 
 /**
@@ -1123,4 +1145,5 @@ case class CollectMetrics(
   }
 
   override def output: Seq[Attribute] = child.output
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala
@@ -31,6 +31,7 @@ case class UnresolvedHint(name: String, parameters: Seq[Any], child: LogicalPlan
 
   override lazy val resolved: Boolean = false
   override def output: Seq[Attribute] = child.output
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
 }
 
 /**
@@ -42,6 +43,8 @@ case class ResolvedHint(child: LogicalPlan, hints: HintInfo = HintInfo())
 
   override def output: Seq[Attribute] = child.output
 
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
+
   override def doCanonicalize(): LogicalPlan = child.canonicalized
 }
 

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/object.scala
@@ -238,6 +238,8 @@ case class TypedFilter(
 
   override def output: Seq[Attribute] = child.output
 
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
+
   def withObjectProducerChild(obj: LogicalPlan): Filter = {
     assert(obj.output.length == 1)
     Filter(typedCondition(obj.output.head), obj)
@@ -333,6 +335,8 @@ case class AppendColumns(
 
   override def output: Seq[Attribute] = child.output ++ newColumns
 
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
+
   def newColumns: Seq[Attribute] = serializer.map(_.toAttribute)
 }
 
@@ -346,6 +350,8 @@ case class AppendColumnsWithObject(
     child: LogicalPlan) extends ObjectConsumer {
 
   override def output: Seq[Attribute] = (childSerializer ++ newColumnsSerializer).map(_.toAttribute)
+
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
 }
 
 /** Factory for constructing new `MapGroups` nodes. */

diff --git a/...t/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/pythonLogicalOperators.scala b/...t/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/pythonLogicalOperators.scala
@@ -74,6 +74,8 @@ trait BaseEvalPython extends UnaryNode {
 
   override def output: Seq[Attribute] = child.output ++ resultAttrs
 
+  override def metadataOutput: Seq[Attribute] = child.metadataOutput
+
   override def producedAttributes: AttributeSet = AttributeSet(resultAttrs)
 }
 

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala
@@ -2690,6 +2690,73 @@ class DataSourceV2SQLSuite
     }
   }
 
+  test("SPARK-34923: do not propagate metadata columns through Project") {
+    val t1 = s"${catalogAndNamespace}table"
+    withTable(t1) {
+      sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format " +
+        "PARTITIONED BY (bucket(4, id), id)")
+      sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b'), (3, 'c')")
+
+      assertThrows[AnalysisException] {
+        sql(s"SELECT index, _partition from (SELECT id, data FROM $t1)")
+      }
+      assertThrows[AnalysisException] {
+        spark.table(t1).select("id", "data").select("index", "_partition")
+      }
+    }
+  }
+
+  test("SPARK-34923: propagate metadata columns through Filter") {
+    val t1 = s"${catalogAndNamespace}table"
+    withTable(t1) {
+      sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format " +
+        "PARTITIONED BY (bucket(4, id), id)")
+      sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b'), (3, 'c')")
+
+      val sqlQuery = spark.sql(s"SELECT id, data, index, _partition FROM $t1 WHERE id > 1")
+      val dfQuery = spark.table(t1).where("id > 1").select("id", "data", "index", "_partition")
+
+      Seq(sqlQuery, dfQuery).foreach { query =>
+        checkAnswer(query, Seq(Row(2, "b", 0, "0/2"), Row(3, "c", 0, "1/3")))
+      }
+    }
+  }
+
+  test("SPARK-34923: propagate metadata columns through Sort") {
+    val t1 = s"${catalogAndNamespace}table"
+    withTable(t1) {
+      sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format " +
+        "PARTITIONED BY (bucket(4, id), id)")
+      sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b'), (3, 'c')")
+
+      val sqlQuery = spark.sql(s"SELECT id, data, index, _partition FROM $t1 ORDER BY id")
+      val dfQuery = spark.table(t1).orderBy("id").select("id", "data", "index", "_partition")
+
+      Seq(sqlQuery, dfQuery).foreach { query =>
+        checkAnswer(query, Seq(Row(1, "a", 0, "3/1"), Row(2, "b", 0, "0/2"), Row(3, "c", 0, "1/3")))
+      }
+    }
+  }
+
+  test("SPARK-34923: propagate metadata columns through RepartitionBy") {
+    val t1 = s"${catalogAndNamespace}table"
+    withTable(t1) {
+      sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format " +
+        "PARTITIONED BY (bucket(4, id), id)")
+      sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b'), (3, 'c')")
+
+      val sqlQuery = spark.sql(
+        s"SELECT /*+ REPARTITION_BY_RANGE(3, id) */ id, data, index, _partition FROM $t1")
+      val tbl = spark.table(t1)
+      val dfQuery = tbl.repartitionByRange(3, tbl.col("id"))
+        .select("id", "data", "index", "_partition")
+
+      Seq(sqlQuery, dfQuery).foreach { query =>
+        checkAnswer(query, Seq(Row(1, "a", 0, "3/1"), Row(2, "b", 0, "0/2"), Row(3, "c", 0, "1/3")))
+      }
+    }
+  }
+
   private def testNotSupportedV2Command(sqlCommand: String, sqlParams: String): Unit = {
     val e = intercept[AnalysisException] {
       sql(s"$sqlCommand $sqlParams")
-Original file line number
+Diff line change
@@ Expand Up / @@ -61,4 +61,6 @@ case class EventTimeWatermark( @@
           a
         }
       }
+      override val metadataOutput: Seq[Attribute] = child.metadataOutput
     }