-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-34331][SQL] Speed up DS v2 metadata col resolution #31440
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -964,11 +964,37 @@ class Analyzer(override val catalogManager: CatalogManager) | |
| * columns are not accidentally selected by *. | ||
| */ | ||
| object AddMetadataColumns extends Rule[LogicalPlan] { | ||
| import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._ | ||
|
|
||
| private def hasMetadataCol(plan: LogicalPlan): Boolean = { | ||
| plan.expressions.exists(_.find { | ||
| case a: Attribute => a.isMetadataCol | ||
| case _ => false | ||
| }.isDefined) | ||
| } | ||
|
|
||
| private def addMetadataCol(plan: LogicalPlan): LogicalPlan = plan match { | ||
| case r: DataSourceV2Relation => r.withMetadataColumns() | ||
| case _ => plan.withNewChildren(plan.children.map(addMetadataCol)) | ||
| } | ||
|
|
||
| def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp { | ||
| case node if node.resolved && node.children.nonEmpty && node.missingInput.nonEmpty => | ||
| node resolveOperatorsUp { | ||
| case rel: DataSourceV2Relation => | ||
| rel.withMetadataColumns() | ||
| case node if node.children.nonEmpty && node.resolved && hasMetadataCol(node) => | ||
| val inputAttrs = AttributeSet(node.children.flatMap(_.output)) | ||
| val metaCols = node.expressions.flatMap(_.collect { | ||
| case a: Attribute if a.isMetadataCol && !inputAttrs.contains(a) => a | ||
| }) | ||
| if (metaCols.isEmpty) { | ||
| node | ||
| } else { | ||
| val newNode = addMetadataCol(node) | ||
| // We should not change the output schema of the plan. We should project away the extr | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: extr -> extra |
||
| // metadata columns if necessary. | ||
| if (newNode.sameOutput(node)) { | ||
| newNode | ||
| } else { | ||
| Project(node.output, newNode) | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,12 +21,14 @@ import scala.collection.JavaConverters._ | |
|
|
||
| import org.apache.spark.sql.AnalysisException | ||
| import org.apache.spark.sql.catalyst.analysis.{PartitionSpec, ResolvedPartitionSpec, UnresolvedPartitionSpec} | ||
| import org.apache.spark.sql.catalyst.expressions.AttributeReference | ||
| import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} | ||
| import org.apache.spark.sql.connector.catalog.{MetadataColumn, SupportsAtomicPartitionManagement, SupportsDelete, SupportsPartitionManagement, SupportsRead, SupportsWrite, Table, TableCapability} | ||
| import org.apache.spark.sql.types.{StructField, StructType} | ||
| import org.apache.spark.sql.types.{MetadataBuilder, StructField, StructType} | ||
| import org.apache.spark.sql.util.CaseInsensitiveStringMap | ||
|
|
||
| object DataSourceV2Implicits { | ||
| private val METADATA_COL_ATTR_KEY = "__metadata_col" | ||
|
|
||
| implicit class TableHelper(table: Table) { | ||
| def asReadable: SupportsRead = { | ||
| table match { | ||
|
|
@@ -83,7 +85,8 @@ object DataSourceV2Implicits { | |
| implicit class MetadataColumnsHelper(metadata: Array[MetadataColumn]) { | ||
| def asStruct: StructType = { | ||
| val fields = metadata.map { metaCol => | ||
| val field = StructField(metaCol.name, metaCol.dataType, metaCol.isNullable) | ||
| val fieldMeta = new MetadataBuilder().putBoolean(METADATA_COL_ATTR_KEY, true).build() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: can be created outside the loop (or even at |
||
| val field = StructField(metaCol.name, metaCol.dataType, metaCol.isNullable, fieldMeta) | ||
| Option(metaCol.comment).map(field.withComment).getOrElse(field) | ||
| } | ||
| StructType(fields) | ||
|
|
@@ -92,6 +95,11 @@ object DataSourceV2Implicits { | |
| def toAttributes: Seq[AttributeReference] = asStruct.toAttributes | ||
| } | ||
|
|
||
| implicit class MetadataColumnHelper(attr: Attribute) { | ||
| def isMetadataCol: Boolean = attr.metadata.contains(METADATA_COL_ATTR_KEY) && | ||
| attr.metadata.getBoolean(METADATA_COL_ATTR_KEY) | ||
| } | ||
|
|
||
| implicit class OptionsHelper(options: Map[String, String]) { | ||
| def asOptions: CaseInsensitiveStringMap = { | ||
| new CaseInsensitiveStringMap(options.asJava) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -36,7 +36,7 @@ import org.apache.spark.sql.connector.read._ | |
| import org.apache.spark.sql.connector.write._ | ||
| import org.apache.spark.sql.connector.write.streaming.{StreamingDataWriterFactory, StreamingWrite} | ||
| import org.apache.spark.sql.sources.{And, EqualNullSafe, EqualTo, Filter, IsNotNull, IsNull} | ||
| import org.apache.spark.sql.types.{DataType, DateType, StringType, StructField, StructType, TimestampType} | ||
| import org.apache.spark.sql.types.{DataType, DateType, IntegerType, StringType, StructField, StructType, TimestampType} | ||
| import org.apache.spark.sql.util.CaseInsensitiveStringMap | ||
| import org.apache.spark.unsafe.types.UTF8String | ||
|
|
||
|
|
@@ -61,7 +61,7 @@ class InMemoryTable( | |
|
|
||
| private object IndexColumn extends MetadataColumn { | ||
| override def name: String = "index" | ||
| override def dataType: DataType = StringType | ||
| override def dataType: DataType = IntegerType | ||
|
||
| override def comment: String = "Metadata column used to conflict with a data column" | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No matter how many meta cols we actually refer to, we always add all meta cols, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yea, it's good enough because: