[SPARK-15616] [SQL] CatalogRelation should fallback to HDFS size of partitions that are involved in Query for JoinSelection. #18193
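In short: when a query filters a partitioned Hive table on its partition columns, the rule added here estimates `sizeInBytes` from only the partitions the query actually touches, so JoinSelection can choose a broadcast join that whole-table statistics would rule out. A minimal sketch of the scenario, assuming a running SparkSession named `spark` and hypothetical tables `big_t` (partitioned by `dt`) and `small_t`; the two config keys are the flags the new rule checks (`fallBackToHdfsForStatsEnabled` and `metastorePartitionPruning`):

```scala
// Sketch only: big_t is a large Hive table partitioned by dt; small_t joins against
// a single partition of it. With both flags on, the new rule computes stats from the
// pruned partitions, which can make the join side small enough to broadcast.
spark.conf.set("spark.sql.statistics.fallBackToHdfs", "true")
spark.conf.set("spark.sql.hive.metastorePartitionPruning", "true")

spark.sql(
  """SELECT *
    |FROM big_t t JOIN small_t s ON t.id = s.id
    |WHERE t.dt = '2017-06-01'
  """.stripMargin).explain()
```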
Changes from 11 commits
```diff
@@ -21,16 +21,17 @@ import java.io.IOException
 import java.util.Locale
 
 import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.hadoop.hive.common.StatsSetupConst
 
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning._
-import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoDir, InsertIntoTable, LogicalPlan,
-  ScriptTransformation}
+import org.apache.spark.sql.catalyst.plans.logical.{Filter, InsertIntoDir, InsertIntoTable,
+  LogicalPlan, ScriptTransformation}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution._
-import org.apache.spark.sql.execution.command.{CreateTableCommand, DDLUtils}
+import org.apache.spark.sql.execution.command.{CommandUtils, CreateTableCommand, DDLUtils}
 import org.apache.spark.sql.execution.datasources.{CreateTable, LogicalRelation}
 import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, ParquetOptions}
 import org.apache.spark.sql.hive.execution._
```
```diff
@@ -139,6 +140,62 @@ class DetermineTableStats(session: SparkSession) extends Rule[LogicalPlan] {
   }
 }
 
+/**
+ *
+ * TODO: merge this with PruneFileSourcePartitions after we completely make hive as a data source.
+ */
+case class PruneHiveTablePartitions(
+    session: SparkSession) extends Rule[LogicalPlan] with PredicateHelper {
+  override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
+    case filter @ Filter(condition, relation: HiveTableRelation) if relation.isPartitioned =>
+      val predicates = splitConjunctivePredicates(condition)
+      val normalizedFilters = predicates.map { e =>
+        e transform {
+          case a: AttributeReference =>
+            a.withName(relation.output.find(_.semanticEquals(a)).get.name)
+        }
+      }
+      val partitionSet = AttributeSet(relation.partitionCols)
+      val pruningPredicates = normalizedFilters.filter { predicate =>
+        !predicate.references.isEmpty &&
+          predicate.references.subsetOf(partitionSet)
+      }
+      if (pruningPredicates.nonEmpty && session.sessionState.conf.fallBackToHdfsForStatsEnabled &&
+          session.sessionState.conf.metastorePartitionPruning) {
+        val prunedPartitions = session.sharedState.externalCatalog.listPartitionsByFilter(
+          relation.tableMeta.database,
+          relation.tableMeta.identifier.table,
+          pruningPredicates,
+          session.sessionState.conf.sessionLocalTimeZone)
+        val sizeInBytes = try {
```
Contributor: What if we already have partition-level statistics in the Hive metastore?

Author: Even if we already have partition-level statistics, we cannot know the total number of partitions, so we cannot compute the statistics for the pruned partitions.
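For context on this thread, a sketch of what per-partition statistics look like in the metastore: Hive stores them as string-valued parameters on each partition object, keyed by the `StatsSetupConst` constants the loop below reads. The byte values here are made up:

```scala
import org.apache.hadoop.hive.common.StatsSetupConst

// Hive keeps per-partition stats as string parameters; the diff reads them via the
// StatsSetupConst keys. Values below are illustrative only.
val parameters = Map(
  StatsSetupConst.TOTAL_SIZE -> "1048576",    // "totalSize": bytes on disk (possibly compressed)
  StatsSetupConst.RAW_DATA_SIZE -> "5242880"  // "rawDataSize": uncompressed data size
)
val totalSize = parameters.get(StatsSetupConst.TOTAL_SIZE).map(_.toLong)  // Some(1048576L)
```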
```diff
+          prunedPartitions.map { part =>
+            val totalSize = part.parameters.get(StatsSetupConst.TOTAL_SIZE).map(_.toLong)
+            val rawDataSize = part.parameters.get(StatsSetupConst.RAW_DATA_SIZE).map(_.toLong)
+            if (totalSize.isDefined && totalSize.get > 0L) {
```
Contributor: I think we should use rawDataSize first, because a 1 MB ORC file is equivalent to a 5 MB text file...

Author: @cenyuhai Yes, I think what you said is right. Thanks.
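A minimal sketch of the precedence the reviewer suggests, with the size-selection logic pulled out into a standalone helper; `estimatePartitionSize` and its by-name parameter `sizeFromHdfs` are hypothetical names, the latter standing in for the `CommandUtils.calculateLocationSize` fallback:

```scala
// Suggested order: prefer rawDataSize (uncompressed, a better input for broadcast
// decisions) over totalSize (on-disk, possibly compressed), and only fall back to
// scanning the filesystem when neither stat is present and positive.
def estimatePartitionSize(
    totalSize: Option[Long],
    rawDataSize: Option[Long],
    sizeFromHdfs: => Long): Long = {
  if (rawDataSize.exists(_ > 0L)) {
    rawDataSize.get
  } else if (totalSize.exists(_ > 0L)) {
    totalSize.get
  } else {
    sizeFromHdfs
  }
}

// e.g. estimatePartitionSize(Some(1048576L), Some(5242880L),
//   sys.error("not needed")) == 5242880L
```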
```diff
+              totalSize.get
+            } else if (rawDataSize.isDefined && rawDataSize.get > 0) {
+              rawDataSize.get
+            } else {
+              CommandUtils.calculateLocationSize(
+                session.sessionState, relation.tableMeta.identifier, part.storage.locationUri)
+            }
+          }.sum
```
Contributor: If there are too many partitions, this will be very slow.
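One conceivable mitigation for this concern (not something this PR does): compute the per-partition location sizes in parallel rather than one at a time. A sketch assuming Scala parallel collections and the same names as the diff:

```scala
// Hypothetical mitigation, not part of this PR: size the pruned partition locations
// in parallel. IOException handling is omitted here; the real code wraps the whole
// computation in a try/catch that falls back to defaultSizeInBytes.
val sizeInBytes = prunedPartitions.par.map { part =>
  CommandUtils.calculateLocationSize(
    session.sessionState, relation.tableMeta.identifier, part.storage.locationUri)
}.sum
```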
```diff
+        } catch {
+          case e: IOException =>
+            logWarning("Failed to get table size from hdfs.", e)
+            session.sessionState.conf.defaultSizeInBytes
+        }
+        val withStats = relation.tableMeta.copy(
+          stats = Some(CatalogStatistics(sizeInBytes = BigInt(sizeInBytes))))
+        val prunedCatalogRelation = relation.copy(tableMeta = withStats)
+        val filterExpression = predicates.reduceLeft(And)
+        Filter(filterExpression, prunedCatalogRelation)
+      } else {
+        filter
+      }
+  }
+}
+
 /**
  * Replaces generic operations with specific variants that are designed to work with Hive.
  *
```
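To summarize the rule's effect on the plan: the Filter node is rebuilt over a relation whose `tableMeta` now carries `CatalogStatistics` for only the pruned partitions, so the change shows up in `sizeInBytes` rather than in the plan shape. A schematic (not actual explain output, and `big_t` is the hypothetical table from the example above):

```
Before: Filter(t.dt = '2017-06-01')
        +- HiveTableRelation big_t  (stats: none, or whole-table size)

After:  Filter(t.dt = '2017-06-01')
        +- HiveTableRelation big_t  (stats: sizeInBytes of the pruned partitions only)
```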
|
|
Reviewer: Add a TODO that we should merge this rule with PruneFileSourcePartitions, after we completely make hive a data source.