-
Notifications
You must be signed in to change notification settings - Fork 2.5k
[HUDI-6635] Hudi Spark Integration Redesign MOR and Bootstrap reading #9276
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 29 commits
0217715
9325f13
37d3b93
4e77337
54a4e7e
ee25b44
3a1eadb
67f298d
6f357c6
d28be3b
d7612ac
bb2cd1b
9ea1398
7b7d90e
a6f97ed
a52dacd
0e91a54
bb0acc5
3e2626a
c05f009
662f3b3
72c0bb1
793964b
646edf5
663aa88
3d6f947
4e33648
26bb36c
b8f1f89
2089508
eb91c86
d6025b9
87e8f76
54bb07b
b695af3
b54a365
f179c08
293ae46
6ce7ff6
af76828
1875a19
ef8eaad
89a4c7f
def394b
f13bb9c
e5a805e
65cfcdf
c458337
44a63c8
fa681fd
83f6b8b
996c798
69aa9e6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,36 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| package org.apache.hudi | ||
|
|
||
| import org.apache.hudi.common.config.HoodieMetadataConfig | ||
| import org.apache.hudi.internal.schema.InternalSchema | ||
| import org.apache.spark.sql.types.StructType | ||
|
|
||
| case class HoodieTableSchema(structTypeSchema: StructType, avroSchemaStr: String, internalSchema: Option[InternalSchema] = None) | ||
|
|
||
| case class HoodieTableState(tablePath: String, | ||
jonvex marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| latestCommitTimestamp: Option[String], | ||
| recordKeyField: String, | ||
| preCombineFieldOpt: Option[String], | ||
| usesVirtualKeys: Boolean, | ||
| recordPayloadClassName: String, | ||
| metadataConfig: HoodieMetadataConfig, | ||
| recordMergerImpls: List[String], | ||
| recordMergerStrategy: String) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,7 +28,6 @@ import org.apache.hudi.common.table.timeline.HoodieInstant | |
| import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} | ||
| import org.apache.hudi.common.util.ConfigUtils | ||
| import org.apache.hudi.common.util.ValidationUtils.checkState | ||
| import org.apache.hudi.config.HoodieBootstrapConfig.DATA_QUERIES_ONLY | ||
| import org.apache.hudi.config.HoodieWriteConfig.{WRITE_CONCURRENCY_MODE, SPARK_SQL_MERGE_INTO_PREPPED_KEY} | ||
| import org.apache.hudi.exception.HoodieException | ||
| import org.apache.hudi.util.PathUtils | ||
|
|
@@ -102,8 +101,7 @@ class DefaultSource extends RelationProvider | |
| ) | ||
| } else { | ||
| Map() | ||
| }) ++ DataSourceOptionsHelper.parametersWithReadDefaults(optParams + | ||
| (DATA_QUERIES_ONLY.key() -> sqlContext.getConf(DATA_QUERIES_ONLY.key(), optParams.getOrElse(DATA_QUERIES_ONLY.key(), DATA_QUERIES_ONLY.defaultValue())))) | ||
jonvex marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| }) ++ DataSourceOptionsHelper.parametersWithReadDefaults(optParams) | ||
|
|
||
| // Get the table base path | ||
| val tablePath = if (globPaths.nonEmpty) { | ||
|
|
@@ -247,6 +245,9 @@ object DefaultSource { | |
| Option(schema) | ||
| } | ||
|
|
||
| val useMORBootstrapFF = parameters.getOrElse(MOR_BOOTSTRAP_FILE_READER.key, | ||
| MOR_BOOTSTRAP_FILE_READER.defaultValue).toBoolean && (globPaths == null || globPaths.isEmpty) | ||
|
||
|
|
||
| if (metaClient.getCommitsTimeline.filterCompletedInstants.countInstants() == 0) { | ||
| new EmptyRelation(sqlContext, resolveSchema(metaClient, parameters, Some(schema))) | ||
| } else if (isCdcQuery) { | ||
|
|
@@ -262,16 +263,30 @@ object DefaultSource { | |
| new IncrementalRelation(sqlContext, parameters, userSchema, metaClient) | ||
|
|
||
| case (MERGE_ON_READ, QUERY_TYPE_SNAPSHOT_OPT_VAL, false) => | ||
| new MergeOnReadSnapshotRelation(sqlContext, parameters, metaClient, globPaths, userSchema) | ||
| val relation = new MergeOnReadSnapshotRelation(sqlContext, parameters, metaClient, globPaths, userSchema) | ||
| if (useMORBootstrapFF && !relation.hasSchemaOnRead) { | ||
| relation.toHadoopFsRelation | ||
| } else { | ||
| relation | ||
| } | ||
|
|
||
| case (MERGE_ON_READ, QUERY_TYPE_INCREMENTAL_OPT_VAL, _) => | ||
| new MergeOnReadIncrementalRelation(sqlContext, parameters, metaClient, userSchema) | ||
|
|
||
| case (MERGE_ON_READ, QUERY_TYPE_SNAPSHOT_OPT_VAL, true) => | ||
| new HoodieBootstrapMORRelation(sqlContext, userSchema, globPaths, metaClient, parameters) | ||
|
|
||
| val relation = new HoodieBootstrapMORRelation(sqlContext, userSchema, globPaths, metaClient, parameters) | ||
| if (useMORBootstrapFF && !relation.hasSchemaOnRead) { | ||
| relation.toHadoopFsRelation | ||
| } else { | ||
| relation | ||
| } | ||
| case (_, _, true) => | ||
| resolveHoodieBootstrapRelation(sqlContext, globPaths, userSchema, metaClient, parameters) | ||
| val relation = new HoodieBootstrapRelation(sqlContext, userSchema, globPaths, metaClient, parameters) | ||
| if (useMORBootstrapFF && !relation.hasSchemaOnRead) { | ||
| relation.toHadoopFsRelation | ||
| } else { | ||
| relation | ||
jonvex marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| case (_, _, _) => | ||
| throw new HoodieException(s"Invalid query type : $queryType for tableType: $tableType," + | ||
|
|
@@ -280,24 +295,6 @@ object DefaultSource { | |
| } | ||
| } | ||
|
|
||
| private def resolveHoodieBootstrapRelation(sqlContext: SQLContext, | ||
| globPaths: Seq[Path], | ||
| userSchema: Option[StructType], | ||
| metaClient: HoodieTableMetaClient, | ||
| parameters: Map[String, String]): BaseRelation = { | ||
| val enableFileIndex = HoodieSparkConfUtils.getConfigValue(parameters, sqlContext.sparkSession.sessionState.conf, | ||
| ENABLE_HOODIE_FILE_INDEX.key, ENABLE_HOODIE_FILE_INDEX.defaultValue.toString).toBoolean | ||
| val isSchemaEvolutionEnabledOnRead = HoodieSparkConfUtils.getConfigValue(parameters, | ||
| sqlContext.sparkSession.sessionState.conf, DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.key, | ||
| DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.defaultValue.toString).toBoolean | ||
| if (!enableFileIndex || isSchemaEvolutionEnabledOnRead | ||
| || globPaths.nonEmpty || !parameters.getOrElse(DATA_QUERIES_ONLY.key, DATA_QUERIES_ONLY.defaultValue).toBoolean) { | ||
| HoodieBootstrapRelation(sqlContext, userSchema, globPaths, metaClient, parameters + (DATA_QUERIES_ONLY.key() -> "false")) | ||
| } else { | ||
| HoodieBootstrapRelation(sqlContext, userSchema, globPaths, metaClient, parameters).toHadoopFsRelation | ||
| } | ||
| } | ||
|
|
||
| private def resolveBaseFileOnlyRelation(sqlContext: SQLContext, | ||
| globPaths: Seq[Path], | ||
| userSchema: Option[StructType], | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you add elaboration on what this config controls? Currently, it's not apparent.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added this config when we were trying to find ways to speed up bootstrap reading. What it did was skip the skeleton/base file stitching and not return the meta fields. With this PR, it isn't necessary anymore, because now if you don't request any meta columns, that will automatically happen inside the new reader. So I think it is just confusing to introduce this config and then have it become unnecessary very soon.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Got it. Could you update the config documentation? As discussed, we'll keep this config since it can still be used when the existing file format and relations are used.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
By the way, the config documentation update should be in a separate PR.