-
Notifications
You must be signed in to change notification settings - Fork 2.5k
[HUDI-5998] Speed up reads from bootstrapped tables in spark #8303
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
27b8276
221af92
4bce566
7f9a12f
4a20074
498f23e
78befd9
5bca709
6a7ae70
9cda89b
732fbf0
c6908a1
3cfef7f
76394b7
e779563
3ad5ae5
e4144fb
2e63f7a
0ed2644
b8772a7
f361b40
27375ab
551c52d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,6 +27,7 @@ import org.apache.hudi.common.model.HoodieTableType.{COPY_ON_WRITE, MERGE_ON_REA | |
| import org.apache.hudi.common.table.timeline.HoodieInstant | ||
| import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver} | ||
| import org.apache.hudi.common.util.ValidationUtils.checkState | ||
| import org.apache.hudi.config.HoodieBootstrapConfig | ||
| import org.apache.hudi.config.HoodieWriteConfig.WRITE_CONCURRENCY_MODE | ||
| import org.apache.hudi.exception.HoodieException | ||
| import org.apache.hudi.util.PathUtils | ||
|
|
@@ -100,7 +101,7 @@ class DefaultSource extends RelationProvider | |
| ) | ||
| } else { | ||
| Map() | ||
| }) ++ DataSourceOptionsHelper.parametersWithReadDefaults(optParams) | ||
| }) ++ DataSourceOptionsHelper.parametersWithReadDefaults(sqlContext.getAllConfs.filter(k => k._1.startsWith("hoodie.")) ++ optParams) | ||
|
||
|
|
||
| // Get the table base path | ||
| val tablePath = if (globPaths.nonEmpty) { | ||
|
|
@@ -261,7 +262,7 @@ object DefaultSource { | |
| new MergeOnReadIncrementalRelation(sqlContext, parameters, metaClient, userSchema) | ||
|
|
||
| case (_, _, true) => | ||
| new HoodieBootstrapRelation(sqlContext, userSchema, globPaths, metaClient, parameters) | ||
| resolveHoodieBootstrapRelation(sqlContext, globPaths, userSchema, metaClient, parameters) | ||
|
|
||
| case (_, _, _) => | ||
| throw new HoodieException(s"Invalid query type : $queryType for tableType: $tableType," + | ||
|
|
@@ -270,6 +271,21 @@ object DefaultSource { | |
| } | ||
| } | ||
|
|
||
| private def resolveHoodieBootstrapRelation(sqlContext: SQLContext, | ||
| globPaths: Seq[Path], | ||
| userSchema: Option[StructType], | ||
| metaClient: HoodieTableMetaClient, | ||
| parameters: Map[String, String]): BaseRelation = { | ||
| val enableFileIndex = HoodieSparkConfUtils.getConfigValue(parameters, sqlContext.sparkSession.sessionState.conf, | ||
| ENABLE_HOODIE_FILE_INDEX.key, ENABLE_HOODIE_FILE_INDEX.defaultValue.toString).toBoolean | ||
| if (!enableFileIndex || globPaths.nonEmpty || parameters.getOrElse(HoodieBootstrapConfig.DATA_QUERIES_ONLY.key(), "true") != "true") { | ||
|
||
| HoodieBootstrapRelation(sqlContext, userSchema, globPaths, metaClient, parameters) | ||
| } else { | ||
| HoodieBootstrapRelation(sqlContext, userSchema, globPaths, metaClient, parameters + | ||
| (HoodieBootstrapRelation.USE_FAST_BOOTSTRAP_READ -> "true")).toHadoopFsRelation | ||
| } | ||
| } | ||
|
|
||
| private def resolveBaseFileOnlyRelation(sqlContext: SQLContext, | ||
| globPaths: Seq[Path], | ||
| userSchema: Option[StructType], | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are there any integration tests for bootstrap where we test with this feature on?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I updated it so that this test now uses the feature for the queries that don't use the meta fields.