[HUDI-4588][HUDI-4472] Addressing schema handling issues in the write path #6358
**HoodieWriteConfig.java**

```diff
@@ -198,7 +198,7 @@ public class HoodieWriteConfig extends HoodieConfig {
   public static final ConfigProperty<String> AVRO_SCHEMA_VALIDATE_ENABLE = ConfigProperty
       .key("hoodie.avro.schema.validate")
-      .defaultValue("false")
+      .defaultValue("true")
       .withDocumentation("Validate the schema used for the write against the latest schema, for backwards compatibility.");

   public static final ConfigProperty<String> INSERT_PARALLELISM_VALUE = ConfigProperty
```

**Contributor (Author):** The default is flipped here to make sure proper schema validation is run for every operation on the table.
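Because this flips a long-standing default, writers that previously relied on lax validation may now fail on mismatched schemas. Below is a minimal sketch of how a job could explicitly opt back out; the config key and the `shouldValidateAvroSchema()` accessor are the ones defined in this PR, while the table path is a hypothetical placeholder:

```java
import java.util.Properties;

import org.apache.hudi.config.HoodieWriteConfig;

public class SchemaValidationOptOut {
  public static void main(String[] args) {
    // Explicitly restore the pre-PR behavior by setting the key back to false.
    Properties props = new Properties();
    props.setProperty("hoodie.avro.schema.validate", "false");

    HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
        .withPath("/tmp/hudi/demo_table") // hypothetical base path
        .withProps(props)
        .build();

    // Reflects the explicit override rather than the new default.
    System.out.println(writeConfig.shouldValidateAvroSchema()); // false
  }
}
```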
```diff
@@ -438,15 +438,15 @@ public class HoodieWriteConfig extends HoodieConfig {
       + "OPTIMISTIC_CONCURRENCY_CONTROL: Multiple writers can operate on the table and exactly one of them succeed "
       + "if a conflict (writes affect the same file group) is detected.");

-  /**
-   * Currently the use this to specify the write schema.
-   */
-  public static final ConfigProperty<String> WRITE_SCHEMA = ConfigProperty
+  public static final ConfigProperty<String> WRITE_SCHEMA_OVERRIDE = ConfigProperty
       .key("hoodie.write.schema")
       .noDefaultValue()
-      .withDocumentation("The specified write schema. In most case, we do not need set this parameter,"
-          + " but for the case the write schema is not equal to the specified table schema, we can"
-          + " specify the write schema by this parameter. Used by MergeIntoHoodieTableCommand");
+      .withDocumentation("Config allowing to override writer's schema. This might be necessary in "
+          + "cases when writer's schema derived from the incoming dataset might actually be different from "
+          + "the schema we actually want to use when writing. This, for ex, could be the case for "
+          + "'partial-update' use-cases (like `MERGE INTO` Spark SQL statement for ex) where only "
+          + "a projection of the incoming dataset might be used to update the records in the existing table, "
+          + "prompting us to override the writer's schema");

   /**
    * HUDI-858 : There are users who had been directly using RDD APIs and have relied on a behavior in 0.4.x to allow
```

**Contributor:** Seems the …

**Contributor (Author):** Yeah, this config naming is confusing -- I've updated the field name but can't change the config key since it's publicly exposed. "Override" is a more appropriate name for it, provided that we pass a nominal writer schema (which this config is overriding) through a different config property.
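To make the "projection" wording in the new documentation concrete: in a partial-update flow only a subset of the table's columns is written, so the schema supplied via `hoodie.write.schema` would be a projection of the full table schema. A hedged sketch using plain Avro, where the record and field names are hypothetical:

```java
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;

public class ProjectedWriteSchema {
  public static void main(String[] args) {
    // Hypothetical full table schema with three columns.
    Schema tableSchema = SchemaBuilder.record("trip").fields()
        .requiredString("uuid")
        .requiredDouble("fare")
        .requiredString("driver")
        .endRecord();

    // A MERGE INTO that updates only `fare`, keyed by `uuid`, effectively
    // writes a projection of the table schema:
    Schema writeSchema = SchemaBuilder.record("trip_projection").fields()
        .requiredString("uuid")
        .requiredDouble("fare")
        .endRecord();

    // The JSON form of the projection is what would be supplied
    // through the hoodie.write.schema config.
    System.out.println(tableSchema.toString(true));
    System.out.println(writeSchema.toString(true));
  }
}
```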
```diff
@@ -938,6 +938,19 @@ public void setSchema(String schemaStr) {
     setValue(AVRO_SCHEMA_STRING, schemaStr);
   }

+  /**
+   * Returns schema used for writing records
+   *
+   * NOTE: This method respects {@link HoodieWriteConfig#WRITE_SCHEMA_OVERRIDE} being
+   * specified overriding original writing schema
+   */
+  public String getWriteSchema() {
+    if (props.containsKey(WRITE_SCHEMA_OVERRIDE.key())) {
+      return getString(WRITE_SCHEMA_OVERRIDE);
+    }
+    return getSchema();
+  }
+
   public String getInternalSchema() {
     return getString(INTERNAL_SCHEMA_STRING);
   }
```
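The relocated accessor keeps the same precedence as before: the override key wins when present, otherwise the regular Avro schema is returned. A small sketch of the expected behavior, assuming hypothetical schemas and a hypothetical base path:

```java
import java.util.Properties;

import org.apache.hudi.config.HoodieWriteConfig;

public class WriteSchemaPrecedence {
  public static void main(String[] args) {
    // Hypothetical one-field schemas, used only to show the fallback.
    String tableSchema = "{\"type\":\"record\",\"name\":\"t\",\"fields\":"
        + "[{\"name\":\"id\",\"type\":\"string\"}]}";
    String overrideSchema = "{\"type\":\"record\",\"name\":\"o\",\"fields\":"
        + "[{\"name\":\"id\",\"type\":\"string\"}]}";

    // Without the override key, getWriteSchema() falls back to getSchema().
    HoodieWriteConfig plain = HoodieWriteConfig.newBuilder()
        .withPath("/tmp/hudi/demo_table") // hypothetical base path
        .withSchema(tableSchema)
        .build();
    System.out.println(plain.getWriteSchema().equals(tableSchema)); // true

    // With hoodie.write.schema set, the override wins.
    Properties props = new Properties();
    props.setProperty("hoodie.write.schema", overrideSchema);
    HoodieWriteConfig overridden = HoodieWriteConfig.newBuilder()
        .withPath("/tmp/hudi/demo_table")
        .withSchema(tableSchema)
        .withProps(props)
        .build();
    System.out.println(overridden.getWriteSchema().equals(overrideSchema)); // true
  }
}
```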
```diff
@@ -962,21 +975,7 @@ public void setSchemaEvolutionEnable(boolean enable) {
     setValue(HoodieCommonConfig.SCHEMA_EVOLUTION_ENABLE, String.valueOf(enable));
   }

-  /**
-   * Get the write schema for written records.
-   *
-   * If the WRITE_SCHEMA has specified, we use the WRITE_SCHEMA.
-   * Or else we use the AVRO_SCHEMA as the write schema.
-   * @return
-   */
-  public String getWriteSchema() {
-    if (props.containsKey(WRITE_SCHEMA.key())) {
-      return getString(WRITE_SCHEMA);
-    }
-    return getSchema();
-  }
-
-  public boolean getAvroSchemaValidate() {
+  public boolean shouldValidateAvroSchema() {
     return getBoolean(AVRO_SCHEMA_VALIDATE_ENABLE);
   }
```
**HoodieWriteHandle.java**

```diff
@@ -34,8 +34,6 @@
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.exception.HoodieIOException;
-import org.apache.hudi.io.storage.HoodieFileWriter;
-import org.apache.hudi.io.storage.HoodieFileWriterFactory;
 import org.apache.hudi.table.HoodieTable;
 import org.apache.hudi.table.marker.WriteMarkersFactory;
```
```diff
@@ -81,20 +79,7 @@ public abstract class HoodieWriteHandle<T extends HoodieRecordPayload, I, K, O>
   public static IgnoreRecord IGNORE_RECORD = new IgnoreRecord();

-  /**
-   * The specified schema of the table. ("specified" denotes that this is configured by the client,
-   * as opposed to being implicitly fetched out of the commit metadata)
-   */
-  protected final Schema tableSchema;
-  protected final Schema tableSchemaWithMetaFields;
-
   /**
-   * The write schema. In most case the write schema is the same to the
-   * input schema. But if HoodieWriteConfig#WRITE_SCHEMA is specified,
-   * we use the WRITE_SCHEMA as the write schema.
-   *
-   * This is useful for the case of custom HoodieRecordPayload which do some conversion
-   * to the incoming record in it. e.g. the ExpressionPayload do the sql expression conversion
-   * to the input.
+   * Schema used to write records into data files
    */
   protected final Schema writeSchema;
   protected final Schema writeSchemaWithMetaFields;
```

**Contributor:** Based on my understanding of the latest master, the write schema (with or without meta fields) is only used for bootstrap base files. Make sure we have unit/functional tests around this, so it is not affected.
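For context on the surviving `writeSchemaWithMetaFields` companion field: the constructor derives it by asking `HoodieAvroUtils.addMetadataFields` to add Hudi's `_hoodie_*` meta columns (plus, optionally, the operation field) to the writer's schema. A minimal sketch, assuming a hypothetical record schema:

```java
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;

import org.apache.hudi.avro.HoodieAvroUtils;

public class MetaFieldsSketch {
  public static void main(String[] args) {
    // Hypothetical writer schema with two data columns.
    Schema writeSchema = SchemaBuilder.record("trip").fields()
        .requiredString("uuid")
        .requiredDouble("fare")
        .endRecord();

    // Mirrors the handle constructor; `false` here stands in for
    // config.allowOperationMetadataField().
    Schema withMetaFields = HoodieAvroUtils.addMetadataFields(writeSchema, false);

    // 5 _hoodie_* meta fields + 2 data fields
    System.out.println(withMetaFields.getFields().size()); // 7
  }
}
```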
```diff
@@ -120,8 +105,6 @@ protected HoodieWriteHandle(HoodieWriteConfig config, String instantTime, String
     super(config, Option.of(instantTime), hoodieTable);
     this.partitionPath = partitionPath;
     this.fileId = fileId;
-    this.tableSchema = overriddenSchema.orElseGet(() -> getSpecifiedTableSchema(config));
-    this.tableSchemaWithMetaFields = HoodieAvroUtils.addMetadataFields(tableSchema, config.allowOperationMetadataField());
     this.writeSchema = overriddenSchema.orElseGet(() -> getWriteSchema(config));
     this.writeSchemaWithMetaFields = HoodieAvroUtils.addMetadataFields(writeSchema, config.allowOperationMetadataField());
     this.timer = HoodieTimer.start();
```
```diff
@@ -132,25 +115,6 @@ protected HoodieWriteHandle(HoodieWriteConfig config, String instantTime, String
     schemaOnReadEnabled = !isNullOrEmpty(hoodieTable.getConfig().getInternalSchema());
   }

-  /**
-   * Get the specified table schema.
-   * @param config
-   * @return
-   */
-  private static Schema getSpecifiedTableSchema(HoodieWriteConfig config) {
-    return new Schema.Parser().parse(config.getSchema());
-  }
-
-  /**
-   * Get the schema, of the actual write.
-   *
-   * @param config
-   * @return
-   */
-  private static Schema getWriteSchema(HoodieWriteConfig config) {
-    return new Schema.Parser().parse(config.getWriteSchema());
-  }
-
   /**
    * Generate a write token based on the currently running spark task and its place in the spark dag.
    */
```
```diff
@@ -272,9 +236,8 @@ protected long getAttemptId() {
     return taskContextSupplier.getAttemptIdSupplier().get();
   }

-  protected HoodieFileWriter createNewFileWriter(String instantTime, Path path, HoodieTable<T, I, K, O> hoodieTable,
-      HoodieWriteConfig config, Schema schema, TaskContextSupplier taskContextSupplier) throws IOException {
-    return HoodieFileWriterFactory.getFileWriter(instantTime, path, hoodieTable, config, schema, taskContextSupplier);
+  private static Schema getWriteSchema(HoodieWriteConfig config) {
+    return new Schema.Parser().parse(config.getWriteSchema());
   }

   protected HoodieLogFormat.Writer createLogWriter(
```
**Contributor:** Does moving these imports follow the correct checkstyle?

**Contributor (Author):** Yeah, seems like I need to fix my setup.