Skip to content

Commit 25773ca

Browse files
committed
[HUDI-3523] Introduce AddPrimitiveColumnSchemaPostProcessor to support add new primitive column to the end of a schema
1 parent 797e7a6 commit 25773ca

File tree

7 files changed

+204
-13
lines changed

7 files changed

+204
-13
lines changed

hudi-utilities/src/main/java/org/apache/hudi/utilities/UtilHelpers.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics;
4646
import org.apache.hudi.utilities.exception.HoodieSchemaPostProcessException;
4747
import org.apache.hudi.utilities.exception.HoodieSourcePostProcessException;
48-
import org.apache.hudi.utilities.schema.ChainedSchemaPostProcessor;
48+
import org.apache.hudi.utilities.schema.postprocessor.ChainedSchemaPostProcessor;
4949
import org.apache.hudi.utilities.schema.DelegatingSchemaProvider;
5050
import org.apache.hudi.utilities.schema.RowBasedSchemaProvider;
5151
import org.apache.hudi.utilities.schema.SchemaPostProcessor;

hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/ChainedSchemaPostProcessor.java renamed to hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/postprocessor/ChainedSchemaPostProcessor.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,12 @@
1616
* limitations under the License.
1717
*/
1818

19-
package org.apache.hudi.utilities.schema;
19+
package org.apache.hudi.utilities.schema.postprocessor;
2020

2121
import org.apache.hudi.common.config.TypedProperties;
2222

2323
import org.apache.avro.Schema;
24+
import org.apache.hudi.utilities.schema.SchemaPostProcessor;
2425
import org.apache.spark.api.java.JavaSparkContext;
2526

2627
import java.util.List;

hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/DeleteSupportSchemaPostProcessor.java renamed to hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/postprocessor/DeleteSupportSchemaPostProcessor.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,13 @@
1616
* limitations under the License.
1717
*/
1818

19-
package org.apache.hudi.utilities.schema;
19+
package org.apache.hudi.utilities.schema.postprocessor;
2020

2121
import org.apache.hudi.common.config.TypedProperties;
2222
import org.apache.hudi.common.model.HoodieRecord;
2323

2424
import org.apache.avro.Schema;
25+
import org.apache.hudi.utilities.schema.SchemaPostProcessor;
2526
import org.apache.log4j.LogManager;
2627
import org.apache.log4j.Logger;
2728
import org.apache.spark.api.java.JavaSparkContext;

hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/DropColumnSchemaPostProcessor.java renamed to hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/postprocessor/DropColumnSchemaPostProcessor.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,14 @@
1616
* limitations under the License.
1717
*/
1818

19-
package org.apache.hudi.utilities.schema;
19+
package org.apache.hudi.utilities.schema.postprocessor;
2020

2121
import org.apache.hudi.common.config.TypedProperties;
2222
import org.apache.hudi.common.util.StringUtils;
2323
import org.apache.hudi.utilities.exception.HoodieSchemaPostProcessException;
2424

2525
import org.apache.avro.Schema;
26+
import org.apache.hudi.utilities.schema.SchemaPostProcessor;
2627
import org.apache.log4j.LogManager;
2728
import org.apache.log4j.Logger;
2829
import org.apache.spark.api.java.JavaSparkContext;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.hudi.utilities.schema.postprocessor.add;
20+
21+
import org.apache.hudi.common.config.TypedProperties;
22+
import org.apache.hudi.common.util.StringUtils;
23+
import org.apache.hudi.common.util.ValidationUtils;
24+
import org.apache.hudi.utilities.exception.HoodieSchemaPostProcessException;
25+
import org.apache.hudi.utilities.schema.SchemaPostProcessor;
26+
27+
import org.apache.avro.Schema;
28+
import org.apache.spark.api.java.JavaSparkContext;
29+
30+
import java.util.ArrayList;
31+
import java.util.List;
32+
import java.util.Locale;
33+
34+
/**
35+
* A {@link SchemaPostProcessor} used to add a new column of primitive types to given schema. Only supports adding one
36+
* column at a time.
37+
* <p>
38+
* The new column will be appended to the end.
39+
* <p>
40+
* TODO support complex types.
41+
*/
42+
public class AddPrimitiveColumnSchemaPostProcessor extends SchemaPostProcessor {
43+
44+
public AddPrimitiveColumnSchemaPostProcessor(TypedProperties props, JavaSparkContext jssc) {
45+
super(props, jssc);
46+
}
47+
48+
@Override
49+
public Schema processSchema(Schema schema) {
50+
String newColumnName = this.config.getString(BaseSchemaPostProcessorConfig.SCHEMA_POST_PROCESSOR_ADD_COLUMN_NAME_PROP.key());
51+
52+
if (schema.getField(newColumnName) != null) {
53+
throw new HoodieSchemaPostProcessException(String.format("Column %s already exist!", newColumnName));
54+
}
55+
56+
List<Schema.Field> sourceFields = schema.getFields();
57+
List<Schema.Field> targetFields = new ArrayList<>(sourceFields.size() + 1);
58+
59+
60+
for (Schema.Field sourceField : sourceFields) {
61+
targetFields.add(new Schema.Field(sourceField.name(), sourceField.schema(), sourceField.doc(), sourceField.defaultVal()));
62+
}
63+
64+
// add new column to the end
65+
targetFields.add(buildNewColumn());
66+
67+
return Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), false, targetFields);
68+
}
69+
70+
private Schema.Field buildNewColumn() {
71+
72+
String columnName = this.config.getString(BaseSchemaPostProcessorConfig.SCHEMA_POST_PROCESSOR_ADD_COLUMN_NAME_PROP.key());
73+
String type = this.config.getString(BaseSchemaPostProcessorConfig.SCHEMA_POST_PROCESSOR_ADD_COLUMN_TYPE_PROP.key()).toUpperCase(Locale.ROOT);
74+
String doc = this.config.getString(BaseSchemaPostProcessorConfig.SCHEMA_POST_PROCESSOR_ADD_COLUMN_DOC_PROP.key(), null);
75+
Object defaultValue = this.config.getOrDefault(BaseSchemaPostProcessorConfig.SCHEMA_POST_PROCESSOR_ADD_COLUMN_DEFAULT_PROP.key(),
76+
null);
77+
boolean nullable = this.config.getBoolean(BaseSchemaPostProcessorConfig.SCHEMA_POST_PROCESSOR_ADD_COLUMN_NULLABLE_PROP.key(),
78+
BaseSchemaPostProcessorConfig.SCHEMA_POST_PROCESSOR_ADD_COLUMN_NULLABLE_PROP.defaultValue());
79+
80+
ValidationUtils.checkArgument(!StringUtils.isNullOrEmpty(columnName));
81+
ValidationUtils.checkArgument(!StringUtils.isNullOrEmpty(type));
82+
ValidationUtils.checkArgument(!Schema.Type.NULL.getName().equals(type));
83+
84+
Schema newSchema = createSchema(type, nullable);
85+
86+
return new Schema.Field(columnName, newSchema, doc, defaultValue);
87+
}
88+
89+
private Schema createSchema(String type, boolean nullable) {
90+
Schema schema = Schema.create(Schema.Type.valueOf(type));
91+
if (nullable) {
92+
schema = Schema.createUnion(Schema.create(Schema.Type.NULL), schema);
93+
}
94+
return schema;
95+
}
96+
97+
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.hudi.utilities.schema.postprocessor.add;
20+
21+
import org.apache.hudi.common.config.ConfigProperty;
22+
23+
/**
24+
* Base configs to describe a primitive type column.
25+
*/
26+
public class BaseSchemaPostProcessorConfig {
27+
28+
public static final ConfigProperty<String> SCHEMA_POST_PROCESSOR_ADD_COLUMN_NAME_PROP = ConfigProperty
29+
.key("hoodie.deltastreamer.schemaprovider.schema_post_processor.add.column.name")
30+
.noDefaultValue()
31+
.withDocumentation("New column's name");
32+
33+
public static final ConfigProperty<String> SCHEMA_POST_PROCESSOR_ADD_COLUMN_TYPE_PROP = ConfigProperty
34+
.key("hoodie.deltastreamer.schemaprovider.schema_post_processor.add.column.type")
35+
.noDefaultValue()
36+
.withDocumentation("New column's type");
37+
38+
public static final ConfigProperty<Boolean> SCHEMA_POST_PROCESSOR_ADD_COLUMN_NULLABLE_PROP = ConfigProperty
39+
.key("hoodie.deltastreamer.schemaprovider.schema_post_processor.add.column.nullable")
40+
.defaultValue(true)
41+
.withDocumentation("New column's nullable");
42+
43+
public static final ConfigProperty<String> SCHEMA_POST_PROCESSOR_ADD_COLUMN_DEFAULT_PROP = ConfigProperty
44+
.key("hoodie.deltastreamer.schemaprovider.schema_post_processor.add.column.default")
45+
.noDefaultValue()
46+
.withDocumentation("New column's default value");
47+
48+
public static final ConfigProperty<String> SCHEMA_POST_PROCESSOR_ADD_COLUMN_DOC_PROP = ConfigProperty
49+
.key("hoodie.deltastreamer.schemaprovider.schema_post_processor.add.column.doc")
50+
.noDefaultValue()
51+
.withDocumentation("Docs about new column");
52+
53+
}

hudi-utilities/src/test/java/org/apache/hudi/utilities/TestSchemaPostProcessor.java

Lines changed: 47 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,25 +20,32 @@
2020

2121
import org.apache.hudi.common.config.TypedProperties;
2222
import org.apache.hudi.utilities.exception.HoodieSchemaPostProcessException;
23-
import org.apache.hudi.utilities.schema.DeleteSupportSchemaPostProcessor;
24-
import org.apache.hudi.utilities.schema.DropColumnSchemaPostProcessor;
23+
import org.apache.hudi.utilities.schema.postprocessor.add.AddPrimitiveColumnSchemaPostProcessor;
24+
import org.apache.hudi.utilities.schema.postprocessor.DeleteSupportSchemaPostProcessor;
25+
import org.apache.hudi.utilities.schema.postprocessor.DropColumnSchemaPostProcessor;
2526
import org.apache.hudi.utilities.schema.SchemaPostProcessor;
2627
import org.apache.hudi.utilities.schema.SchemaPostProcessor.Config;
2728
import org.apache.hudi.utilities.schema.SchemaProvider;
2829
import org.apache.hudi.utilities.schema.SparkAvroPostProcessor;
30+
import org.apache.hudi.utilities.schema.postprocessor.add.BaseSchemaPostProcessorConfig;
2931
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
3032
import org.apache.hudi.utilities.transform.FlatteningTransformer;
3133

3234
import org.apache.avro.Schema;
3335
import org.apache.avro.Schema.Type;
3436
import org.junit.jupiter.api.Assertions;
3537
import org.junit.jupiter.api.Test;
38+
import org.junit.jupiter.params.ParameterizedTest;
39+
import org.junit.jupiter.params.provider.Arguments;
40+
import org.junit.jupiter.params.provider.MethodSource;
3641

3742
import java.io.IOException;
3843
import java.util.ArrayList;
3944
import java.util.List;
45+
import java.util.stream.Stream;
4046

4147
import static org.junit.jupiter.api.Assertions.assertEquals;
48+
import static org.junit.jupiter.api.Assertions.assertNotEquals;
4249
import static org.junit.jupiter.api.Assertions.assertNotNull;
4350
import static org.junit.jupiter.api.Assertions.assertNull;
4451

@@ -55,13 +62,18 @@ public class TestSchemaPostProcessor extends UtilitiesTestBase {
5562
+ "{\"name\":\"_row_key\",\"type\":\"string\"},{\"name\":\"rider\",\"type\":\"string\"},{\"name\":\"driver\","
5663
+ "\"type\":\"string\"},{\"name\":\"fare\",\"type\":\"double\"}]}";
5764

65+
private static Stream<Arguments> configParams() {
66+
String[] types = {"bytes", "string", "int", "long", "float", "double", "boolean"};
67+
return Stream.of(types).map(Arguments::of);
68+
}
69+
5870
@Test
5971
public void testPostProcessor() throws IOException {
6072
properties.put(Config.SCHEMA_POST_PROCESSOR_PROP, DummySchemaPostProcessor.class.getName());
6173
SchemaProvider provider =
6274
UtilHelpers.wrapSchemaProviderWithPostProcessor(
63-
UtilHelpers.createSchemaProvider(DummySchemaProvider.class.getName(), properties, jsc),
64-
properties, jsc,null);
75+
UtilHelpers.createSchemaProvider(DummySchemaProvider.class.getName(), properties, jsc),
76+
properties, jsc, null);
6577

6678
Schema schema = provider.getSourceSchema();
6779
assertEquals(schema.getType(), Type.RECORD);
@@ -76,9 +88,9 @@ public void testSparkAvro() throws IOException {
7688
transformerClassNames.add(FlatteningTransformer.class.getName());
7789

7890
SchemaProvider provider =
79-
UtilHelpers.wrapSchemaProviderWithPostProcessor(
80-
UtilHelpers.createSchemaProvider(SparkAvroSchemaProvider.class.getName(), properties, jsc),
81-
properties, jsc, transformerClassNames);
91+
UtilHelpers.wrapSchemaProviderWithPostProcessor(
92+
UtilHelpers.createSchemaProvider(SparkAvroSchemaProvider.class.getName(), properties, jsc),
93+
properties, jsc, transformerClassNames);
8294

8395
Schema schema = provider.getSourceSchema();
8496
assertEquals(schema.getType(), Type.RECORD);
@@ -99,7 +111,7 @@ public void testDeleteSupport() {
99111
public void testChainedSchemaPostProcessor() {
100112
// DeleteSupportSchemaPostProcessor first, DummySchemaPostProcessor second
101113
properties.put(Config.SCHEMA_POST_PROCESSOR_PROP,
102-
"org.apache.hudi.utilities.schema.DeleteSupportSchemaPostProcessor,org.apache.hudi.utilities.DummySchemaPostProcessor");
114+
"org.apache.hudi.utilities.schema.postprocessor.DeleteSupportSchemaPostProcessor,org.apache.hudi.utilities.DummySchemaPostProcessor");
103115

104116
SchemaPostProcessor processor = UtilHelpers.createSchemaPostProcessor(properties.getString(Config.SCHEMA_POST_PROCESSOR_PROP), properties, jsc);
105117
Schema schema = new Schema.Parser().parse(ORIGINAL_SCHEMA);
@@ -111,7 +123,7 @@ public void testChainedSchemaPostProcessor() {
111123

112124
// DummySchemaPostProcessor first, DeleteSupportSchemaPostProcessor second
113125
properties.put(Config.SCHEMA_POST_PROCESSOR_PROP,
114-
"org.apache.hudi.utilities.DummySchemaPostProcessor,org.apache.hudi.utilities.schema.DeleteSupportSchemaPostProcessor");
126+
"org.apache.hudi.utilities.DummySchemaPostProcessor,org.apache.hudi.utilities.schema.postprocessor.DeleteSupportSchemaPostProcessor");
115127

116128
processor = UtilHelpers.createSchemaPostProcessor(properties.getString(Config.SCHEMA_POST_PROCESSOR_PROP), properties, jsc);
117129
schema = new Schema.Parser().parse(ORIGINAL_SCHEMA);
@@ -144,6 +156,32 @@ public void testDeleteColumnThrows() {
144156
Assertions.assertThrows(HoodieSchemaPostProcessException.class, () -> processor.processSchema(schema));
145157
}
146158

159+
@ParameterizedTest
160+
@MethodSource("configParams")
161+
public void testAddPrimitiveTypeColumn(String type) {
162+
properties.put(BaseSchemaPostProcessorConfig.SCHEMA_POST_PROCESSOR_ADD_COLUMN_NAME_PROP.key(), "primitive_column");
163+
properties.put(BaseSchemaPostProcessorConfig.SCHEMA_POST_PROCESSOR_ADD_COLUMN_TYPE_PROP.key(), type);
164+
properties.put(BaseSchemaPostProcessorConfig.SCHEMA_POST_PROCESSOR_ADD_COLUMN_DOC_PROP.key(), "primitive column test");
165+
166+
AddPrimitiveColumnSchemaPostProcessor processor = new AddPrimitiveColumnSchemaPostProcessor(properties, null);
167+
Schema schema = new Schema.Parser().parse(ORIGINAL_SCHEMA);
168+
Schema targetSchema = processor.processSchema(schema);
169+
170+
Schema.Field newColumn = targetSchema.getField("primitive_column");
171+
172+
assertNotNull(newColumn);
173+
assertEquals("primitive column test", newColumn.doc());
174+
// nullable by default, so new column is union type
175+
assertNotEquals(type, newColumn.schema().getType().getName());
176+
177+
// test not nullable
178+
properties.put(BaseSchemaPostProcessorConfig.SCHEMA_POST_PROCESSOR_ADD_COLUMN_NULLABLE_PROP.key(), false);
179+
targetSchema = processor.processSchema(schema);
180+
newColumn = targetSchema.getField("primitive_column");
181+
assertEquals(type, newColumn.schema().getType().getName());
182+
183+
}
184+
147185
@Test
148186
public void testSparkAvroSchema() throws IOException {
149187
SparkAvroPostProcessor processor = new SparkAvroPostProcessor(properties, null);

0 commit comments

Comments
 (0)