Skip to content

Commit 82b433d

Browse files
committed
[MINOR] follow up HUDI-3921, address all comments
1 parent c05a4e7 commit 82b433d

6 files changed

Lines changed: 89 additions & 43 deletions

File tree

hudi-common/src/main/java/org/apache/hudi/avro/HoodieAvroUtils.java

Lines changed: 28 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,10 @@
7272
import java.util.List;
7373
import java.util.Map;
7474
import java.util.Deque;
75-
import java.util.LinkedList;
75+
import java.util.ArrayDeque;
76+
import java.util.Spliterator;
77+
import java.util.Spliterators;
78+
import java.util.stream.StreamSupport;
7679
import java.util.TimeZone;
7780
import java.util.stream.Collectors;
7881

@@ -94,6 +97,10 @@ public class HoodieAvroUtils {
9497
//Export for test
9598
public static final Conversions.DecimalConversion DECIMAL_CONVERSION = new Conversions.DecimalConversion();
9699

100+
// Name of ArrayType/MapType for avro Schema
101+
private static final String ARRAY_TYPE_ELEMENT_NAME = "element";
102+
private static final String MAP_TYPE_VALUE_NAME = "value";
103+
97104
// As per https://avro.apache.org/docs/current/spec.html#names
98105
private static final String INVALID_AVRO_CHARS_IN_NAMES = "[^A-Za-z0-9_]";
99106
private static final String INVALID_AVRO_FIRST_CHAR_IN_NAMES = "[^A-Za-z_]";
@@ -410,7 +417,7 @@ public static GenericRecord rewriteRecordWithMetadata(GenericRecord genericRecor
410417

411418
// TODO Unify the logical of rewriteRecordWithMetadata and rewriteEvolutionRecordWithMetadata, and delete this function.
412419
public static GenericRecord rewriteEvolutionRecordWithMetadata(GenericRecord genericRecord, Schema newSchema, String fileName) {
413-
GenericRecord newRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(genericRecord, newSchema, new HashMap<>());
420+
GenericRecord newRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(genericRecord, newSchema, Collections.emptyMap());
414421
// do not preserve FILENAME_METADATA_FIELD
415422
newRecord.put(HoodieRecord.FILENAME_METADATA_FIELD_POS, fileName);
416423
return newRecord;
@@ -745,7 +752,7 @@ public static Object getRecordColumnValues(HoodieRecord<? extends HoodieRecordPa
745752
* @return newRecord for new Schema
746753
*/
747754
public static GenericRecord rewriteRecordWithNewSchema(IndexedRecord oldRecord, Schema newSchema, Map<String, String> renameCols) {
748-
Object newRecord = rewriteRecordWithNewSchema(oldRecord, oldRecord.getSchema(), newSchema, renameCols, new LinkedList<>());
755+
Object newRecord = rewriteRecordWithNewSchema(oldRecord, oldRecord.getSchema(), newSchema, renameCols, new ArrayDeque<>());
749756
return (GenericData.Record) newRecord;
750757
}
751758

@@ -773,39 +780,32 @@ private static Object rewriteRecordWithNewSchema(Object oldRecord, Schema oldSch
773780
}
774781
IndexedRecord indexedRecord = (IndexedRecord) oldRecord;
775782
List<Schema.Field> fields = newSchema.getFields();
776-
Map<Integer, Object> helper = new HashMap<>();
777-
783+
GenericData.Record newRecord = new GenericData.Record(newSchema);
778784
for (int i = 0; i < fields.size(); i++) {
779785
Schema.Field field = fields.get(i);
780786
String fieldName = field.name();
781787
fieldNames.push(fieldName);
782788
if (oldSchema.getField(field.name()) != null) {
783789
Schema.Field oldField = oldSchema.getField(field.name());
784-
helper.put(i, rewriteRecordWithNewSchema(indexedRecord.get(oldField.pos()), oldField.schema(), fields.get(i).schema(), renameCols, fieldNames));
790+
newRecord.put(i, rewriteRecordWithNewSchema(indexedRecord.get(oldField.pos()), oldField.schema(), fields.get(i).schema(), renameCols, fieldNames));
785791
} else {
786792
String fieldFullName = createFullName(fieldNames);
787-
String[] colNamePartsFromOldSchema = renameCols.getOrDefault(fieldFullName, "").split("\\.");
788-
String lastColNameFromOldSchema = colNamePartsFromOldSchema[colNamePartsFromOldSchema.length - 1];
793+
String fieldNameFromOldSchema = renameCols.getOrDefault(fieldFullName, "");
789794
// deal with rename
790-
if (oldSchema.getField(field.name()) == null && oldSchema.getField(lastColNameFromOldSchema) != null) {
795+
if (oldSchema.getField(field.name()) == null && oldSchema.getField(fieldNameFromOldSchema) != null) {
791796
// find rename
792-
Schema.Field oldField = oldSchema.getField(lastColNameFromOldSchema);
793-
helper.put(i, rewriteRecordWithNewSchema(indexedRecord.get(oldField.pos()), oldField.schema(), fields.get(i).schema(), renameCols, fieldNames));
794-
}
795-
}
796-
fieldNames.pop();
797-
}
798-
GenericData.Record newRecord = new GenericData.Record(newSchema);
799-
for (int i = 0; i < fields.size(); i++) {
800-
if (helper.containsKey(i)) {
801-
newRecord.put(i, helper.get(i));
802-
} else {
803-
if (fields.get(i).defaultVal() instanceof JsonProperties.Null) {
804-
newRecord.put(i, null);
797+
Schema.Field oldField = oldSchema.getField(fieldNameFromOldSchema);
798+
newRecord.put(i, rewriteRecordWithNewSchema(indexedRecord.get(oldField.pos()), oldField.schema(), fields.get(i).schema(), renameCols, fieldNames));
805799
} else {
806-
newRecord.put(i, fields.get(i).defaultVal());
800+
// deal with default value
801+
if (fields.get(i).defaultVal() instanceof JsonProperties.Null) {
802+
newRecord.put(i, null);
803+
} else {
804+
newRecord.put(i, fields.get(i).defaultVal());
805+
}
807806
}
808807
}
808+
fieldNames.pop();
809809
}
810810
return newRecord;
811811
case ARRAY:
@@ -814,7 +814,7 @@ private static Object rewriteRecordWithNewSchema(Object oldRecord, Schema oldSch
814814
}
815815
Collection array = (Collection)oldRecord;
816816
List<Object> newArray = new ArrayList();
817-
fieldNames.push("element");
817+
fieldNames.push(ARRAY_TYPE_ELEMENT_NAME);
818818
for (Object element : array) {
819819
newArray.add(rewriteRecordWithNewSchema(element, oldSchema.getElementType(), newSchema.getElementType(), renameCols, fieldNames));
820820
}
@@ -826,7 +826,7 @@ private static Object rewriteRecordWithNewSchema(Object oldRecord, Schema oldSch
826826
}
827827
Map<Object, Object> map = (Map<Object, Object>) oldRecord;
828828
Map<Object, Object> newMap = new HashMap<>();
829-
fieldNames.push("value");
829+
fieldNames.push(MAP_TYPE_VALUE_NAME);
830830
for (Map.Entry<Object, Object> entry : map.entrySet()) {
831831
newMap.put(entry.getKey(), rewriteRecordWithNewSchema(entry.getValue(), oldSchema.getValueType(), newSchema.getValueType(), renameCols, fieldNames));
832832
}
@@ -840,13 +840,9 @@ private static Object rewriteRecordWithNewSchema(Object oldRecord, Schema oldSch
840840
}
841841

842842
private static String createFullName(Deque<String> fieldNames) {
843-
String result = "";
844-
if (!fieldNames.isEmpty()) {
845-
List<String> parentNames = new ArrayList<>();
846-
fieldNames.descendingIterator().forEachRemaining(parentNames::add);
847-
result = parentNames.stream().collect(Collectors.joining("."));
848-
}
849-
return result;
843+
return StreamSupport
844+
.stream(Spliterators.spliteratorUnknownSize(fieldNames.descendingIterator(), Spliterator.ORDERED), false)
845+
.collect(Collectors.joining("."));
850846
}
851847

852848
private static Object rewritePrimaryType(Object oldValue, Schema oldSchema, Schema newSchema) {

hudi-common/src/main/java/org/apache/hudi/common/table/log/AbstractHoodieLogRecordReader.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
import java.io.IOException;
5858
import java.util.ArrayDeque;
5959
import java.util.Arrays;
60+
import java.util.Collections;
6061
import java.util.Deque;
6162
import java.util.HashMap;
6263
import java.util.HashSet;
@@ -380,7 +381,7 @@ private void processDataBlock(HoodieDataBlock dataBlock, Option<KeySpec> keySpec
380381
Option<Schema> schemaOption = getMergedSchema(dataBlock);
381382
while (recordIterator.hasNext()) {
382383
IndexedRecord currentRecord = recordIterator.next();
383-
IndexedRecord record = schemaOption.isPresent() ? HoodieAvroUtils.rewriteRecordWithNewSchema(currentRecord, schemaOption.get(), new HashMap<>()) : currentRecord;
384+
IndexedRecord record = schemaOption.isPresent() ? HoodieAvroUtils.rewriteRecordWithNewSchema(currentRecord, schemaOption.get(), Collections.emptyMap()) : currentRecord;
384385
processNextRecord(createHoodieRecord(record, this.hoodieTableMetaClient.getTableConfig(), this.payloadClassFQN,
385386
this.preCombineField, this.withOperationField, this.simpleKeyGenFields, this.partitionName));
386387
totalLogRecords.incrementAndGet();

hudi-common/src/main/java/org/apache/hudi/internal/schema/action/InternalSchemaMerger.java

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,7 @@ public InternalSchemaMerger(InternalSchema fileSchema, InternalSchema querySchem
6868
}
6969

7070
public InternalSchemaMerger(InternalSchema fileSchema, InternalSchema querySchema, boolean ignoreRequiredAttribute, boolean useColumnTypeFromFileSchema) {
71-
this.fileSchema = fileSchema;
72-
this.querySchema = querySchema;
73-
this.ignoreRequiredAttribute = ignoreRequiredAttribute;
74-
this.useColumnTypeFromFileSchema = useColumnTypeFromFileSchema;
71+
this(fileSchema, querySchema, ignoreRequiredAttribute, useColumnTypeFromFileSchema, true);
7572
}
7673

7774
/**
@@ -151,14 +148,15 @@ private Types.Field dealWithRename(int fieldId, Type newType, Types.Field oldFie
151148
Types.Field fieldFromFileSchema = fileSchema.findField(fieldId);
152149
String nameFromFileSchema = fieldFromFileSchema.name();
153150
String nameFromQuerySchema = querySchema.findField(fieldId).name();
151+
String finalFieldName = useColNameFromFileSchema ? nameFromFileSchema : nameFromQuerySchema;
154152
Type typeFromFileSchema = fieldFromFileSchema.type();
155153
// Current design mechanism guarantees nestedType change is not allowed, so no need to consider.
156154
if (newType.isNestedType()) {
157155
return Types.Field.get(oldField.fieldId(), oldField.isOptional(),
158-
useColNameFromFileSchema ? nameFromFileSchema : nameFromQuerySchema, newType, oldField.doc());
156+
finalFieldName, newType, oldField.doc());
159157
} else {
160158
return Types.Field.get(oldField.fieldId(), oldField.isOptional(),
161-
useColNameFromFileSchema ? nameFromFileSchema : nameFromQuerySchema, useColumnTypeFromFileSchema ? typeFromFileSchema : newType, oldField.doc());
159+
finalFieldName, useColumnTypeFromFileSchema ? typeFromFileSchema : newType, oldField.doc());
162160
}
163161
}
164162

hudi-common/src/main/java/org/apache/hudi/internal/schema/utils/InternalSchemaUtils.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -273,14 +273,17 @@ public static String createFullName(String name, Deque<String> fieldNames) {
273273
*
274274
* @param oldSchema oldSchema
275275
* @param newSchema newSchema which modified from oldSchema
276-
* @return renameCols Map. (k, v) -> (colNameFromNewSchema, colNameFromOldSchema)
276+
* @return renameCols Map. (k, v) -> (colNameFromNewSchema, colNameLastPartFromOldSchema)
277277
*/
278278
public static Map<String, String> collectRenameCols(InternalSchema oldSchema, InternalSchema newSchema) {
279279
List<String> colNamesFromWriteSchema = oldSchema.getAllColsFullName();
280280
return colNamesFromWriteSchema.stream().filter(f -> {
281281
int filedIdFromWriteSchema = oldSchema.findIdByName(f);
282282
// try to find the cols which has the same id, but have different colName;
283283
return newSchema.getAllIds().contains(filedIdFromWriteSchema) && !newSchema.findfullName(filedIdFromWriteSchema).equalsIgnoreCase(f);
284-
}).collect(Collectors.toMap(e -> newSchema.findfullName(oldSchema.findIdByName(e)), e -> e));
284+
}).collect(Collectors.toMap(e -> newSchema.findfullName(oldSchema.findIdByName(e)), e -> {
285+
int lastDotIndex = e.lastIndexOf(".");
286+
return e.substring(lastDotIndex == -1 ? 0 : lastDotIndex + 1);
287+
}));
285288
}
286289
}

hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.apache.avro.generic.GenericData;
2828
import org.apache.avro.generic.GenericRecord;
2929
import org.junit.jupiter.api.Test;
30+
import org.junit.jupiter.api.Assertions;
3031

3132
import java.io.IOException;
3233
import java.math.BigDecimal;
@@ -35,6 +36,7 @@
3536
import java.util.Arrays;
3637
import java.util.List;
3738
import java.util.Map;
39+
import java.util.HashMap;
3840

3941
import static org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldSchemaFromWriteSchema;
4042
import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -98,6 +100,12 @@ public class TestHoodieAvroUtils {
98100
+ "{\"name\":\"student\",\"type\":{\"name\":\"student\",\"type\":\"record\",\"fields\":["
99101
+ "{\"name\":\"firstname\",\"type\":[\"null\" ,\"string\"],\"default\": null},{\"name\":\"lastname\",\"type\":[\"null\" ,\"string\"],\"default\": null}]}}]}";
100102

103+
private static String SCHEMA_WITH_NESTED_FIELD_RENAMED = "{\"name\":\"MyClass\",\"type\":\"record\",\"namespace\":\"com.acme.avro\",\"fields\":["
104+
+ "{\"name\":\"fn\",\"type\":\"string\"},"
105+
+ "{\"name\":\"ln\",\"type\":\"string\"},"
106+
+ "{\"name\":\"ss\",\"type\":{\"name\":\"ss\",\"type\":\"record\",\"fields\":["
107+
+ "{\"name\":\"fn\",\"type\":[\"null\" ,\"string\"],\"default\": null},{\"name\":\"ln\",\"type\":[\"null\" ,\"string\"],\"default\": null}]}}]}";
108+
101109
@Test
102110
public void testPropsPresent() {
103111
Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(EXAMPLE_SCHEMA));
@@ -342,4 +350,26 @@ public void testGetNestedFieldSchema() throws IOException {
342350
assertEquals(Schema.create(Schema.Type.STRING), getNestedFieldSchemaFromWriteSchema(rec3.getSchema(), "student.firstname"));
343351
assertEquals(Schema.create(Schema.Type.STRING), getNestedFieldSchemaFromWriteSchema(nestedSchema, "student.firstname"));
344352
}
353+
354+
@Test
355+
public void testReWriteAvroRecordWithNewSchema() {
356+
Schema nestedSchema = new Schema.Parser().parse(SCHEMA_WITH_NESTED_FIELD);
357+
GenericRecord rec3 = new GenericData.Record(nestedSchema);
358+
rec3.put("firstname", "person1");
359+
rec3.put("lastname", "person2");
360+
GenericRecord studentRecord = new GenericData.Record(rec3.getSchema().getField("student").schema());
361+
studentRecord.put("firstname", "person1");
362+
studentRecord.put("lastname", "person2");
363+
rec3.put("student", studentRecord);
364+
365+
Schema nestedSchemaRename = new Schema.Parser().parse(SCHEMA_WITH_NESTED_FIELD_RENAMED);
366+
Map<String, String> colRenames = new HashMap<>();
367+
colRenames.put("fn", "firstname");
368+
colRenames.put("ln", "lastname");
369+
colRenames.put("ss", "student");
370+
colRenames.put("ss.fn", "firstname");
371+
colRenames.put("ss.ln", "lastname");
372+
GenericRecord studentRecordRename = HoodieAvroUtils.rewriteRecordWithNewSchema(rec3, nestedSchemaRename, colRenames);
373+
Assertions.assertEquals(GenericData.get().validate(nestedSchemaRename, studentRecordRename), true);
374+
}
345375
}

hudi-common/src/test/java/org/apache/hudi/internal/schema/utils/TestAvroSchemaEvolutionUtils.java

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import java.nio.ByteBuffer;
3939
import java.util.ArrayList;
4040
import java.util.Arrays;
41+
import java.util.Collections;
4142
import java.util.HashMap;
4243
import java.util.List;
4344
import java.util.Map;
@@ -284,7 +285,7 @@ public void testReWriteRecordWithTypeChanged() {
284285
.updateColumnType("col6", Types.StringType.get());
285286
InternalSchema newSchema = SchemaChangeUtils.applyTableChanges2Schema(internalSchema, updateChange);
286287
Schema newAvroSchema = AvroInternalSchemaConverter.convert(newSchema, avroSchema.getName());
287-
GenericRecord newRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(avroRecord, newAvroSchema, new HashMap<>());
288+
GenericRecord newRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(avroRecord, newAvroSchema, Collections.emptyMap());
288289

289290
Assertions.assertEquals(GenericData.get().validate(newAvroSchema, newRecord), true);
290291
}
@@ -349,9 +350,26 @@ public void testReWriteNestRecord() {
349350
);
350351

351352
Schema newAvroSchema = AvroInternalSchemaConverter.convert(newRecord, schema.getName());
352-
GenericRecord newAvroRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(avroRecord, newAvroSchema, new HashMap<>());
353+
GenericRecord newAvroRecord = HoodieAvroUtils.rewriteRecordWithNewSchema(avroRecord, newAvroSchema, Collections.emptyMap());
353354
// test the correctness of the rewrite
354355
Assertions.assertEquals(GenericData.get().validate(newAvroSchema, newAvroRecord), true);
356+
357+
// test rewrite with rename
358+
InternalSchema internalSchema = AvroInternalSchemaConverter.convert(schema);
359+
// do change rename operation
360+
TableChanges.ColumnUpdateChange updateChange = TableChanges.ColumnUpdateChange.get(internalSchema);
361+
updateChange
362+
.renameColumn("id", "idx")
363+
.renameColumn("data", "datax")
364+
.renameColumn("preferences.feature1", "f1")
365+
.renameColumn("preferences.feature2", "f2")
366+
.renameColumn("locations.value.lat", "lt");
367+
InternalSchema internalSchemaRename = SchemaChangeUtils.applyTableChanges2Schema(internalSchema, updateChange);
368+
Schema avroSchemaRename = AvroInternalSchemaConverter.convert(internalSchemaRename, schema.getName());
369+
Map<String, String> renameCols = InternalSchemaUtils.collectRenameCols(internalSchema, internalSchemaRename);
370+
GenericRecord avroRecordRename = HoodieAvroUtils.rewriteRecordWithNewSchema(avroRecord, avroSchemaRename, renameCols);
371+
// test the correctly of rewrite
372+
Assertions.assertEquals(GenericData.get().validate(avroSchemaRename, avroRecordRename), true);
355373
}
356374

357375
@Test

0 commit comments

Comments
 (0)