Skip to content

Commit 93c09f2

Browse files
parisni and xushiyan authored
[HUDI-4781] Allow omit metadata fields for hive sync (#6471)
Co-authored-by: Raymond Xu <2701446+xushiyan@users.noreply.github.com>
1 parent c8c1ee4 commit 93c09f2

6 files changed

Lines changed: 39 additions & 1 deletion

File tree

hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,15 @@ public MessageType getTableParquetSchema() throws Exception {
171171
return convertAvroSchemaToParquet(getTableAvroSchema(true));
172172
}
173173

174+
/**
175+
* Gets the schema of a hoodie table in Parquet format, with or without the metadata fields.
176+
*
* <p>Resolves the Avro schema through {@code getTableAvroSchema(includeMetadataField)} and
* converts the result with {@code convertAvroSchemaToParquet}.
*
* @param includeMetadataField forwarded to {@code getTableAvroSchema}; true to include the
*        metadata fields in the returned schema
177+
* @return Parquet schema for the table
* @throws Exception if the underlying schema resolution or conversion fails
178+
*/
179+
public MessageType getTableParquetSchema(boolean includeMetadataField) throws Exception {
180+
return convertAvroSchemaToParquet(getTableAvroSchema(includeMetadataField));
181+
}
182+
174183
/**
175184
* Gets users data schema for a hoodie table in Avro format.
176185
*

hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfig.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ public class HiveSyncConfig extends HoodieSyncConfig {
5858
public static final ConfigProperty<String> HIVE_SYNC_AS_DATA_SOURCE_TABLE = HiveSyncConfigHolder.HIVE_SYNC_AS_DATA_SOURCE_TABLE;
5959
public static final ConfigProperty<Integer> HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD = HiveSyncConfigHolder.HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD;
6060
public static final ConfigProperty<Boolean> HIVE_CREATE_MANAGED_TABLE = HiveSyncConfigHolder.HIVE_CREATE_MANAGED_TABLE;
61+
public static final ConfigProperty<Boolean> HIVE_SYNC_OMIT_METADATA_FIELDS = HiveSyncConfigHolder.HIVE_SYNC_OMIT_METADATA_FIELDS;
6162
public static final ConfigProperty<Integer> HIVE_BATCH_SYNC_PARTITION_NUM = HiveSyncConfigHolder.HIVE_BATCH_SYNC_PARTITION_NUM;
6263
public static final ConfigProperty<String> HIVE_SYNC_MODE = HiveSyncConfigHolder.HIVE_SYNC_MODE;
6364
public static final ConfigProperty<Boolean> HIVE_SYNC_BUCKET_SYNC = HiveSyncConfigHolder.HIVE_SYNC_BUCKET_SYNC;
@@ -130,6 +131,8 @@ public static class HiveSyncConfigParams {
130131
public Boolean supportTimestamp;
131132
@Parameter(names = {"--managed-table"}, description = "Create a managed table")
132133
public Boolean createManagedTable;
134+
@Parameter(names = {"--omit-metafields"}, description = "Omit metafields in schema")
135+
public Boolean omitMetaFields;
133136
@Parameter(names = {"--batch-sync-num"}, description = "The number of partitions one batch when synchronous partitions to hive")
134137
public Integer batchSyncNum;
135138
@Parameter(names = {"--spark-datasource"}, description = "Whether sync this table as spark data source table.")
@@ -167,6 +170,7 @@ public TypedProperties toProps() {
167170
props.setPropertyIfNonNull(HIVE_SYNC_AS_DATA_SOURCE_TABLE.key(), syncAsSparkDataSourceTable);
168171
props.setPropertyIfNonNull(HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD.key(), sparkSchemaLengthThreshold);
169172
props.setPropertyIfNonNull(HIVE_CREATE_MANAGED_TABLE.key(), createManagedTable);
173+
props.setPropertyIfNonNull(HIVE_SYNC_OMIT_METADATA_FIELDS.key(), omitMetaFields);
170174
props.setPropertyIfNonNull(HIVE_BATCH_SYNC_PARTITION_NUM.key(), batchSyncNum);
171175
props.setPropertyIfNonNull(HIVE_SYNC_BUCKET_SYNC.key(), bucketSync);
172176
props.setPropertyIfNonNull(HIVE_SYNC_BUCKET_SYNC_SPEC.key(), bucketSpec);

hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncConfigHolder.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,11 @@ public class HiveSyncConfigHolder {
103103
.key("hoodie.datasource.hive_sync.create_managed_table")
104104
.defaultValue(false)
105105
.withDocumentation("Whether to sync the table as managed table.");
106+
public static final ConfigProperty<Boolean> HIVE_SYNC_OMIT_METADATA_FIELDS = ConfigProperty
107+
.key("hoodie.datasource.hive_sync.omit_metadata_fields")
108+
.defaultValue(false)
109+
.sinceVersion("0.13.0")
110+
.withDocumentation("Whether to omit the hoodie metadata fields in the target table.");
106111
public static final ConfigProperty<Integer> HIVE_BATCH_SYNC_PARTITION_NUM = ConfigProperty
107112
.key("hoodie.datasource.hive_sync.batch_num")
108113
.defaultValue(1000)

hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/HiveSyncTool.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848

4949
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_AUTO_CREATE_DATABASE;
5050
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_IGNORE_EXCEPTIONS;
51+
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_OMIT_METADATA_FIELDS;
5152
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE;
5253
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SUPPORT_TIMESTAMP_TYPE;
5354
import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_AS_DATA_SOURCE_TABLE;
@@ -201,7 +202,8 @@ protected void syncHoodieTable(String tableName, boolean useRealtimeInputFormat,
201202
boolean tableExists = syncClient.tableExists(tableName);
202203

203204
// Get the parquet schema for this table looking at the latest commit
204-
MessageType schema = syncClient.getStorageSchema();
205+
MessageType schema = syncClient.getStorageSchema(!config.getBoolean(HIVE_SYNC_OMIT_METADATA_FIELDS));
206+
205207

206208
// Currently HoodieBootstrapRelation does not support reading bootstrap MOR rt table,
207209
// so we disable the syncAsSparkDataSourceTable here to avoid read such kind table

hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieMetaSyncOperations.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,15 @@ default MessageType getStorageSchema() {
124124
return null;
125125
}
126126

127+
/**
128+
* Get the schema from the Hudi table on storage.
129+
*
* <p>This default implementation ignores the argument and returns {@code null}.
*
130+
* @param includeMetadataField true to include metadata fields in the schema
* @return the storage schema; {@code null} in this default implementation
131+
*/
132+
default MessageType getStorageSchema(boolean includeMetadataField) {
133+
return null;
134+
}
135+
127136
/**
128137
* Update schema for the table in the metastore.
129138
*/

hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncClient.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,15 @@ public MessageType getStorageSchema() {
104104
}
105105
}
106106

107+
/**
* Reads the table's Parquet schema from storage through a {@code TableSchemaResolver} built on
* {@code metaClient}.
*
* @param includeMetadataField forwarded to {@code TableSchemaResolver#getTableParquetSchema(boolean)};
*        true to include metadata fields in the schema
* @return the Parquet schema returned by the resolver
* @throws HoodieSyncException wrapping any exception raised while resolving the schema
*/
@Override
108+
public MessageType getStorageSchema(boolean includeMetadataField) {
109+
try {
110+
return new TableSchemaResolver(metaClient).getTableParquetSchema(includeMetadataField);
111+
} catch (Exception e) {
112+
// Rethrow as the sync-specific exception, keeping the original exception as the cause.
throw new HoodieSyncException("Failed to read schema from storage.", e);
113+
}
114+
}
115+
107116
public List<String> getWrittenPartitionsSince(Option<String> lastCommitTimeSynced) {
108117
if (!lastCommitTimeSynced.isPresent()) {
109118
LOG.info("Last commit time synced is not known, listing all partitions in "

0 commit comments

Comments
 (0)