Skip to content

Commit 31e13db

Browse files
authored
[HUDI-4023] Decouple hudi-spark from hudi-utilities-slim-bundle (#5641)
1 parent 98c5c6c commit 31e13db

4 files changed

Lines changed: 109 additions & 132 deletions

File tree

hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -605,8 +605,6 @@ private Pair<Option<String>, JavaRDD<WriteStatus>> writeToSink(JavaRDD<HoodieRec
605605
long totalErrorRecords = writeStatusRDD.mapToDouble(WriteStatus::getTotalErrorRecords).sum().longValue();
606606
long totalRecords = writeStatusRDD.mapToDouble(WriteStatus::getTotalRecords).sum().longValue();
607607
boolean hasErrors = totalErrorRecords > 0;
608-
long hiveSyncTimeMs = 0;
609-
long metaSyncTimeMs = 0;
610608
if (!hasErrors || cfg.commitOnErrors) {
611609
HashMap<String, String> checkpointCommitMetadata = new HashMap<>();
612610
if (checkpointStr != null) {

packaging/hudi-utilities-slim-bundle/README.md

Lines changed: 86 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,89 @@
1717

1818
# Usage of hudi-utilities-slim-bundle
1919

20-
Starting from versions 0.11, Hudi provides hudi-utilities-slim-bundle which excludes hudi-spark-datasource modules.
21-
This new bundle is intended to be used with Hudi Spark bundle together, if using hudi-utilities-bundle solely
22-
introduces problems for a specific Spark version.
20+
Starting from version 0.11, Hudi provides hudi-utilities-slim-bundle, which excludes the hudi-spark-datasource modules. This new bundle is intended to be used together with the Hudi Spark bundle when using
21+
hudi-utilities-bundle alone causes problems for a specific Spark version.
22+
23+
## Example with Spark 2.4.7
24+
25+
* Build Hudi: `mvn clean install -DskipTests`
26+
* Run deltastreamer
27+
28+
```
29+
bin/spark-submit \
30+
--driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \
31+
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
32+
--conf spark.sql.catalogImplementation=hive \
33+
--conf spark.driver.maxResultSize=1g \
34+
--conf spark.ui.port=6679 \
35+
--packages org.apache.spark:spark-avro_2.11:2.4.7 \
36+
--jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark-bundle_2.11-0.12.0-SNAPSHOT.jar \
37+
--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls /path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.11-0.12.0-SNAPSHOT.jar` \
38+
--props `ls /path/to/hudi/dfs-source.properties` \
39+
--source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
40+
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
41+
--source-ordering-field tpep_dropoff_datetime \
42+
--table-type COPY_ON_WRITE \
43+
--target-base-path file:///tmp/hudi-ny-taxi-spark24/ \
44+
--target-table ny_hudi_tbl \
45+
--op UPSERT \
46+
--continuous \
47+
--source-limit 5000000 \
48+
--min-sync-interval-seconds 60
49+
```
50+
51+
## Example with Spark 3.1.2
52+
53+
* Build Hudi: `mvn clean install -DskipTests -Dspark3.1 -Dscala-2.12`
54+
* Run deltastreamer
55+
56+
```
57+
bin/spark-submit \
58+
--driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \
59+
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
60+
--conf spark.sql.catalogImplementation=hive \
61+
--conf spark.driver.maxResultSize=1g \
62+
--conf spark.ui.port=6679 \
63+
--packages org.apache.spark:spark-avro_2.12:3.1.2 \
64+
--jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.1-bundle_2.12-0.12.0-SNAPSHOT.jar \
65+
--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls /path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-0.12.0-SNAPSHOT.jar` \
66+
--props `ls /path/to/hudi/dfs-source.properties` \
67+
--source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
68+
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
69+
--source-ordering-field tpep_dropoff_datetime \
70+
--table-type COPY_ON_WRITE \
71+
--target-base-path file:///tmp/hudi-ny-taxi-spark31/ \
72+
--target-table ny_hudi_tbl \
73+
--op UPSERT \
74+
--continuous \
75+
--source-limit 5000000 \
76+
--min-sync-interval-seconds 60
77+
```
78+
79+
## Example with Spark 3.2.0
80+
81+
* Build Hudi: `mvn clean install -DskipTests -Dspark3.2 -Dscala-2.12`
82+
* Run deltastreamer
83+
84+
```
85+
bin/spark-submit \
86+
--driver-memory 4g --executor-memory 2g --num-executors 3 --executor-cores 1 \
87+
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
88+
--conf spark.sql.catalogImplementation=hive \
89+
--conf spark.driver.maxResultSize=1g \
90+
--conf spark.ui.port=6679 \
91+
--packages org.apache.spark:spark-avro_2.12:3.2.0 \
92+
--jars /path/to/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.2-bundle_2.12-0.12.0-SNAPSHOT.jar \
93+
--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer `ls /path/to/hudi/packaging/hudi-utilities-slim-bundle/target/hudi-utilities-slim-bundle_2.12-0.12.0-SNAPSHOT.jar` \
94+
--props `ls /path/to/hudi/dfs-source.properties` \
95+
--source-class org.apache.hudi.utilities.sources.ParquetDFSSource \
96+
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
97+
--source-ordering-field tpep_dropoff_datetime \
98+
--table-type COPY_ON_WRITE \
99+
--target-base-path file:///tmp/hudi-ny-taxi-spark32/ \
100+
--target-table ny_hudi_tbl \
101+
--op UPSERT \
102+
--continuous \
103+
--source-limit 5000000 \
104+
--min-sync-interval-seconds 60
105+
```

packaging/hudi-utilities-slim-bundle/pom.xml

Lines changed: 16 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@
7777
<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer">
7878
</transformer>
7979
<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer">
80-
<addHeader>true</addHeader>
80+
<addHeader>true</addHeader>
8181
</transformer>
8282
<transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
8383
<resource>META-INF/LICENSE</resource>
@@ -92,10 +92,7 @@
9292
<includes>
9393
<include>org.apache.hudi:hudi-common</include>
9494
<include>org.apache.hudi:hudi-client-common</include>
95-
<include>org.apache.hudi:hudi-spark-client</include>
9695
<include>org.apache.hudi:hudi-utilities_${scala.binary.version}</include>
97-
<include>org.apache.hudi:hudi-hive-sync</include>
98-
<include>org.apache.hudi:hudi-sync-common</include>
9996
<include>org.apache.hudi:hudi-hadoop-mr</include>
10097
<include>org.apache.hudi:hudi-timeline-service</include>
10198
<include>org.apache.hudi:hudi-aws</include>
@@ -136,13 +133,6 @@
136133
<include>org.apache.kafka:kafka_${scala.binary.version}</include>
137134
<include>com.101tec:zkclient</include>
138135
<include>org.apache.kafka:kafka-clients</include>
139-
140-
<include>org.apache.hive:hive-common</include>
141-
<include>org.apache.hive:hive-service</include>
142-
<include>org.apache.hive:hive-service-rpc</include>
143-
<include>org.apache.hive:hive-metastore</include>
144-
<include>org.apache.hive:hive-jdbc</include>
145-
146136
<include>org.apache.hbase:hbase-client</include>
147137
<include>org.apache.hbase:hbase-common</include>
148138
<include>org.apache.hbase:hbase-hadoop-compat</include>
@@ -178,10 +168,6 @@
178168
<pattern>com.beust.jcommander.</pattern>
179169
<shadedPattern>org.apache.hudi.com.beust.jcommander.</shadedPattern>
180170
</relocation>
181-
<relocation>
182-
<pattern>org.apache.hive.jdbc.</pattern>
183-
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hive.jdbc.</shadedPattern>
184-
</relocation>
185171
<relocation>
186172
<pattern>org.apache.commons.io.</pattern>
187173
<shadedPattern>org.apache.hudi.org.apache.commons.io.</shadedPattern>
@@ -205,10 +191,6 @@
205191
<pattern>org.apache.hadoop.hive.metastore.</pattern>
206192
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.metastore.</shadedPattern>
207193
</relocation>
208-
<relocation>
209-
<pattern>org.apache.hive.common.</pattern>
210-
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hive.common.</shadedPattern>
211-
</relocation>
212194
<relocation>
213195
<pattern>org.apache.hadoop.hive.common.</pattern>
214196
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.common.</shadedPattern>
@@ -217,10 +199,6 @@
217199
<pattern>org.apache.hadoop.hive.conf.</pattern>
218200
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.conf.</shadedPattern>
219201
</relocation>
220-
<relocation>
221-
<pattern>org.apache.hive.service.</pattern>
222-
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hive.service.</shadedPattern>
223-
</relocation>
224202
<relocation>
225203
<pattern>org.apache.hadoop.hive.service.</pattern>
226204
<shadedPattern>${utilities.bundle.hive.shade.prefix}org.apache.hadoop.hive.service.</shadedPattern>
@@ -344,116 +322,27 @@
344322
</dependency>
345323
<dependency>
346324
<groupId>org.apache.hudi</groupId>
347-
<artifactId>hudi-client-common</artifactId>
348-
<version>${project.version}</version>
349-
</dependency>
350-
<dependency>
351-
<groupId>org.apache.hudi</groupId>
352-
<artifactId>hudi-spark-client</artifactId>
353-
<version>${project.version}</version>
354-
</dependency>
355-
<dependency>
356-
<groupId>org.apache.hudi</groupId>
357-
<artifactId>hudi-hive-sync</artifactId>
325+
<artifactId>hudi-utilities_${scala.binary.version}</artifactId>
358326
<version>${project.version}</version>
359327
<exclusions>
360328
<exclusion>
361-
<groupId>javax.servlet</groupId>
362-
<artifactId>servlet-api</artifactId>
329+
<groupId>org.apache.hudi</groupId>
330+
<artifactId>hudi-spark-common_${scala.binary.version}</artifactId>
331+
</exclusion>
332+
<exclusion>
333+
<groupId>org.apache.hudi</groupId>
334+
<artifactId>hudi-spark_${scala.binary.version}</artifactId>
335+
</exclusion>
336+
<exclusion>
337+
<groupId>org.apache.hudi</groupId>
338+
<artifactId>${hudi.spark.module}_${scala.binary.version}</artifactId>
339+
</exclusion>
340+
<exclusion>
341+
<groupId>org.apache.hudi</groupId>
342+
<artifactId>${hudi.spark.common.module}</artifactId>
363343
</exclusion>
364344
</exclusions>
365345
</dependency>
366-
<dependency>
367-
<groupId>org.apache.hudi</groupId>
368-
<artifactId>hudi-spark-common_${scala.binary.version}</artifactId>
369-
<version>${project.version}</version>
370-
<scope>provided</scope>
371-
</dependency>
372-
<dependency>
373-
<groupId>org.apache.hudi</groupId>
374-
<artifactId>hudi-spark_${scala.binary.version}</artifactId>
375-
<version>${project.version}</version>
376-
<scope>provided</scope>
377-
</dependency>
378-
<dependency>
379-
<groupId>org.apache.hudi</groupId>
380-
<artifactId>${hudi.spark.module}_${scala.binary.version}</artifactId>
381-
<version>${project.version}</version>
382-
<scope>provided</scope>
383-
</dependency>
384-
<dependency>
385-
<groupId>org.apache.hudi</groupId>
386-
<artifactId>${hudi.spark.common.module}</artifactId>
387-
<version>${project.version}</version>
388-
<scope>provided</scope>
389-
</dependency>
390-
<dependency>
391-
<groupId>org.apache.hudi</groupId>
392-
<artifactId>hudi-utilities_${scala.binary.version}</artifactId>
393-
<version>${project.version}</version>
394-
</dependency>
395-
396-
<!-- Hive -->
397-
<dependency>
398-
<groupId>${hive.groupid}</groupId>
399-
<artifactId>hive-service</artifactId>
400-
<version>${hive.version}</version>
401-
<scope>${utilities.bundle.hive.scope}</scope>
402-
</dependency>
403-
404-
<dependency>
405-
<groupId>${hive.groupid}</groupId>
406-
<artifactId>hive-service-rpc</artifactId>
407-
<version>${hive.version}</version>
408-
<scope>${utilities.bundle.hive.scope}</scope>
409-
</dependency>
410-
411-
<dependency>
412-
<groupId>${hive.groupid}</groupId>
413-
<artifactId>hive-jdbc</artifactId>
414-
<version>${hive.version}</version>
415-
<scope>${utilities.bundle.hive.scope}</scope>
416-
</dependency>
417-
418-
<dependency>
419-
<groupId>${hive.groupid}</groupId>
420-
<artifactId>hive-metastore</artifactId>
421-
<version>${hive.version}</version>
422-
<scope>${utilities.bundle.hive.scope}</scope>
423-
</dependency>
424-
425-
<dependency>
426-
<groupId>${hive.groupid}</groupId>
427-
<artifactId>hive-common</artifactId>
428-
<version>${hive.version}</version>
429-
<scope>${utilities.bundle.hive.scope}</scope>
430-
</dependency>
431-
432-
<dependency>
433-
<groupId>org.apache.htrace</groupId>
434-
<artifactId>htrace-core</artifactId>
435-
<version>${htrace.version}</version>
436-
<scope>compile</scope>
437-
</dependency>
438-
439-
<!-- zookeeper -->
440-
<dependency>
441-
<groupId>org.apache.curator</groupId>
442-
<artifactId>curator-framework</artifactId>
443-
<version>${zk-curator.version}</version>
444-
</dependency>
445-
446-
<dependency>
447-
<groupId>org.apache.curator</groupId>
448-
<artifactId>curator-client</artifactId>
449-
<version>${zk-curator.version}</version>
450-
</dependency>
451-
452-
<dependency>
453-
<groupId>org.apache.curator</groupId>
454-
<artifactId>curator-recipes</artifactId>
455-
<version>${zk-curator.version}</version>
456-
</dependency>
457346
</dependencies>
458347

459348
<profiles>

pom.xml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@
9999
<pulsar.version>2.8.1</pulsar.version>
100100
<confluent.version>5.3.4</confluent.version>
101101
<glassfish.version>2.17</glassfish.version>
102+
<glassfish.el.version>3.0.1-b12</glassfish.el.version>
102103
<parquet.version>1.10.1</parquet.version>
103104
<junit.jupiter.version>5.7.0-M1</junit.jupiter.version>
104105
<junit.vintage.version>5.7.0-M1</junit.vintage.version>
@@ -556,6 +557,12 @@
556557
<artifactId>jersey-container-servlet-core</artifactId>
557558
<version>${glassfish.version}</version>
558559
</dependency>
560+
<dependency>
561+
<groupId>org.glassfish</groupId>
562+
<artifactId>javax.el</artifactId>
563+
<version>${glassfish.el.version}</version>
564+
<scope>provided</scope>
565+
</dependency>
559566

560567
<!-- Avro -->
561568
<dependency>

0 commit comments

Comments
 (0)