Skip to content

Commit d5745cc

Browse files
jonvex, xushiyan
authored and committed
[HUDI-4982] Add Utilities and Utilities Slim + Spark Bundle testing to GH Actions (apache#7005)
Co-authored-by: Raymond Xu <2701446+xushiyan@users.noreply.github.com>
1 parent 51d1f94 commit d5745cc

12 files changed

Lines changed: 274 additions & 52 deletions

File tree

.github/workflows/bot.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ jobs:
7272
if: ${{ !endsWith(env.SPARK_PROFILE, '2.4') }} # skip test spark 2.4 as it's covered by Azure CI
7373
run: |
7474
HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
75-
./packaging/bundle-validation/spark-write-hive-sync/ci_run.sh $HUDI_VERSION
75+
./packaging/bundle-validation/ci_run.sh $HUDI_VERSION
7676
- name: Spark SQL Test
7777
env:
7878
SCALA_PROFILE: ${{ matrix.scalaProfile }}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
ARG IMAGE_TAG=spark313hive313
19+
FROM apachehudi/hudi-ci-bundle-validation-base:$IMAGE_TAG
20+
21+
# configure the stack
22+
# Copy the build context (conf/, validation scripts) into the working directory.
# COPY is used instead of ADD: only a plain local copy is needed here.
COPY . .
23+
ENV HUDI_CONF_DIR=$WORKDIR/conf
24+
RUN cp conf/hive-site.xml $HIVE_HOME/conf/
25+
RUN cp conf/hive-site.xml $SPARK_HOME/conf/
26+
RUN cp $DERBY_HOME/lib/derbyclient.jar $SPARK_HOME/jars/
27+
RUN cp conf/spark-defaults.conf $SPARK_HOME/conf/
28+
# Append the HoodieCatalog setting to spark-defaults.conf only when SPARK_HOME
# names a Spark 3.2 or 3.3 install. A POSIX `case` is used because shell-form RUN
# executes under /bin/sh unless a SHELL instruction overrides it, and
# `[[ ... == *glob* ]]` is a bash-only construct.
RUN case "$SPARK_HOME" in \
29+
*"spark-3.2"*|*"spark-3.3"*) printf "\nspark.sql.catalog.spark_catalog org.apache.spark.sql.hudi.catalog.HoodieCatalog\n" >> "$SPARK_HOME/conf/spark-defaults.conf" ;; esac

packaging/bundle-validation/spark-write-hive-sync/Dockerfile renamed to packaging/bundle-validation/Dockerfile-base

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ FROM adoptopenjdk/openjdk8:alpine
1818

1919
RUN apk add --no-cache --upgrade bash
2020

21-
RUN mkdir /opt/hudi-bundles
22-
ENV WORKDIR=/opt/hudi-bundles
21+
RUN mkdir /opt/bundle-validation
22+
ENV WORKDIR=/opt/bundle-validation
2323
WORKDIR $WORKDIR
2424

2525
ARG HADOOP_VERSION=2.7.7
@@ -47,10 +47,3 @@ RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK
4747
&& tar -xf $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -C $WORKDIR/ \
4848
&& rm $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz
4949
ENV SPARK_HOME=$WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION
50-
51-
RUN cp $DERBY_HOME/lib/derbyclient.jar $SPARK_HOME/jars/
52-
COPY hive-site.xml $HIVE_HOME/conf/
53-
RUN ln -sf $HIVE_HOME/conf/hive-site.xml $SPARK_HOME/conf/hive-site.xml
54-
COPY spark-defaults.conf $SPARK_HOME/conf/
55-
COPY validate.scala .
56-
COPY validate.sh .

packaging/bundle-validation/spark-write-hive-sync/ci_run.sh renamed to packaging/bundle-validation/ci_run.sh

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,16 @@
1818
# under the License.
1919

2020
# Note:
21-
# this script is to run by GitHub Actions CI tasks from the project root directory
22-
# and contains environment-specific variables
21+
#
22+
# This script is to
23+
# - set the corresponding variables based on CI job's build profiles
24+
# - prepare Hudi bundle jars for mounting into Docker container for validation
25+
# - prepare test datasets for mounting into Docker container for validation
26+
#
27+
# This is to be run by GitHub Actions CI tasks from the project root directory
28+
# and it contains the CI environment-specific variables.
2329

2430
HUDI_VERSION=$1
25-
# to store bundle jars for validation
26-
mkdir ${GITHUB_WORKSPACE}/jars
27-
cp packaging/hudi-spark-bundle/target/hudi-*-$HUDI_VERSION.jar ${GITHUB_WORKSPACE}/jars
28-
echo 'Validating jars below:'
29-
ls -l ${GITHUB_WORKSPACE}/jars
3031

3132
# choose versions based on build profiles
3233
if [[ ${SPARK_PROFILE} == 'spark2.4' ]]; then
@@ -59,13 +60,33 @@ elif [[ ${SPARK_PROFILE} == 'spark3.3' ]]; then
5960
IMAGE_TAG=spark330hive313
6061
fi
6162

62-
cd packaging/bundle-validation/spark-write-hive-sync || exit 1
63+
# Copy bundle jars to temp dir for mounting
64+
TMP_JARS_DIR=/tmp/jars/$(date +%s)
65+
mkdir -p $TMP_JARS_DIR
66+
cp ${GITHUB_WORKSPACE}/packaging/hudi-spark-bundle/target/hudi-*-$HUDI_VERSION.jar $TMP_JARS_DIR/
67+
cp ${GITHUB_WORKSPACE}/packaging/hudi-utilities-bundle/target/hudi-*-$HUDI_VERSION.jar $TMP_JARS_DIR/
68+
cp ${GITHUB_WORKSPACE}/packaging/hudi-utilities-slim-bundle/target/hudi-*-$HUDI_VERSION.jar $TMP_JARS_DIR/
69+
echo 'Validating jars below:'
70+
ls -l $TMP_JARS_DIR
71+
72+
# Copy test dataset
73+
TMP_DATA_DIR=/tmp/data/$(date +%s)
74+
mkdir -p $TMP_DATA_DIR/stocks/data
75+
cp ${GITHUB_WORKSPACE}/docker/demo/data/*.json $TMP_DATA_DIR/stocks/data/
76+
cp ${GITHUB_WORKSPACE}/docker/demo/config/schema.avsc $TMP_DATA_DIR/stocks/
77+
78+
# build docker image
79+
cd ${GITHUB_WORKSPACE}/packaging/bundle-validation || exit 1
6380
docker build \
6481
--build-arg HADOOP_VERSION=$HADOOP_VERSION \
6582
--build-arg HIVE_VERSION=$HIVE_VERSION \
6683
--build-arg DERBY_VERSION=$DERBY_VERSION \
6784
--build-arg SPARK_VERSION=$SPARK_VERSION \
6885
--build-arg SPARK_HADOOP_VERSION=$SPARK_HADOOP_VERSION \
86+
--build-arg IMAGE_TAG=$IMAGE_TAG \
6987
-t hudi-ci-bundle-validation:$IMAGE_TAG \
7088
.
71-
docker run -v ${GITHUB_WORKSPACE}/jars:/opt/hudi-bundles/jars -i hudi-ci-bundle-validation:$IMAGE_TAG bash validate.sh
89+
90+
# run validation script in docker
91+
docker run -v $TMP_JARS_DIR:/opt/bundle-validation/jars -v $TMP_DATA_DIR:/opt/bundle-validation/data \
92+
-i hudi-ci-bundle-validation:$IMAGE_TAG bash validate.sh

packaging/bundle-validation/spark-write-hive-sync/hive-site.xml renamed to packaging/bundle-validation/conf/hive-site.xml

File renamed without changes.

packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf renamed to packaging/bundle-validation/conf/hudi-defaults.conf

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
# limitations under the License.
1616
#
1717

18-
spark.serializer org.apache.spark.serializer.KryoSerializer
19-
spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
20-
spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
18+
hoodie.upsert.shuffle.parallelism 8
19+
hoodie.insert.shuffle.parallelism 8
20+
hoodie.delete.shuffle.parallelism 8
21+
hoodie.bulkinsert.shuffle.parallelism 8
22+
hoodie.finalize.write.parallelism 8
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
spark.serializer org.apache.spark.serializer.KryoSerializer
19+
spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
20+
spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
21+
spark.default.parallelism 8
22+
spark.sql.shuffle.partitions 8

packaging/bundle-validation/spark-write-hive-sync/validate.sh

Lines changed: 0 additions & 30 deletions
This file was deleted.

packaging/bundle-validation/spark-write-hive-sync/validate.scala renamed to packaging/bundle-validation/spark/validate.scala

File renamed without changes.
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
hoodie.datasource.write.recordkey.field=key
18+
hoodie.datasource.write.partitionpath.field=date
19+
hoodie.datasource.write.precombine.field=ts
20+
hoodie.metadata.enable=true
21+
hoodie.deltastreamer.source.dfs.root=file:///opt/bundle-validation/data/stocks/data
22+
hoodie.deltastreamer.schemaprovider.target.schema.file=file:///opt/bundle-validation/data/stocks/schema.avsc
23+
hoodie.deltastreamer.schemaprovider.source.schema.file=file:///opt/bundle-validation/data/stocks/schema.avsc

0 commit comments

Comments
 (0)