2 changes: 1 addition & 1 deletion .github/workflows/bot.yml
@@ -72,7 +72,7 @@ jobs:
if: ${{ !endsWith(env.SPARK_PROFILE, '2.4') }} # skip test spark 2.4 as it's covered by Azure CI
run: |
HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
-./packaging/bundle-validation/spark-write-hive-sync/ci_run.sh $HUDI_VERSION
+./packaging/bundle-validation/ci_run.sh $HUDI_VERSION
- name: Spark SQL Test
env:
SCALA_PROFILE: ${{ matrix.scalaProfile }}
29 changes: 29 additions & 0 deletions packaging/bundle-validation/Dockerfile
@@ -0,0 +1,29 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

ARG IMAGE_TAG=spark313hive313
FROM apachehudi/hudi-ci-bundle-validation-base:$IMAGE_TAG

# configure the stack
ADD . .
ENV HUDI_CONF_DIR=$WORKDIR/conf
RUN cp conf/hive-site.xml $HIVE_HOME/conf/
RUN cp conf/hive-site.xml $SPARK_HOME/conf/
RUN cp $DERBY_HOME/lib/derbyclient.jar $SPARK_HOME/jars/
RUN cp conf/spark-defaults.conf $SPARK_HOME/conf/
RUN if [[ $SPARK_HOME == *"spark-3.2"* ]] || [[ $SPARK_HOME == *"spark-3.3"* ]]; \
then printf "\nspark.sql.catalog.spark_catalog org.apache.spark.sql.hudi.catalog.HoodieCatalog\n" >> $SPARK_HOME/conf/spark-defaults.conf; fi
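
The image can also be built locally for debugging with something like the sketch below (the tag shown is the Dockerfile's default; ci_run.sh below passes the full set of version build args):

    docker build --build-arg IMAGE_TAG=spark313hive313 \
        -t hudi-ci-bundle-validation:spark313hive313 packaging/bundle-validation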
@@ -18,8 +18,8 @@ FROM adoptopenjdk/openjdk8:alpine

RUN apk add --no-cache --upgrade bash

-RUN mkdir /opt/hudi-bundles
-ENV WORKDIR=/opt/hudi-bundles
+RUN mkdir /opt/bundle-validation
+ENV WORKDIR=/opt/bundle-validation
WORKDIR $WORKDIR

ARG HADOOP_VERSION=2.7.7
@@ -47,10 +47,3 @@ RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK
&& tar -xf $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -C $WORKDIR/ \
&& rm $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz
ENV SPARK_HOME=$WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION

-RUN cp $DERBY_HOME/lib/derbyclient.jar $SPARK_HOME/jars/
-COPY hive-site.xml $HIVE_HOME/conf/
-RUN ln -sf $HIVE_HOME/conf/hive-site.xml $SPARK_HOME/conf/hive-site.xml
-COPY spark-defaults.conf $SPARK_HOME/conf/
-COPY validate.scala .
-COPY validate.sh .
@@ -18,15 +18,16 @@
# under the License.

# Note:
-# this script is to run by GitHub Actions CI tasks from the project root directory
-# and contains environment-specific variables
+#
+# This script is to
+# - set the corresponding variables based on the CI job's build profiles
+# - prepare Hudi bundle jars for mounting into the Docker container for validation
+# - prepare test datasets for mounting into the Docker container for validation
+#
+# It is run by GitHub Actions CI tasks from the project root directory
+# and contains the CI environment-specific variables.

HUDI_VERSION=$1
-# to store bundle jars for validation
-mkdir ${GITHUB_WORKSPACE}/jars
-cp packaging/hudi-spark-bundle/target/hudi-*-$HUDI_VERSION.jar ${GITHUB_WORKSPACE}/jars
-echo 'Validating jars below:'
-ls -l ${GITHUB_WORKSPACE}/jars

# choose versions based on build profiles
if [[ ${SPARK_PROFILE} == 'spark2.4' ]]; then
@@ -59,13 +60,33 @@ elif [[ ${SPARK_PROFILE} == 'spark3.3' ]]; then
IMAGE_TAG=spark330hive313
fi

-cd packaging/bundle-validation/spark-write-hive-sync || exit 1
+# Copy bundle jars to temp dir for mounting
+TMP_JARS_DIR=/tmp/jars/$(date +%s)
+mkdir -p $TMP_JARS_DIR
+cp ${GITHUB_WORKSPACE}/packaging/hudi-spark-bundle/target/hudi-*-$HUDI_VERSION.jar $TMP_JARS_DIR/
+cp ${GITHUB_WORKSPACE}/packaging/hudi-utilities-bundle/target/hudi-*-$HUDI_VERSION.jar $TMP_JARS_DIR/
+cp ${GITHUB_WORKSPACE}/packaging/hudi-utilities-slim-bundle/target/hudi-*-$HUDI_VERSION.jar $TMP_JARS_DIR/
+echo 'Validating jars below:'
+ls -l $TMP_JARS_DIR
+
+# Copy test dataset
+TMP_DATA_DIR=/tmp/data/$(date +%s)
+mkdir -p $TMP_DATA_DIR/stocks/data
+cp ${GITHUB_WORKSPACE}/docker/demo/data/*.json $TMP_DATA_DIR/stocks/data/
+cp ${GITHUB_WORKSPACE}/docker/demo/config/schema.avsc $TMP_DATA_DIR/stocks/
+
+# build docker image
+cd ${GITHUB_WORKSPACE}/packaging/bundle-validation || exit 1
docker build \
--build-arg HADOOP_VERSION=$HADOOP_VERSION \
--build-arg HIVE_VERSION=$HIVE_VERSION \
--build-arg DERBY_VERSION=$DERBY_VERSION \
--build-arg SPARK_VERSION=$SPARK_VERSION \
--build-arg SPARK_HADOOP_VERSION=$SPARK_HADOOP_VERSION \
+--build-arg IMAGE_TAG=$IMAGE_TAG \
-t hudi-ci-bundle-validation:$IMAGE_TAG \
.
-docker run -v ${GITHUB_WORKSPACE}/jars:/opt/hudi-bundles/jars -i hudi-ci-bundle-validation:$IMAGE_TAG bash validate.sh
+
+# run validation script in docker
+docker run -v $TMP_JARS_DIR:/opt/bundle-validation/jars -v $TMP_DATA_DIR:/opt/bundle-validation/data \
+    -i hudi-ci-bundle-validation:$IMAGE_TAG bash validate.sh
@@ -15,6 +15,8 @@
# limitations under the License.
#

spark.serializer org.apache.spark.serializer.KryoSerializer
spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
hoodie.upsert.shuffle.parallelism 8
hoodie.insert.shuffle.parallelism 8
hoodie.delete.shuffle.parallelism 8
hoodie.bulkinsert.shuffle.parallelism 8
hoodie.finalize.write.parallelism 8
22 changes: 22 additions & 0 deletions packaging/bundle-validation/conf/spark-defaults.conf
@@ -0,0 +1,22 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

spark.serializer org.apache.spark.serializer.KryoSerializer
spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
spark.default.parallelism 8
spark.sql.shuffle.partitions 8
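
These defaults land in $SPARK_HOME/conf via the Dockerfile above, so every spark-submit and spark-shell run inside the container picks them up; parallelism is capped at 8 to keep the tiny CI datasets fast. A quick sanity check against a built image (a sketch; the image tag is illustrative):

    docker run -i hudi-ci-bundle-validation:spark330hive313 \
        bash -c 'grep parallelism $SPARK_HOME/conf/spark-defaults.conf'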
30 changes: 0 additions & 30 deletions packaging/bundle-validation/spark-write-hive-sync/validate.sh

This file was deleted.

23 changes: 23 additions & 0 deletions packaging/bundle-validation/utilities/hoodieapp.properties
@@ -0,0 +1,23 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

hoodie.datasource.write.recordkey.field=key
hoodie.datasource.write.partitionpath.field=date
hoodie.datasource.write.precombine.field=ts
hoodie.metadata.enable=true
hoodie.deltastreamer.source.dfs.root=file:///opt/bundle-validation/data/stocks/data
hoodie.deltastreamer.schemaprovider.target.schema.file=file:///opt/bundle-validation/data/stocks/schema.avsc
hoodie.deltastreamer.schemaprovider.source.schema.file=file:///opt/bundle-validation/data/stocks/schema.avsc
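
For context, the stocks records that ci_run.sh copies in carry the key, date, and ts fields referenced above; an illustrative record (field values are made up, not taken from the dataset):

    {"key": "GOOG_2018-08-31 09", "date": "2018/08/31", "ts": 1535737800000, "symbol": "GOOG"}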
25 changes: 25 additions & 0 deletions packaging/bundle-validation/utilities/validate.scala
@@ -0,0 +1,25 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

val hudiDf = spark.read.format("hudi").load("/tmp/hudi-utilities-test/")
val inputDf = spark.read.format("json").load("/opt/bundle-validation/data/stocks/data")
val hudiCount = hudiDf.select("date", "key").distinct.count
val srcCount = inputDf.select("date", "key").distinct.count
if (hudiCount == srcCount) System.exit(0)
println(s"Counts don't match hudiCount: $hudiCount, srcCount: $srcCount")
System.exit(1)
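
validate.sh feeds this file to the Spark shell non-interactively; an illustrative standalone invocation inside the container (the jar name assumes the symlink set up by validate.sh):

    $SPARK_HOME/bin/spark-shell --jars /opt/bundle-validation/jars/utilities.jar \
        -i /opt/bundle-validation/utilities/validate.scala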
137 changes: 137 additions & 0 deletions packaging/bundle-validation/validate.sh
@@ -0,0 +1,137 @@
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

#################################################################################################
# NOTE: this script runs inside the hudi-ci-bundle-validation container
# $WORKDIR/jars/ is mounted from a host directory containing the bundle jars to validate
# $WORKDIR/data/ is mounted from a host directory containing test data, laid out as:
#   - <dataset name>/schema.avsc
#   - <dataset name>/data/<data files>
#################################################################################################
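# e.g. the stocks dataset prepared by ci_run.sh lands as (file names illustrative):
#   data/stocks/schema.avsc
#   data/stocks/data/batch_1.json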

WORKDIR=/opt/bundle-validation
JARS_DIR=${WORKDIR}/jars
# symlink the bundle jars to short names for easier use
ln -sf $JARS_DIR/hudi-spark*.jar $JARS_DIR/spark.jar
ln -sf $JARS_DIR/hudi-utilities-bundle*.jar $JARS_DIR/utilities.jar
ln -sf $JARS_DIR/hudi-utilities-slim*.jar $JARS_DIR/utilities-slim.jar


##
# Function to test the spark bundle with hive sync.
#
# env vars (defined in container):
# HIVE_HOME: path to the hive directory
# DERBY_HOME: path to the derby directory
# SPARK_HOME: path to the spark directory
##
test_spark_bundle () {
    echo "::warning::validate.sh setting up hive metastore for spark bundle validation"

    $DERBY_HOME/bin/startNetworkServer -h 0.0.0.0 &
    $HIVE_HOME/bin/hiveserver2 &
    echo "::warning::validate.sh hive metastore setup complete. Testing"
    $SPARK_HOME/bin/spark-shell --jars $JARS_DIR/spark.jar < $WORKDIR/spark/validate.scala
    if [ "$?" -ne 0 ]; then
        echo "::error::validate.sh failed hive testing"
        exit 1
    fi
    echo "::warning::validate.sh spark bundle validation successful"
}


##
# Function to test the utilities bundle and utilities slim bundle + spark bundle.
# It runs deltastreamer and then verifies that deltastreamer worked correctly.
#
# 1st arg: main jar to run with spark-submit, usually the utilities(-slim) bundle
# 2nd arg and beyond: any additional jars to pass to the --jars option
#
# env vars (defined in container):
# SPARK_HOME: path to the spark directory
##
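# Example (as invoked at the bottom of this script):
#   test_utilities_bundle $JARS_DIR/utilities-slim.jar $JARS_DIR/spark.jar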
test_utilities_bundle () {
    MAIN_JAR=$1
    # join the remaining args into a comma-separated list for spark-submit's --jars option
    printf -v EXTRA_JARS '%s,' "${@:2}"
    EXTRA_JARS="${EXTRA_JARS%,}"
    OPT_JARS=""
    if [[ -n $EXTRA_JARS ]]; then
        OPT_JARS="--jars $EXTRA_JARS"
    fi
    OUTPUT_DIR=/tmp/hudi-utilities-test/
    rm -rf $OUTPUT_DIR
    echo "::warning::validate.sh running deltastreamer"
    $SPARK_HOME/bin/spark-submit \
        --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer \
        $OPT_JARS $MAIN_JAR \
        --props $WORKDIR/utilities/hoodieapp.properties \
        --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
        --source-class org.apache.hudi.utilities.sources.JsonDFSSource \
        --source-ordering-field ts --table-type MERGE_ON_READ \
        --target-base-path ${OUTPUT_DIR} \
        --target-table utilities_tbl --op UPSERT
    # capture the exit code before the test consumes $?
    EXIT_CODE=$?
    if [ "$EXIT_CODE" -ne 0 ]; then
        echo "::error::validate.sh deltastreamer failed with exit code $EXIT_CODE"
        exit 1
    fi
    echo "::warning::validate.sh done with deltastreamer"

    OUTPUT_SIZE=$(du -s ${OUTPUT_DIR} | awk '{print $1}')
    if [[ -z $OUTPUT_SIZE || "$OUTPUT_SIZE" -lt "580" ]]; then
        echo "::error::validate.sh deltastreamer output folder ($OUTPUT_SIZE) is smaller than minimum expected (580)"
        exit 1
    fi

    echo "::warning::validate.sh validating deltastreamer in spark shell"
    SHELL_COMMAND="$SPARK_HOME/bin/spark-shell $OPT_JARS $MAIN_JAR -i $WORKDIR/utilities/validate.scala"
    echo "::debug::this is the shell command: $SHELL_COMMAND"
    LOGFILE="$WORKDIR/${FUNCNAME[0]}.log"
    $SHELL_COMMAND >> $LOGFILE
    if [ "$?" -ne 0 ]; then
        SHELL_RESULT=$(grep "Counts don't match" $LOGFILE)
        echo "::error::validate.sh $SHELL_RESULT"
        exit 1
    fi
    echo "::warning::validate.sh done validating deltastreamer in spark shell"
}


test_spark_bundle
if [ "$?" -ne 0 ]; then
    exit 1
fi

if [[ $SPARK_HOME == *"spark-2.4"* ]] || [[ $SPARK_HOME == *"spark-3.1"* ]]
then
    echo "::warning::validate.sh testing utilities bundle"
    test_utilities_bundle $JARS_DIR/utilities.jar
    if [ "$?" -ne 0 ]; then
        exit 1
    fi
    echo "::warning::validate.sh done testing utilities bundle"
else
    echo "::warning::validate.sh skipping utilities bundle test for non-spark2.4 & non-spark3.1 builds"
fi

echo "::warning::validate.sh testing utilities slim bundle"
test_utilities_bundle $JARS_DIR/utilities-slim.jar $JARS_DIR/spark.jar
if [ "$?" -ne 0 ]; then
    exit 1
fi
echo "::warning::validate.sh done testing utilities slim bundle"