diff --git a/.github/workflows/bot.yml b/.github/workflows/bot.yml
index 6ad5a83514781..e2491b67dbfbc 100644
--- a/.github/workflows/bot.yml
+++ b/.github/workflows/bot.yml
@@ -64,6 +64,15 @@ jobs:
           FLINK_PROFILE: ${{ matrix.flinkProfile }}
         run: mvn test -Punit-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -D"$FLINK_PROFILE" -DfailIfNoTests=false -pl hudi-examples/hudi-examples-flink,hudi-examples/hudi-examples-java,hudi-examples/hudi-examples-spark $MVN_ARGS
+      - name: Bundle Validation
+        env:
+          SCALA_PROFILE: ${{ matrix.scalaProfile }}
+          SPARK_PROFILE: ${{ matrix.sparkProfile }}
+          FLINK_PROFILE: ${{ matrix.flinkProfile }}
+        if: ${{ !endsWith(env.SPARK_PROFILE, '2.4') }} # skip test spark 2.4 as it's covered by Azure CI
+        run: |
+          HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
+          ./packaging/bundle-validation/spark-write-hive-sync/ci_run.sh $HUDI_VERSION
       - name: Spark SQL Test
         env:
           SCALA_PROFILE: ${{ matrix.scalaProfile }}
           SPARK_PROFILE: ${{ matrix.sparkProfile }}
diff --git a/packaging/bundle-validation/spark-write-hive-sync/Dockerfile b/packaging/bundle-validation/spark-write-hive-sync/Dockerfile
new file mode 100644
index 0000000000000..bc9656ef3f311
--- /dev/null
+++ b/packaging/bundle-validation/spark-write-hive-sync/Dockerfile
@@ -0,0 +1,56 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+FROM adoptopenjdk/openjdk8:alpine
+
+RUN apk add --no-cache --upgrade bash
+
+RUN mkdir /opt/hudi-bundles
+ENV WORKDIR=/opt/hudi-bundles
+WORKDIR $WORKDIR
+
+ARG HADOOP_VERSION=2.7.7
+ARG HIVE_VERSION=3.1.3
+ARG DERBY_VERSION=10.14.1.0
+ARG SPARK_VERSION=3.1.3
+ARG SPARK_HADOOP_VERSION=2.7
+
+RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz -P "$WORKDIR" \
+    && tar -xf $WORKDIR/hadoop-$HADOOP_VERSION.tar.gz -C $WORKDIR/ \
+    && rm $WORKDIR/hadoop-$HADOOP_VERSION.tar.gz
+ENV HADOOP_HOME=$WORKDIR/hadoop-$HADOOP_VERSION
+
+RUN wget https://archive.apache.org/dist/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz -P "$WORKDIR" \
+    && tar -xf $WORKDIR/apache-hive-$HIVE_VERSION-bin.tar.gz -C $WORKDIR/ \
+    && rm $WORKDIR/apache-hive-$HIVE_VERSION-bin.tar.gz
+ENV HIVE_HOME=$WORKDIR/apache-hive-$HIVE_VERSION-bin
+
+RUN wget https://archive.apache.org/dist/db/derby/db-derby-$DERBY_VERSION/db-derby-$DERBY_VERSION-bin.tar.gz -P "$WORKDIR" \
+    && tar -xf $WORKDIR/db-derby-$DERBY_VERSION-bin.tar.gz -C $WORKDIR/ \
+    && rm $WORKDIR/db-derby-$DERBY_VERSION-bin.tar.gz
+ENV DERBY_HOME=$WORKDIR/db-derby-$DERBY_VERSION-bin
+
+RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -P "$WORKDIR" \
+    && tar -xf $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -C $WORKDIR/ \
+    && rm $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz
+ENV SPARK_HOME=$WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION
+
+RUN cp $DERBY_HOME/lib/derbyclient.jar $SPARK_HOME/jars/
+COPY hive-site.xml $HIVE_HOME/conf/
+RUN ln -sf $HIVE_HOME/conf/hive-site.xml $SPARK_HOME/conf/hive-site.xml
+COPY spark-defaults.conf $SPARK_HOME/conf/
+COPY validate.scala .
+COPY validate.sh .
diff --git a/packaging/bundle-validation/spark-write-hive-sync/ci_run.sh b/packaging/bundle-validation/spark-write-hive-sync/ci_run.sh
new file mode 100755
index 0000000000000..a1e3832105c89
--- /dev/null
+++ b/packaging/bundle-validation/spark-write-hive-sync/ci_run.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Note:
+# this script is to run by GitHub Actions CI tasks from the project root directory
+# and contains environment-specific variables
+
+HUDI_VERSION=$1
+# to store bundle jars for validation; -p and quoting keep this safe on re-runs
+mkdir -p "${GITHUB_WORKSPACE}/jars"
+cp packaging/hudi-spark-bundle/target/hudi-*-$HUDI_VERSION.jar "${GITHUB_WORKSPACE}/jars"
+echo 'Validating jars below:'
+ls -l "${GITHUB_WORKSPACE}/jars"
+
+# choose versions based on build profiles
+if [[ ${SPARK_PROFILE} == 'spark2.4' ]]; then
+  HADOOP_VERSION=2.7.7
+  HIVE_VERSION=2.3.9
+  DERBY_VERSION=10.10.2.0
+  SPARK_VERSION=2.4.8
+  SPARK_HADOOP_VERSION=2.7
+  IMAGE_TAG=spark248hive239
+elif [[ ${SPARK_PROFILE} == 'spark3.1' ]]; then
+  HADOOP_VERSION=2.7.7
+  HIVE_VERSION=3.1.3
+  DERBY_VERSION=10.14.1.0
+  SPARK_VERSION=3.1.3
+  SPARK_HADOOP_VERSION=2.7
+  IMAGE_TAG=spark313hive313
+elif [[ ${SPARK_PROFILE} == 'spark3.2' ]]; then
+  HADOOP_VERSION=2.7.7
+  HIVE_VERSION=3.1.3
+  DERBY_VERSION=10.14.1.0
+  SPARK_VERSION=3.2.2
+  SPARK_HADOOP_VERSION=2.7
+  IMAGE_TAG=spark322hive313
+elif [[ ${SPARK_PROFILE} == 'spark3.3' ]]; then
+  HADOOP_VERSION=2.7.7
+  HIVE_VERSION=3.1.3
+  DERBY_VERSION=10.14.1.0
+  SPARK_VERSION=3.3.0
+  SPARK_HADOOP_VERSION=2
+  IMAGE_TAG=spark330hive313
+fi
+
+cd packaging/bundle-validation/spark-write-hive-sync || exit 1
+docker build \
+--build-arg HADOOP_VERSION=$HADOOP_VERSION \
+--build-arg HIVE_VERSION=$HIVE_VERSION \
+--build-arg DERBY_VERSION=$DERBY_VERSION \
+--build-arg SPARK_VERSION=$SPARK_VERSION \
+--build-arg SPARK_HADOOP_VERSION=$SPARK_HADOOP_VERSION \
+-t hudi-ci-bundle-validation:$IMAGE_TAG \
+.
+docker run -v ${GITHUB_WORKSPACE}/jars:/opt/hudi-bundles/jars -i hudi-ci-bundle-validation:$IMAGE_TAG bash validate.sh
diff --git a/packaging/bundle-validation/spark-write-hive-sync/hive-site.xml b/packaging/bundle-validation/spark-write-hive-sync/hive-site.xml
new file mode 100644
index 0000000000000..810cd695f28ac
--- /dev/null
+++ b/packaging/bundle-validation/spark-write-hive-sync/hive-site.xml
@@ -0,0 +1,53 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<configuration>
+  <property>
+    <name>system:user.name</name>
+    <value>${user.name}</value>
+  </property>
+  <property>
+    <name>system:java.io.tmpdir</name>
+    <value>file:///tmp/hudi-bundles/hive/java</value>
+  </property>
+  <property>
+    <name>hive.exec.scratchdir</name>
+    <value>file:///tmp/hudi-bundles/hive/exec</value>
+  </property>
+  <property>
+    <name>hive.metastore.warehouse.dir</name>
+    <value>file:///tmp/hudi-bundles/hive/warehouse</value>
+  </property>
+  <property>
+    <name>hive.metastore.schema.verification</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>datanucleus.schema.autoCreateAll</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>javax.jdo.option.ConnectionDriverName</name>
+    <value>org.apache.derby.jdbc.ClientDriver</value>
+  </property>
+  <property>
+    <name>javax.jdo.option.ConnectionURL</name>
+    <value>jdbc:derby://localhost:1527/default;create=true</value>
+  </property>
+</configuration>
diff --git a/packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf b/packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf
new file mode 100644
index 0000000000000..136d9d5ddcb46
--- /dev/null
+++ b/packaging/bundle-validation/spark-write-hive-sync/spark-defaults.conf
@@ -0,0 +1,20 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+spark.serializer org.apache.spark.serializer.KryoSerializer
+spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
+spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
diff --git a/packaging/bundle-validation/spark-write-hive-sync/validate.scala b/packaging/bundle-validation/spark-write-hive-sync/validate.scala
new file mode 100644
index 0000000000000..01faa38509809
--- /dev/null
+++ b/packaging/bundle-validation/spark-write-hive-sync/validate.scala
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hudi.QuickstartUtils._
+import scala.collection.JavaConversions._
+import org.apache.spark.sql.SaveMode._
+import org.apache.hudi.DataSourceReadOptions._
+import org.apache.hudi.DataSourceWriteOptions._
+import org.apache.hudi.config.HoodieWriteConfig._
+import org.apache.hudi.common.model.HoodieRecord
+
+val expected = 10
+val database = "default"
+val tableName = "trips"
+val basePath = "file:///tmp/hudi-bundles/tests/" + tableName
+val dataGen = new DataGenerator
+val inserts = convertToStringList(dataGen.generateInserts(expected))
+val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2))
+df.write.format("hudi").
+  options(getQuickstartWriteConfigs).
+  option(PRECOMBINE_FIELD_OPT_KEY, "ts").
+  option(RECORDKEY_FIELD_OPT_KEY, "uuid").
+  option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
+  option(TABLE_NAME, tableName).
+  option("hoodie.datasource.meta.sync.enable", "true").
+  option("hoodie.datasource.hive_sync.database", database).
+  option("hoodie.datasource.hive_sync.table", tableName).
+  option("hoodie.datasource.hive_sync.partition_extractor_class", "org.apache.hudi.hive.SinglePartPartitionValueExtractor").
+  option("hoodie.datasource.hive_sync.mode", "hms").
+  option("hoodie.datasource.hive_sync.metastore.uris", "thrift://localhost:9083/").
+  mode(Overwrite).
+  save(basePath)
+
+spark.sql("desc " + tableName).show
+val actual = spark.sql("select * from " + tableName).count
+if (expected == actual) {
+  System.out.println("bundle combination passed sanity run.")
+  System.exit(0)
+} else {
+  System.err.println(s"bundle combination failed sanity run:\n\tshould have written $expected records in $database.$tableName")
+  System.exit(1)
+}
diff --git a/packaging/bundle-validation/spark-write-hive-sync/validate.sh b/packaging/bundle-validation/spark-write-hive-sync/validate.sh
new file mode 100755
index 0000000000000..d8526a481507f
--- /dev/null
+++ b/packaging/bundle-validation/spark-write-hive-sync/validate.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# NOTE: this script runs inside hudi-ci-bundle-validation container
+# $WORKDIR/jars/ is supposed to be mounted to a host directory where bundle jars are placed
+# TODO: $JAR_COMBINATIONS should have different orders for different jars to detect class loading issues
+
+$DERBY_HOME/bin/startNetworkServer -h 0.0.0.0 &
+$HIVE_HOME/bin/hiveserver2 &
+WORKDIR=/opt/hudi-bundles
+JAR_COMBINATIONS=$(echo $WORKDIR/jars/*.jar | tr ' ' ',')
+$SPARK_HOME/bin/spark-shell --jars $JAR_COMBINATIONS < $WORKDIR/validate.scala
+
+exit $?