diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 3a4620746f73a..b5b27423d5f53 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -14,274 +14,13 @@ on:
         required: true
 
 jobs:
-  # Build: build Spark and run the tests for specified modules.
-  build:
-    name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
-    # Ubuntu 20.04 is the latest LTS. The next LTS is 22.04.
-    runs-on: ubuntu-20.04
-    strategy:
-      fail-fast: false
-      matrix:
-        java:
-          - 8
-        hadoop:
-          - hadoop3.2
-        hive:
-          - hive2.3
-        # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now.
-        # Kinesis tests depends on external Amazon kinesis service.
-        # Note that the modules below are from sparktestsupport/modules.py.
-        modules:
-          - >-
-            core, unsafe, kvstore, avro,
-            network-common, network-shuffle, repl, launcher,
-            examples, sketch, graphx
-          - >-
-            catalyst, hive-thriftserver
-          - >-
-            streaming, sql-kafka-0-10, streaming-kafka-0-10,
-            mllib-local, mllib,
-            yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
-        # Here, we split Hive and SQL tests into some of slow ones and the rest of them.
-        included-tags: [""]
-        excluded-tags: [""]
-        comment: [""]
-        include:
-          # Hive tests
-          - modules: hive
-            java: 8
-            hadoop: hadoop3.2
-            hive: hive2.3
-            included-tags: org.apache.spark.tags.SlowHiveTest
-            comment: "- slow tests"
-          - modules: hive
-            java: 8
-            hadoop: hadoop3.2
-            hive: hive2.3
-            excluded-tags: org.apache.spark.tags.SlowHiveTest
-            comment: "- other tests"
-          # SQL tests
-          - modules: sql
-            java: 8
-            hadoop: hadoop3.2
-            hive: hive2.3
-            included-tags: org.apache.spark.tags.ExtendedSQLTest
-            comment: "- slow tests"
-          - modules: sql
-            java: 8
-            hadoop: hadoop3.2
-            hive: hive2.3
-            excluded-tags: org.apache.spark.tags.ExtendedSQLTest
-            comment: "- other tests"
-    env:
-      MODULES_TO_TEST: ${{ matrix.modules }}
-      EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
-      INCLUDED_TAGS: ${{ matrix.included-tags }}
-      HADOOP_PROFILE: ${{ matrix.hadoop }}
-      HIVE_PROFILE: ${{ matrix.hive }}
-      # GitHub Actions' default miniconda to use in pip packaging test.
-      CONDA_PREFIX: /usr/share/miniconda
-      GITHUB_PREV_SHA: ${{ github.event.before }}
-      GITHUB_INPUT_BRANCH: ${{ github.event.inputs.target }}
-    steps:
-    - name: Checkout Spark repository
-      uses: actions/checkout@v2
-      # In order to fetch changed files
-      with:
-        fetch-depth: 0
-    - name: Merge dispatched input branch
-      if: ${{ github.event.inputs.target != '' }}
-      run: git merge --progress --ff-only origin/${{ github.event.inputs.target }}
-    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
-    - name: Cache Scala, SBT and Maven
-      uses: actions/cache@v2
-      with:
-        path: |
-          build/apache-maven-*
-          build/scala-*
-          build/*.jar
-          ~/.sbt
-        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
-        restore-keys: |
-          build-
-    - name: Cache Coursier local repository
-      uses: actions/cache@v2
-      with:
-        path: ~/.cache/coursier
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
-        restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-
-    - name: Install Java ${{ matrix.java }}
-      uses: actions/setup-java@v1
-      with:
-        java-version: ${{ matrix.java }}
-    - name: Install Python 3.8
-      uses: actions/setup-python@v2
-      # We should install one Python that is higher then 3+ for SQL and Yarn because:
-      # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
-      # - Yarn has a Python specific test too, for example, YarnClusterSuite.
-      if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
-      with:
-        python-version: 3.8
-        architecture: x64
-    - name: Install Python packages (Python 3.8)
-      if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
-      run: |
-        python3.8 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner
-        python3.8 -m pip list
-    # Run the tests.
-    - name: Run tests
-      run: |
-        # Hive and SQL tests become flaky when running in parallel as it's too intensive.
-        if [[ "$MODULES_TO_TEST" == "hive" ]] || [[ "$MODULES_TO_TEST" == "sql" ]]; then export SERIAL_SBT_TESTS=1; fi
-        ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
-    - name: Upload test results to report
-      if: always()
-      uses: actions/upload-artifact@v2
-      with:
-        name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
-        path: "**/target/test-reports/*.xml"
-    - name: Upload unit tests log files
-      if: failure()
-      uses: actions/upload-artifact@v2
-      with:
-        name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
-        path: "**/target/unit-tests.log"
-
-  pyspark:
-    name: "Build modules: ${{ matrix.modules }}"
-    runs-on: ubuntu-20.04
-    container:
-      image: dongjoon/apache-spark-github-action-image:20201025
-    strategy:
-      fail-fast: false
-      matrix:
-        modules:
-          - >-
-            pyspark-sql, pyspark-mllib, pyspark-resource
-          - >-
-            pyspark-core, pyspark-streaming, pyspark-ml
-    env:
-      MODULES_TO_TEST: ${{ matrix.modules }}
-      HADOOP_PROFILE: hadoop3.2
-      HIVE_PROFILE: hive2.3
-      # GitHub Actions' default miniconda to use in pip packaging test.
-      CONDA_PREFIX: /usr/share/miniconda
-      GITHUB_PREV_SHA: ${{ github.event.before }}
-      GITHUB_INPUT_BRANCH: ${{ github.event.inputs.target }}
-    steps:
-    - name: Checkout Spark repository
-      uses: actions/checkout@v2
-      # In order to fetch changed files
-      with:
-        fetch-depth: 0
-    - name: Merge dispatched input branch
-      if: ${{ github.event.inputs.target != '' }}
-      run: git merge --progress --ff-only origin/${{ github.event.inputs.target }}
-    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
-    - name: Cache Scala, SBT and Maven
-      uses: actions/cache@v2
-      with:
-        path: |
-          build/apache-maven-*
-          build/scala-*
-          build/*.jar
-          ~/.sbt
-        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
-        restore-keys: |
-          build-
-    - name: Cache Coursier local repository
-      uses: actions/cache@v2
-      with:
-        path: ~/.cache/coursier
-        key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
-        restore-keys: |
-          pyspark-coursier-
-    - name: Install Python 3.6
-      uses: actions/setup-python@v2
-      with:
-        python-version: 3.6
-        architecture: x64
-    # This step takes much less time (~30s) than other Python versions so it is not included
-    # in the Docker image being used. There is also a technical issue to install Python 3.6 on
-    # Ubuntu 20.04. See also SPARK-33162.
-    - name: Install Python packages (Python 3.6)
-      run: |
-        python3.6 -m pip install numpy 'pyarrow<3.0.0' pandas scipy xmlrunner
-        python3.6 -m pip list
-    # Run the tests.
-    - name: Run tests
-      run: |
-        ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST"
-    - name: Upload test results to report
-      if: always()
-      uses: actions/upload-artifact@v2
-      with:
-        name: test-results-${{ matrix.modules }}--8-hadoop3.2-hive2.3
-        path: "**/target/test-reports/*.xml"
-    - name: Upload unit tests log files
-      if: failure()
-      uses: actions/upload-artifact@v2
-      with:
-        name: unit-tests-log-${{ matrix.modules }}--8-hadoop3.2-hive2.3
-        path: "**/target/unit-tests.log"
-
-  sparkr:
-    name: "Build modules: sparkr"
-    runs-on: ubuntu-20.04
-    container:
-      image: dongjoon/apache-spark-github-action-image:20201025
-    env:
-      HADOOP_PROFILE: hadoop3.2
-      HIVE_PROFILE: hive2.3
-      GITHUB_PREV_SHA: ${{ github.event.before }}
-      GITHUB_INPUT_BRANCH: ${{ github.event.inputs.target }}
-    steps:
-    - name: Checkout Spark repository
-      uses: actions/checkout@v2
-      # In order to fetch changed files
-      with:
-        fetch-depth: 0
-    - name: Merge dispatched input branch
-      if: ${{ github.event.inputs.target != '' }}
-      run: git merge --progress --ff-only origin/${{ github.event.inputs.target }}
-    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
-    - name: Cache Scala, SBT and Maven
-      uses: actions/cache@v2
-      with:
-        path: |
-          build/apache-maven-*
-          build/scala-*
-          build/*.jar
-          ~/.sbt
-        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
-        restore-keys: |
-          build-
-    - name: Cache Coursier local repository
-      uses: actions/cache@v2
-      with:
-        path: ~/.cache/coursier
-        key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
-        restore-keys: |
-          sparkr-coursier-
-    - name: Run tests
-      run: |
-        # The followings are also used by `r-lib/actions/setup-r` to avoid
-        # R issues at docker environment
-        export TZ=UTC
-        export _R_CHECK_SYSTEM_CLOCK_=FALSE
-        ./dev/run-tests --parallelism 2 --modules sparkr
-    - name: Upload test results to report
-      if: always()
-      uses: actions/upload-artifact@v2
-      with:
-        name: test-results-sparkr--8-hadoop3.2-hive2.3
-        path: "**/target/test-reports/*.xml"
-
   # Static analysis, and documentation build
   lint:
     name: Linters, licenses, dependencies and documentation generation
     runs-on: ubuntu-20.04
+    env:
+      LC_ALL: C.UTF-8
+      LANG: C.UTF-8
     container:
       image: dongjoon/apache-spark-github-action-image:20201025
     steps:
@@ -342,10 +81,6 @@ jobs:
         gem install bundler
         cd docs
         bundle install
-    - name: Scala linter
-      run: ./dev/lint-scala
-    - name: Java linter
-      run: ./dev/lint-java
     - name: Python linter
       run: ./dev/lint-python
     - name: R linter
@@ -357,166 +92,5 @@ jobs:
     - name: Run documentation build
       run: |
         cd docs
-        export LC_ALL=C.UTF-8
-        export LANG=C.UTF-8
         bundle exec jekyll build
 
-  java-11:
-    name: Java 11 build with Maven
-    runs-on: ubuntu-20.04
-    steps:
-    - name: Checkout Spark repository
-      uses: actions/checkout@v2
-    - name: Cache Scala, SBT and Maven
-      uses: actions/cache@v2
-      with:
-        path: |
-          build/apache-maven-*
-          build/scala-*
-          build/*.jar
-          ~/.sbt
-        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
-        restore-keys: |
-          build-
-    - name: Cache Maven local repository
-      uses: actions/cache@v2
-      with:
-        path: ~/.m2/repository
-        key: java11-maven-${{ hashFiles('**/pom.xml') }}
-        restore-keys: |
-          java11-maven-
-    - name: Install Java 11
-      uses: actions/setup-java@v1
-      with:
-        java-version: 11
-    - name: Build with Maven
-      run: |
-        export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
-        export MAVEN_CLI_OPTS="--no-transfer-progress"
-        # It uses Maven's 'install' intentionally, see https://github.com/apache/spark/pull/26414.
-        ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install
-        rm -rf ~/.m2/repository/org/apache/spark
-
-  scala-213:
-    name: Scala 2.13 build with SBT
-    runs-on: ubuntu-20.04
-    steps:
-    - name: Checkout Spark repository
-      uses: actions/checkout@v2
-    - name: Cache Scala, SBT and Maven
-      uses: actions/cache@v2
-      with:
-        path: |
-          build/apache-maven-*
-          build/scala-*
-          build/*.jar
-          ~/.sbt
-        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
-        restore-keys: |
-          build-
-    - name: Cache Coursier local repository
-      uses: actions/cache@v2
-      with:
-        path: ~/.cache/coursier
-        key: scala-213-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
-        restore-keys: |
-          scala-213-coursier-
-    - name: Install Java 8
-      uses: actions/setup-java@v1
-      with:
-        java-version: 8
-    - name: Build with SBT
-      run: |
-        ./dev/change-scala-version.sh 2.13
-        ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pdocker-integration-tests -Pkubernetes-integration-tests -Pspark-ganglia-lgpl -Pscala-2.13 compile test:compile
-
-  hadoop-2:
-    name: Hadoop 2 build with SBT
-    runs-on: ubuntu-20.04
-    steps:
-    - name: Checkout Spark repository
-      uses: actions/checkout@v2
-    - name: Cache Scala, SBT and Maven
-      uses: actions/cache@v2
-      with:
-        path: |
-          build/apache-maven-*
-          build/scala-*
-          build/*.jar
-          ~/.sbt
-        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
-        restore-keys: |
-          build-
-    - name: Cache Coursier local repository
-      uses: actions/cache@v2
-      with:
-        path: ~/.cache/coursier
-        key: hadoop-2-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
-        restore-keys: |
-          hadoop-2-coursier-
-    - name: Install Java 8
-      uses: actions/setup-java@v1
-      with:
-        java-version: 8
-    - name: Build with SBT
-      run: |
-        ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Phadoop-2.7 compile test:compile
-
-  tpcds-1g:
-    name: Run TPC-DS queries with SF=1
-    runs-on: ubuntu-20.04
-    steps:
-    - name: Checkout Spark repository
-      uses: actions/checkout@v2
-    - name: Cache TPC-DS generated data
-      id: cache-tpcds-sf-1
-      uses: actions/cache@v2
-      with:
-        path: ./tpcds-sf-1
-        key: tpcds-${{ hashFiles('tpcds-sf-1/.spark-tpcds-sf-1.md5') }}
-        restore-keys: |
-          tpcds-
-    - name: Checkout TPC-DS (SF=1) generated data repository
-      if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
-      uses: actions/checkout@v2
-      with:
-        repository: maropu/spark-tpcds-sf-1
-        ref: 6b660a53091bd6d23cbe58b0f09aae08e71cc667
-        path: ./tpcds-sf-1
-    - name: Cache Scala, SBT and Maven
-      uses: actions/cache@v2
-      with:
-        path: |
-          build/apache-maven-*
-          build/scala-*
-          build/*.jar
-          ~/.sbt
-        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
-        restore-keys: |
-          build-
-    - name: Cache Coursier local repository
-      uses: actions/cache@v2
-      with:
-        path: ~/.cache/coursier
-        key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
-        restore-keys: |
-          tpcds-coursier-
-    - name: Install Java 8
-      uses: actions/setup-java@v1
-      with:
-        java-version: 8
-    - name: Run TPC-DS queries
-      run: |
-        SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
-    - name: Upload test results to report
-      if: always()
-      uses: actions/upload-artifact@v2
-      with:
-        name: test-results-tpcds--8-hadoop3.2-hive2.3
-        path: "**/target/test-reports/*.xml"
-    - name: Upload unit tests log files
-      if: failure()
-      uses: actions/upload-artifact@v2
-      with:
-        name: unit-tests-log-tpcds--8-hadoop3.2-hive2.3
-        path: "**/target/unit-tests.log"