rapidsai
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/spark-rapids-jni.yaml b/.github/workflows/spark-rapids-jni.yaml
Lines changed: 66 additions & 0 deletions
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
Lines changed: 16 additions & 0 deletions
diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml
Lines changed: 2 additions & 1 deletion
diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml
Lines changed: 2 additions & 1 deletion
diff --git a/conda/recipes/cudf-polars/recipe.yaml b/conda/recipes/cudf-polars/recipe.yaml
Lines changed: 3 additions & 0 deletions
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
Lines changed: 29 additions & 5 deletions
@@ -536,7 +536,7 @@ jobs:
       arch: '["amd64", "arm64"]'
       cuda: '["13.1"]'
       node_type: "cpu8"
-      timeout-minutes: 60
+      timeout-minutes: 90
       env: |
         SCCACHE_DIST_MAX_RETRIES=inf
         SCCACHE_SERVER_LOG=sccache=debug
 
@@ -8,6 +8,8 @@ jobs:
     runs-on: linux-amd64-cpu8
     container:
       image: rapidsai/ci-spark-rapids-jni:rockylinux8-cuda12.9.1
+    permissions:
+      id-token: write
     steps:
       - uses: actions/checkout@v4
         with:
@@ -17,10 +19,74 @@ jobs:
       - uses: actions/checkout@v4
         with:
           path: thirdparty/cudf
+      - uses: aws-actions/configure-aws-credentials@8df5847569e6427dd6c4fb1cf565c83acfa8afa7 # v6.0.0
+        with:
+          role-to-assume: ${{ vars.AWS_ROLE_ARN }}
+          aws-region: ${{ vars.AWS_REGION }}
+          role-duration-seconds: 43200 # 12h
       - name: "Build spark-rapids-jni"
+        env:
+          SCCACHE_S3_KEY_PREFIX: spark-rapids-jni
+          SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX: spark-rapids-jni/preprocessor
+          SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE: true
         run: |
+          set -euo pipefail
+
+          ###
+          # Setup sccache client
+          ###
+
+          # Install jq
+          dnf -y install jq
+
+          # Download gha-tools
+          wget https://github.com/rapidsai/gha-tools/releases/latest/download/tools.tar.gz -O - | tar -xz -C /usr/local/bin
+
+          # Build cluster endpoint
+          export SCCACHE_DIST_SCHEDULER_URL="https://$(uname -m | sed -e 's/x86_64/amd64/' -e 's/aarch64/arm64/').linux.sccache.rapids.nvidia.com"
+
+          export SCCACHE_DIST_AUTH_TOKEN="$(
+            curl -fsSL -H "Authorization: Bearer $(
+              curl -fsSL -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" \
+                "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=token.rapids.nvidia.com" \
+            | jq -r '.value'
+            )" https://token.rapids.nvidia.com/gh/token/exchange \
+          | jq -r '.token')"
+
+          # Install sccache client
+          . rapids-install-sccache
+
+          # Configure sccache
+          . rapids-configure-sccache
+
+          export CPP_PARALLEL_LEVEL="$PARALLEL_LEVEL"
+
+          # Don't use the build cluster for CMake's compiler tests
+          echo -e '\nset(ENV{SCCACHE_NO_DIST_COMPILE} "1")' >> thirdparty/cudf-pins/add_dependency_pins.cmake
+
           echo "------------------------"
           env | sort
           echo "------------------------"
+
+          # Increase the nofile ulimit to build with as much parallelism as possible
+          ulimit -n $(ulimit -Hn)
+
+          # Start the sccache daemon
+          sccache --start-server
+
+          # Verify sccache version and distributed compilation
+          sccache --show-stats
+
+          if sccache --dist-status 2>/dev/null | jq -er '.SchedulerStatus? != null' >/dev/null 2>&1; then
+              echo "Distributed compilation is available:"
+              sccache --dist-status | jq -r '["scheduler URL: " + .SchedulerStatus[0], "server count: " + (.SchedulerStatus[1].servers | length | tostring)][]';
+          else
+              echo "Error: Distributed compilation not available, check connectivity"
+              cat "$SCCACHE_ERROR_LOG";
+              exit 1
+          fi
+
           mkdir target
           source build/env.sh && CMAKE_CUDA_ARCHITECTURES=90 LIBCUDF_DEPENDENCY_MODE=latest USE_GDS=on ${sclCMD} build/buildcpp.sh
+
+          sccache --show-stats
@@ -104,6 +104,22 @@ repos:
         entry: 'rmm::exec_policy\('
         language: pygrep
         types_or: [c, c++, cuda]
+      - id: use-cudf-memcpy-async
+        name: use-cudf-memcpy-async
+        description: 'Enforce that cudf::detail::memcpy_async or memcpy_batch_async is used instead of cudaMemcpyAsync (see developer guide)'
+        entry: 'cudaMemcpyAsync'
+        language: pygrep
+        types_or: [c, c++, cuda]
+        files: '^cpp/(src|include)/'
+        exclude: |
+          (?x)^(
+            cpp/src/utilities/host_memory\.cpp|
+            cpp/src/utilities/cuda_memcpy\.cu|
+            cpp/src/io/utilities/data_sink\.cpp|
+            cpp/include/cudf/contiguous_split\.hpp|
+            cpp/include/cudf/utilities/pinned_memory\.hpp|
+            cpp/include/cudf/utilities/error\.hpp
+          )$
       - id: no-unseeded-default-rng
         name: no-unseeded-default-rng
         description: 'Enforce that no non-seeded default_rng is used and default_rng is used instead of np.random.seed'
 
@@ -57,7 +57,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba-cuda>=0.22.2,<0.28.0
+- numba-cuda>=0.22.2
 - numba>=0.60.0,<0.65.0
 - numpy>=1.26,<3.0
 - numpydoc
 
@@ -57,7 +57,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba-cuda>=0.22.2,<0.28.0
+- numba-cuda>=0.22.2
 - numba>=0.60.0,<0.65.0
 - numpy>=1.26,<3.0
 - numpydoc
 
@@ -57,7 +57,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba-cuda>=0.22.2,<0.28.0
+- numba-cuda>=0.22.2
 - numba>=0.60.0,<0.65.0
 - numpy>=1.26,<3.0
 - numpydoc
@@ -81,6 +81,7 @@ dependencies:
 - python-confluent-kafka
 - python-xxhash
 - python>=3.11
+- pytorch>=2.10.0
 - rapids-build-backend>=0.4.0,<0.5.0
 - rapids-dask-dependency==26.4.*,>=0.0.0a0
 - rapids-logger==0.2.*,>=0.0.0a0
 
@@ -57,7 +57,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba-cuda>=0.22.2,<0.28.0
+- numba-cuda>=0.22.2
 - numba>=0.60.0,<0.65.0
 - numpy>=1.26,<3.0
 - numpydoc
@@ -81,6 +81,7 @@ dependencies:
 - python-confluent-kafka
 - python-xxhash
 - python>=3.11
+- pytorch>=2.10.0
 - rapids-build-backend>=0.4.0,<0.5.0
 - rapids-dask-dependency==26.4.*,>=0.0.0a0
 - rapids-logger==0.2.*,>=0.0.0a0
 
@@ -39,6 +39,9 @@ requirements:
     - polars>=1.30,<1.39
     - packaging
     - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
+    - if: cuda_major == "12"
+      then: cuda-python >=12.9.2,<13.0
+      else: cuda-python >=13.0.1,<14.0
   ignore_run_exports:
     by_name:
       - cuda-version
 
@@ -536,6 +536,7 @@ add_library(
   src/io/parquet/experimental/hybrid_scan_preprocess.cu
   src/io/parquet/experimental/page_index_filter.cu
   src/io/parquet/experimental/page_index_filter_utils.cu
+  src/io/parquet/expression_transform_helpers.cpp
   src/io/parquet/io_utils/parquet_io_utils.cpp
   src/io/parquet/page_data.cu
   src/io/parquet/chunk_dict.cu
 
@@ -282,13 +282,9 @@ ConfigureNVBench(
   merge/merge_strings.cpp
 )
 
-# ##################################################################################################
-# * null_mask benchmark ---------------------------------------------------------------------------
-ConfigureNVBench(NULLMASK_NVBENCH bitmask/set_null_mask.cpp)
-
 # ##################################################################################################
 # * bitmask benchmark ---------------------------------------------------------------------------
-ConfigureNVBench(BITMASK_NVBENCH bitmask/bitmask_and.cu)
+ConfigureNVBench(BITMASK_NVBENCH bitmask/bitmask_and.cpp bitmask/set_null_mask.cpp)
 
 # ##################################################################################################
 # * parquet writer benchmark ----------------------------------------------------------------------
@@ -300,6 +296,34 @@ ConfigureNVBench(
 # * parquet reader benchmark ----------------------------------------------------------------------
 ConfigureNVBench(
   PARQUET_READER_NVBENCH io/parquet/parquet_reader_input.cpp io/parquet/parquet_reader_options.cpp
+  io/parquet/reader_common.cpp
+)
+
+# ##################################################################################################
+# * parquet reader chunks benchmark
+# ------------------------------------------------------------------
+ConfigureNVBench(
+  PARQUET_READER_CHUNKS_NVBENCH io/parquet/parquet_reader_chunks.cpp io/parquet/reader_common.cpp
+)
+
+# ##################################################################################################
+# * parquet reader compressed benchmark ------------------------------------------------------------
+ConfigureNVBench(
+  PARQUET_READER_COMPRESSED_NVBENCH io/parquet/parquet_reader_compressed.cpp
+  io/parquet/reader_common.cpp
+)
+
+# ##################################################################################################
+# * parquet reader strings benchmark
+# ------------------------------------------------------------------
+ConfigureNVBench(
+  PARQUET_READER_STRINGS_NVBENCH io/parquet/parquet_reader_strings.cpp io/parquet/reader_common.cpp
+)
+
+# ##################################################################################################
+# * parquet reader wide benchmark ------------------------------------------------------------------
+ConfigureNVBench(
+  PARQUET_READER_WIDE_NVBENCH io/parquet/parquet_reader_wide.cpp io/parquet/reader_common.cpp
 )
 
 # ##################################################################################################