diff --git a/.env b/.env index dad867f8f66e..6985926772df 100644 --- a/.env +++ b/.env @@ -52,7 +52,7 @@ ULIMIT_CORE=-1 # Default versions for platforms ALMALINUX=8 ALPINE_LINUX=3.22 -DEBIAN=12 +DEBIAN=13 FEDORA=42 UBUNTU=22.04 @@ -61,11 +61,9 @@ CLANG_TOOLS=18 CMAKE=3.26.0 CUDA=11.7.1 DASK=latest -DOTNET=8.0 GCC= HDFS=3.2.1 JDK=11 -KARTOTHEK=latest # LLVM 12 and GCC 11 reports -Wmismatched-new-delete. LLVM=18 MAVEN=3.8.7 @@ -79,7 +77,6 @@ PYTHON_IMAGE_TAG=3.10 PYTHON_ABI_TAG=cp310 R=4.5 SPARK=master -TURBODBC=latest # These correspond to images on Docker Hub that contain R, e.g. rhub/ubuntu-release:latest R_IMAGE=ubuntu-release @@ -96,14 +93,14 @@ TZ=UTC # Used through compose.yaml and serves as the default version for the # ci/scripts/install_vcpkg.sh script. Prefer to use short SHAs to keep the # docker tags more readable. -VCPKG="4334d8b4c8916018600212ab4dd4bbdc343065d1" # 2025.09.17 Release +VCPKG="66c0373dc7fca549e5803087b9487edfe3aca0a1" # 2026.01.16 Release # This must be updated when we update # ci/docker/python-*-windows-*.dockerfile or the vcpkg config. # This is a workaround for our CI problem that "archery docker build" doesn't # use pulled built images in dev/tasks/python-wheels/github.windows.yml. -PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2025-10-13 -PYTHON_WHEEL_WINDOWS_TEST_IMAGE_REVISION=2025-10-13 +PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2026-02-07 +PYTHON_WHEEL_WINDOWS_TEST_IMAGE_REVISION=2026-02-07 # Use conanio/${CONAN_BASE}:{CONAN_VERSION} for "docker compose run --rm conan". 
# See https://github.com/conan-io/conan-docker-tools#readme and diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 4b3eac2d4330..a293127ed9f7 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -4,6 +4,7 @@ If this is your first pull request you can find detailed information on how to c * [New Contributor's Guide](https://arrow.apache.org/docs/dev/developers/guide/step_by_step/pr_lifecycle.html#reviews-and-merge-of-the-pull-request) * [Contributing Overview](https://arrow.apache.org/docs/dev/developers/overview.html) + * [AI-generated Code Guidance](https://arrow.apache.org/docs/dev/developers/overview.html#ai-generated-code) Please remove this line and the above text before creating your pull request. diff --git a/.github/workflows/cpp_extra.yml b/.github/workflows/cpp_extra.yml index 68f47926ad97..cdaf268ca02b 100644 --- a/.github/workflows/cpp_extra.yml +++ b/.github/workflows/cpp_extra.yml @@ -336,9 +336,79 @@ jobs: cd cpp/examples/minimal_build ../minimal_build.build/arrow-example - odbc: + odbc-macos: needs: check-labels - name: ODBC + name: ODBC ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} + runs-on: macos-${{ matrix.macos-version }} + if: >- + needs.check-labels.outputs.force == 'true' || + contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra') || + contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra: C++') + timeout-minutes: 75 + strategy: + fail-fast: false + matrix: + include: + - architecture: AMD64 + macos-version: "15-intel" + - architecture: ARM64 + macos-version: "14" + env: + ARROW_BUILD_TESTS: ON + ARROW_FLIGHT_SQL_ODBC: ON + ARROW_HOME: /tmp/local + steps: + - name: Checkout Arrow + uses: actions/checkout@v6.0.1 + with: + fetch-depth: 0 + submodules: recursive + - name: Install Dependencies + run: | + brew bundle --file=cpp/Brewfile + - name: Setup ccache + run: | + 
ci/scripts/ccache_setup.sh + - name: ccache info + id: ccache-info + run: | + echo "cache-dir=$(ccache --get-config cache_dir)" >> $GITHUB_OUTPUT + - name: Cache ccache + uses: actions/cache@v5.0.2 + with: + path: ${{ steps.ccache-info.outputs.cache-dir }} + key: cpp-odbc-ccache-macos-${{ matrix.macos-version }}-${{ hashFiles('cpp/**') }} + restore-keys: cpp-odbc-ccache-macos-${{ matrix.macos-version }}- + - name: Build + run: | + # Homebrew uses /usr/local as prefix. So packages + # installed by Homebrew also use /usr/local/include. We + # want to include headers for packages installed by + # Homebrew as system headers to ignore warnings in them. + # But "-isystem /usr/local/include" isn't used by CMake + # because /usr/local/include is marked as the default + # include path. So we disable -Werror to avoid build error + # by warnings from packages installed by Homebrew. + export BUILD_WARNING_LEVEL=PRODUCTION + LIBIODBC_DIR="$(brew --cellar libiodbc)/$(brew list --versions libiodbc | awk '{print $2}')" + ODBC_INCLUDE_DIR=$LIBIODBC_DIR/include + export ARROW_CMAKE_ARGS="-DODBC_INCLUDE_DIR=$ODBC_INCLUDE_DIR" + export CXXFLAGS="$CXXFLAGS -I$ODBC_INCLUDE_DIR" + ci/scripts/cpp_build.sh $(pwd) $(pwd)/build + - name: Register Flight SQL ODBC Driver + run: | + sudo cpp/src/arrow/flight/sql/odbc/install/mac/install_odbc.sh $(pwd)/build/cpp/debug/libarrow_flight_sql_odbc.dylib + - name: Test + shell: bash + run: | + sudo sysctl -w kern.coredump=1 + sudo sysctl -w kern.corefile=/tmp/core.%N.%P + ulimit -c unlimited # must enable within the same shell + ci/scripts/cpp_test.sh $(pwd) $(pwd)/build + + odbc-msvc: + needs: check-labels + name: ODBC Windows runs-on: windows-2022 if: >- needs.check-labels.outputs.force == 'true' || @@ -352,6 +422,9 @@ jobs: ARROW_BUILD_STATIC: OFF ARROW_BUILD_TESTS: ON ARROW_BUILD_TYPE: release + # Turn Arrow CSV off to disable `find_package(Arrow)` check on MSVC CI. + # GH-49050 TODO: enable `find_package(Arrow)` check on MSVC CI. 
+ ARROW_CSV: OFF ARROW_DEPENDENCY_SOURCE: VCPKG ARROW_FLIGHT_SQL_ODBC: ON ARROW_FLIGHT_SQL_ODBC_INSTALLER: ON @@ -434,10 +507,15 @@ jobs: shell: cmd run: | call "cpp\src\arrow\flight\sql\odbc\tests\install_odbc.cmd" ${{ github.workspace }}\build\cpp\%ARROW_BUILD_TYPE%\arrow_flight_sql_odbc.dll - # GH-48270 TODO: Resolve segementation fault during Arrow library unload - # GH-48269 TODO: Enable Flight & Flight SQL testing in MSVC CI - # GH-48547 TODO: enable ODBC tests after GH-48270 and GH-48269 are resolved. - + - name: Test + shell: cmd + run: | + set VCPKG_ROOT_KEEP=%VCPKG_ROOT% + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 + set VCPKG_ROOT=%VCPKG_ROOT_KEEP% + # Convert VCPKG Windows path to MSYS path + for /f "usebackq delims=" %%I in (`bash -c "cygpath -u \"$VCPKG_ROOT_KEEP\""` ) do set VCPKG_ROOT=%%I + bash -c "ci/scripts/cpp_test.sh $(pwd) $(pwd)/build" - name: Install WiX Toolset shell: pwsh run: | @@ -455,18 +533,79 @@ jobs: uses: actions/upload-artifact@v6 with: name: flight-sql-odbc-msi-installer - path: build/cpp/Apache Arrow Flight SQL ODBC-*-win64.msi + path: build/cpp/Apache-Arrow-Flight-SQL-ODBC-*-win64.msi if-no-files-found: error - # Upload ODBC installer as nightly release in scheduled runs + - name: Install ODBC MSI + run: | + cd build/cpp + $odbc_msi = Get-ChildItem -Filter "Apache-Arrow-Flight-SQL-ODBC-*-win64.msi" + if (-not $odbc_msi) { + Write-Error "ODBC MSI not found" + exit 1 + } + + foreach ($msi in $odbc_msi) { + Write-Host "Installing $($msi.Name) with logs" + $log = "odbc-install.log" + Start-Process msiexec.exe -Wait -ArgumentList "/i `"$msi`"", "/qn", "/L*V `"$log`"" + Get-Content $log + } + - name: Check ODBC DLL installation + run: | + $dirs = Get-ChildItem "C:\Program Files" -Directory -Filter "Apache-Arrow-Flight-SQL-ODBC*" + + foreach ($dir in $dirs) { + $bin = Join-Path $dir.FullName "bin" + + if (Test-Path $bin) { + tree $bin /f + + $dll = Join-Path $bin 
"arrow_flight_sql_odbc.dll" + if (Test-Path $dll) { + Write-Host "Found ODBC DLL: $dll" + exit 0 + } + } + } + + Write-Error "ODBC DLL not found" + exit 1 + + odbc-nightly: + needs: odbc-msvc + name: ODBC nightly + runs-on: ubuntu-latest + if: github.event_name == 'schedule' && github.repository == 'apache/arrow' + steps: + - name: Download the artifacts + uses: actions/download-artifact@v7 + with: + name: flight-sql-odbc-msi-installer - name: Prepare ODBC installer for sync - if: github.event_name == 'schedule' run: | mkdir odbc-installer - Move-Item "build/cpp/Apache Arrow Flight SQL ODBC-*-win64.msi" odbc-installer/ - tree odbc-installer /f + mv *.msi odbc-installer/ + + # Add `dev-yyyy-mm-dd` to ODBC MSI before `win64.msi`: + # Apache Arrow Flight SQL ODBC-24.0.0-win64.msi -> + # Apache Arrow Flight SQL ODBC-24.0.0-dev-2026-02-06-win64.msi + cd odbc-installer + msi_name=$(ls *.msi) + dev_msi_name=$(echo ${msi_name} | sed -e "s/win64\.msi$/dev-$(date +%Y-%m-%d)-win64.msi/") + mv "${msi_name}" "${dev_msi_name}" + cd .. 
+ + tree odbc-installer + - name: Checkout Arrow + uses: actions/checkout@v6 + with: + fetch-depth: 1 + path: arrow + repository: apache/arrow + ref: main + submodules: recursive - name: Sync to Remote - if: github.event_name == 'schedule' - uses: ./.github/actions/sync-nightlies + uses: ./arrow/.github/actions/sync-nightlies with: upload: true switches: -avzh --update --delete --progress @@ -478,6 +617,39 @@ jobs: remote_key: ${{ secrets.NIGHTLIES_RSYNC_KEY }} remote_host_key: ${{ secrets.NIGHTLIES_RSYNC_HOST_KEY }} + odbc-release: + needs: odbc-msvc + name: ODBC release + runs-on: ubuntu-latest + if: ${{ startsWith(github.ref_name, 'apache-arrow-') && contains(github.ref_name, '-rc') }} + permissions: + # Upload to GitHub Release + contents: write + steps: + - name: Checkout Arrow + uses: actions/checkout@v6 + with: + fetch-depth: 0 + submodules: recursive + - name: Download the artifacts + uses: actions/download-artifact@v7 + with: + name: flight-sql-odbc-msi-installer + - name: Wait for creating GitHub Release + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + dev/release/utils-watch-gh-workflow.sh \ + ${GITHUB_REF_NAME} \ + release_candidate.yml + - name: Upload the artifacts to GitHub Release + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh release upload ${GITHUB_REF_NAME} \ + --clobber \ + Apache-Arrow-Flight-SQL-ODBC-*-win64.msi + report-extra-cpp: if: github.event_name == 'schedule' && always() needs: @@ -485,6 +657,8 @@ jobs: - jni-linux - jni-macos - msvc-arm64 - - odbc + - odbc-macos + - odbc-msvc + - odbc-nightly uses: ./.github/workflows/report_ci.yml secrets: inherit diff --git a/.github/workflows/cpp_windows.yml b/.github/workflows/cpp_windows.yml index 69bbfee28b97..3e1f2b4181e4 100644 --- a/.github/workflows/cpp_windows.yml +++ b/.github/workflows/cpp_windows.yml @@ -41,12 +41,14 @@ jobs: runs-on: ${{ inputs.os }} timeout-minutes: 60 env: + ARROW_AZURE: ON ARROW_BOOST_USE_SHARED: OFF ARROW_BUILD_BENCHMARKS: ON 
ARROW_BUILD_SHARED: ON ARROW_BUILD_STATIC: OFF ARROW_BUILD_TESTS: ON ARROW_DATASET: ON + ARROW_FILESYSTEM: ON ARROW_FLIGHT: OFF ARROW_HDFS: ON ARROW_HOME: /usr diff --git a/.github/workflows/package_linux.yml b/.github/workflows/package_linux.yml index 4dc9a70e8798..e300251e1651 100644 --- a/.github/workflows/package_linux.yml +++ b/.github/workflows/package_linux.yml @@ -218,7 +218,7 @@ jobs: rake version:update popd - name: Login to GitHub Container registry - uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0 + uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3.7.0 with: registry: ghcr.io username: ${{ github.actor }} diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index e5d367958dd1..b200b37d1fe0 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -60,6 +60,7 @@ jobs: timeout-minutes: 60 strategy: fail-fast: false + max-parallel: 20 matrix: name: - conda-python-docs @@ -69,10 +70,10 @@ jobs: - conda-python-3.12-no-numpy include: - name: conda-python-docs - cache: conda-python-3.10 + cache: conda-python-3.11 image: conda-python-docs - title: AMD64 Conda Python 3.10 Sphinx & Numpydoc - python: "3.10" + title: AMD64 Conda Python 3.11 Sphinx & Numpydoc + python: "3.11" - name: conda-python-3.11-nopandas cache: conda-python-3.11 image: conda-python @@ -145,12 +146,15 @@ jobs: timeout-minutes: 60 strategy: fail-fast: false + max-parallel: 20 matrix: include: - architecture: AMD64 macos-version: "15-intel" + large-memory-tests: "OFF" - architecture: ARM64 macos-version: "14" + large-memory-tests: "ON" env: ARROW_HOME: /tmp/local ARROW_AZURE: ON @@ -173,7 +177,8 @@ jobs: ARROW_WITH_SNAPPY: ON ARROW_WITH_BROTLI: ON ARROW_BUILD_TESTS: OFF - PYARROW_TEST_LARGE_MEMORY: ON + PYARROW_TEST_LARGE_MEMORY: ${{ matrix.large-memory-tests }} + PYTEST_ARGS: "-n auto --durations=40" # Current oldest supported version according to https://endoflife.date/macos MACOSX_DEPLOYMENT_TARGET: 
12.0 steps: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c4c4f04188d3..a33aa3acb473 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -337,6 +337,7 @@ repos: ?^ci/scripts/python_sdist_build\.sh$| ?^ci/scripts/python_sdist_test\.sh$| ?^ci/scripts/python_wheel_unix_test\.sh$| + ?^ci/scripts/python_test_type_annotations\.sh$| ?^ci/scripts/r_build\.sh$| ?^ci/scripts/r_revdepcheck\.sh$| ?^ci/scripts/release_test\.sh$| @@ -352,6 +353,7 @@ repos: ?^cpp/build-support/update-thrift\.sh$| ?^cpp/examples/minimal_build/run\.sh$| ?^cpp/examples/tutorial_examples/run\.sh$| + ?^cpp/src/arrow/flight/sql/odbc/install/mac/install_odbc\.sh$| ?^dev/release/05-binary-upload\.sh$| ?^dev/release/08-binary-verify\.sh$| ?^dev/release/binary-recover\.sh$| @@ -360,8 +362,6 @@ repos: ?^dev/release/post-09-python\.sh$| ?^dev/release/setup-rhel-rebuilds\.sh$| ?^dev/release/utils-generate-checksum\.sh$| - ?^python/asv-install\.sh$| - ?^python/asv-uninstall\.sh$| ?^swift/gen-protobuffers\.sh$| ) - repo: https://github.com/scop/pre-commit-shfmt @@ -379,6 +379,8 @@ repos: # TODO: Remove this when we fix all lint failures files: >- ( + ?^ci/scripts/python_test_type_annotations\.sh$| + ?^cpp/src/arrow/flight/sql/odbc/install/mac/install_odbc\.sh$| ?^dev/release/05-binary-upload\.sh$| ?^dev/release/binary-recover\.sh$| ?^dev/release/post-03-binary\.sh$| diff --git a/AGENT.md b/AGENT.md new file mode 100644 index 000000000000..9a614de972eb --- /dev/null +++ b/AGENT.md @@ -0,0 +1,10 @@ +# LinkedIn Arrow Fork + +This repository is a fork of [Apache Arrow](https://github.com/apache/arrow) maintained by LinkedIn. 
+ +## Branching Rules + +- Every new branch MUST be created from `main` +- Every PR MUST target `main` as the base branch +- `apache-main-sync` is reserved for upstream syncing only +- There must be NO direct branches from or to `apache-main-sync` diff --git a/IMPLEMENTATION-GUIDE.md b/IMPLEMENTATION-GUIDE.md new file mode 100644 index 000000000000..99c5ef2c121d --- /dev/null +++ b/IMPLEMENTATION-GUIDE.md @@ -0,0 +1,413 @@ +# ORC Predicate Pushdown Implementation Guide + +This document defines **how** to implement ORC predicate pushdown, using Parquet as the reference implementation. It establishes constraints, comparison frameworks, reuse rules, and required outputs for quality assurance. + +**Read this before starting any implementation work.** + +--- + +## Table of Contents + +1. [The Parquet Reference Relationship](#the-parquet-reference-relationship) +2. [Non-Negotiable Constraints](#non-negotiable-constraints) +3. [Comparison Framework](#comparison-framework) +4. [Reuse & Sharing Rules](#reuse--sharing-rules) +5. [Test & Validation Strategy](#test--validation-strategy) +6. [Required Session Outputs](#required-session-outputs) +7. [Initial Parity Analysis](#initial-parity-analysis) +8. [Footguns Checklist](#footguns-checklist) +9. [Key Parquet Code References](#key-parquet-code-references) + +--- + +## The Parquet Reference Relationship + +The Parquet predicate pushdown reference is: + +### Inspirational +Treat it as a proven blueprint for strategy, architecture, concurrency patterns, and feature completeness. + +### A Source of Reusable Ideas and Patterns +You may recommend copying approaches and structure. + +### Sometimes a Source of Reusable Code +You may suggest reusing generic utilities or abstractions if they are not Parquet-specific and can be shared cleanly (no tight coupling, no semantic mismatch). + +### Never to Be Modified +Do not propose edits to reference files unless explicitly instructed. 
If you believe a change in shared code is necessary, propose an ORC-local alternative first, and only then suggest a shared abstraction as an optional follow-up. + +--- + +## Non-Negotiable Constraints + +1. **Do not touch the reference implementation** (Parquet predicate pushdown) unless explicitly instructed. + +2. **Preserve semantics**: ORC pushdown must match ORC's encoding/reader semantics and Arrow's scan/filter semantics. + +3. **Avoid accidental coupling**: Don't introduce Parquet-only assumptions into ORC (statistics formats, encodings, row-group logic, etc.). + +4. **Keep concurrency safe**: Any parallel evaluation/IO must be race-free, deterministic in behavior, and consistent with Arrow's patterns. + +5. **Conservative filtering**: Never exclude stripes that might contain matching rows. When in doubt, include the stripe. + +--- + +## Comparison Framework + +When comparing ORC vs Parquet pushdown, **always** evaluate these five dimensions: + +### 1. Feature Surface & Semantics + +| Aspect | Parquet Status | ORC Target | Notes | +|--------|---------------|------------|-------| +| Comparison predicates (=, !=, <, <=, >, >=) | Full support | Must implement | Core feature | +| Logical operators (AND, OR, NOT) | Full support | Must implement | Compound predicates | +| IN predicate | Supported | Must implement | Range intersection | +| IS NULL / IS VALID | Supported | Must implement | Null handling | +| Type coverage: int32, int64 | Supported | Phase 1 | Initial types | +| Type coverage: float32, float64 | Supported | Phase 2 | Float edge cases | +| Type coverage: string, binary | Supported | Phase 2 | Truncation handling | +| Type coverage: timestamp, date | Supported | Phase 2 | Unit conversion | +| Type coverage: decimal | Supported | Future | Complex | +| Nested types (struct/list/map) | Via SchemaManifest | Must implement | Column index mapping | +| Three-valued logic (NULL semantics) | Correct | Must match | UNKNOWN = include | + +### 2. 
Pushdown Depth & Plan + +| Aspect | Parquet | ORC Target | +|--------|---------|------------| +| Partition pruning (directory level) | Scanner handles | Same (no change) | +| Row group / stripe filtering | `FilterRowGroups()` | `FilterStripes()` | +| Sub-stripe (row index) | Not used | Not initially (future) | +| Expression binding | Defensive in `TestRowGroups` | Same pattern | +| Fallback on missing stats | Include row group | Include stripe | +| Fallback on corrupted stats | Include row group | Include stripe | + +### 3. Statistics and Index Usage + +| Aspect | Parquet | ORC | Difference | +|--------|---------|-----|------------| +| Statistics source | RowGroup column metadata | Stripe column statistics | API differs | +| Min/max availability | `has_min_max` flag | `has_minimum`, `has_maximum` | Similar | +| Null count | `null_count` field | `has_null`, `num_values` | ORC uses num_values=0 for all-null | +| Deprecated stats flag | Writer version check | `is_statistics_deprecated` | Similar concept | +| Bloom filters | Supported (separate) | Available in ORC | Future enhancement | +| Column index (page-level) | Supported | Row index (similar) | Future enhancement | + +### 4. Concurrency & Performance Strategy + +| Aspect | Parquet | ORC Target | +|--------|---------|------------| +| Cache protection | `physical_schema_mutex_` | Same pattern | +| Metadata caching | `metadata_`, `manifest_` | Same fields | +| Statistics caching | `statistics_expressions_[]` | `stripe_guarantees[]` | +| Column completion tracking | `statistics_expressions_complete_[]` | `statistics_complete[]` | +| Idempotent operations | Yes | Must maintain | +| Incremental cache population | Yes | Must implement | + +### 5. 
Architecture & Extensibility + +| Aspect | Parquet | ORC Target | +|--------|---------|------------| +| Fragment class | `ParquetFileFragment` | `OrcFileFragment` (NEW) | +| Schema manifest | `parquet::arrow::SchemaManifest` | `OrcSchemaManifest` (NEW) | +| Statistics to expression | `EvaluateStatisticsAsExpression()` | `DeriveFieldGuarantee()` | +| Row group testing | `TestRowGroups()` | `TestStripes()` | +| Row group filtering | `FilterRowGroups()` | `FilterStripes()` | +| Count optimization | `TryCountRows()` | `OrcTryCountRows()` | + +--- + +## Reuse & Sharing Rules + +When you see something strong in the Parquet reference, classify it into exactly one bucket: + +### Idea Reuse (Preferred) +Replicate the design pattern or strategy in ORC-specific code. + +**Examples:** +- Thread safety model with `physical_schema_mutex_` +- Incremental statistics cache population +- Defensive expression binding +- Conservative filtering invariants + +### Infra Reuse +Reuse existing shared infrastructure if already designed to be format-agnostic. + +**Examples:** +- `compute::SimplifyWithGuarantee()` - shared expression simplification +- `FileFormatFixtureMixin` - test fixtures +- `compute::Expression` - expression representation +- `compute::Simplify()` - expression optimization + +### Code Reuse (Only If Clean) +Suggest factoring or reusing code only if it is clearly generic and does not require changing the reference. + +**For each reuse suggestion, explicitly state:** +1. Why it's reusable +2. What format-specific assumptions must be removed/avoided +3. Whether it requires new shared abstractions (and whether that would touch reference files) + +--- + +## Test & Validation Strategy + +### Reusable Test Infrastructure from Parquet + +| Component | Location | Reusable? 
| +|-----------|----------|-----------| +| `FileFormatFixtureMixin` | `test_util_internal.h` | YES - format-agnostic | +| `FileFormatScanMixin` | `test_util_internal.h` | YES - format-agnostic | +| `OrcFormatHelper` | `file_orc_test.cc` | EXISTS - extend it | +| Expression builders | `compute/expression.h` | YES - shared | +| Test data generation | Format-specific | NO - ORC-specific needed | + +### ORC-Specific Test Fixtures Needed + +1. **Multi-stripe ORC file generator** + - Create files with known statistics per stripe + - Control min/max values, null counts + - Support deprecated statistics flag + +2. **Statistics edge case files** + - All-null stripes (num_values = 0) + - Single-value stripes (min = max) + - Missing statistics + - Corrupted statistics (min > max) + +3. **Nested type test files** + - Struct columns with leaf statistics + - List columns + - Map columns + +### Test Parity Matrix + +| Test Category | Parquet Has | ORC Needs | +|---------------|-------------|-----------| +| Basic scan tests | YES | YES (exists) | +| CountRows | YES | YES (exists) | +| CountRows with predicate pushdown | YES | **NO - ADD** | +| PredicatePushdown | YES | **NO - ADD** | +| PredicatePushdownRowGroupFragments | YES | **NO - ADD** | +| String column pushdown | YES | **FUTURE** | +| Duration column pushdown | YES | **FUTURE** | +| Multithreaded scan | YES | **NO - ADD** | +| Cached metadata | YES | **NO - ADD** | +| Explicit row group selection | YES | **NO - ADD** | + +### Required New Tests for ORC + +```cpp +// Tests to add to file_orc_test.cc + +TEST_F(TestOrcFileFormat, CountRowsPredicatePushdown) { ... } +TEST_F(TestOrcFileFormat, CachedMetadata) { ... } +TEST_F(TestOrcFileFormat, MultithreadedScan) { ... } + +TEST_P(TestOrcFileFormatScan, PredicatePushdown) { ... } +TEST_P(TestOrcFileFormatScan, PredicatePushdownStripeFragments) { ... } +TEST_P(TestOrcFileFormatScan, ExplicitStripeSelection) { ... 
} +``` + +--- + +## Required Session Outputs + +Every implementation session **MUST** produce these sections: + +### 1. Reference Snapshot +What parts of Parquet pushdown are most relevant to the current work. + +### 2. ORC Current State +What exists, what changed recently, and what's under review. + +### 3. Parity & Gaps Table +| Feature | Parquet | ORC | Status | +|---------|---------|-----|--------| +| ... | ... | ... | Parity/Missing/Different-by-design | + +### 4. Reuse Plan +Ideas/infra/code reuse suggestions with constraints. + +### 5. Risk Register +- Correctness risks +- Performance risks +- Concurrency risks + +### 6. Action Checklist +Prioritized steps: +- P0: Correctness +- P1: Tests +- P2: Performance +- P3: Cleanup + +### 7. Test Matrix +Predicate types × data types × metadata availability × edge cases. + +--- + +## Initial Parity Analysis + +### Current State Comparison + +| Metric | Parquet | ORC | Gap | +|--------|---------|-----|-----| +| Header file lines | 410 | 75 | 5.5x | +| Implementation lines | 1200 | 233 | 5.1x | +| Test file lines | 999 | 96 | 10.4x | +| Fragment class | `ParquetFileFragment` (78 lines) | **MISSING** | Must create | +| Schema manifest | `parquet::arrow::SchemaManifest` | **MISSING** | Must create | +| Predicate pushdown tests | 8+ tests | 0 | Must add | + +### Parquet Components to Mirror in ORC + +| Parquet Component | Lines | ORC Equivalent | Priority | +|-------------------|-------|----------------|----------| +| `ParquetFileFragment` class | ~78 | `OrcFileFragment` | P0 | +| `TestRowGroups()` | ~50 | `TestStripes()` | P0 | +| `FilterRowGroups()` | ~15 | `FilterStripes()` | P0 | +| `TryCountRows()` | ~30 | `OrcTryCountRows()` | P1 | +| `EvaluateStatisticsAsExpression()` | ~80 | `DeriveFieldGuarantee()` | P0 | +| `EnsureCompleteMetadata()` | ~70 | `EnsureFileMetadataCached()` | P0 | +| Statistics caching members | ~10 | Same pattern | P0 | +| Thread safety (mutex) | Throughout | Same pattern | P0 | + +### Key Semantic 
Differences + +| Aspect | Parquet | ORC | Implementation Impact | +|--------|---------|-----|----------------------| +| Unit of filtering | Row Group | Stripe | Terminology only | +| Column indexing | Schema-ordered | Depth-first pre-order (col 0 = root) | Must handle offset | +| Null detection | `null_count = num_values` | `num_values = 0` | Different check | +| Statistics struct | `parquet::Statistics` | liborc statistics types | Different API | +| Manifest source | `parquet::arrow::SchemaManifest` | ORC type tree | Must build custom | + +--- + +## Footguns Checklist + +These edge cases can cause correctness bugs. Address each explicitly: + +### Numeric Types +- [ ] **NaN handling** (float/double): NaN in statistics makes min/max unusable +- [ ] **Signed zero**: -0.0 == +0.0 but may appear differently in stats +- [ ] **Infinity**: +Inf/-Inf are valid min/max values +- [ ] **Overflow**: Statistics computation may overflow for large values +- [ ] **Decimal precision**: Scale/precision must match + +### String/Binary Types +- [ ] **Truncation**: ORC may truncate long strings in statistics +- [ ] **Collation**: String ordering depends on encoding +- [ ] **Empty strings**: "" vs null distinction + +### Temporal Types +- [ ] **Timestamp units**: Seconds vs milliseconds vs microseconds vs nanoseconds +- [ ] **Timezone handling**: UTC vs local time +- [ ] **Date boundaries**: Handling of dates before epoch + +### Null Handling +- [ ] **Three-valued logic**: UNKNOWN != FALSE +- [ ] **All-null columns**: num_values = 0 detection +- [ ] **Null in predicates**: `x = NULL` is UNKNOWN, not FALSE + +### Statistics Reliability +- [ ] **Deprecated statistics**: Old ORC writers had bugs +- [ ] **Missing statistics**: Not all columns have stats +- [ ] **Corrupted statistics**: min > max should be rejected +- [ ] **Empty stripes**: num_rows = 0 edge case + +### Concurrency +- [ ] **Race conditions**: Multiple threads updating cache +- [ ] **Deadlocks**: Lock ordering +- [ ] 
**Idempotency**: Repeated operations must be safe + +--- + +## Key Parquet Code References + +Study these specific locations in the Parquet implementation: + +### ParquetFileFragment Class +**File:** `cpp/src/arrow/dataset/file_parquet.h:158-235` + +Key members to mirror: +```cpp +std::optional> row_groups_; // -> stripes_ +std::vector statistics_expressions_; // -> stripe_guarantees_ +std::vector statistics_expressions_complete_; // -> statistics_complete_ +std::shared_ptr metadata_; // -> OrcFileMetadata +std::shared_ptr manifest_; // -> OrcSchemaManifest +``` + +### TestRowGroups Implementation +**File:** `cpp/src/arrow/dataset/file_parquet.cc:933-983` + +Pattern to follow: +1. Lock mutex +2. Simplify predicate with partition expression +3. Check satisfiability (early exit) +4. Resolve predicate fields +5. For uncached columns: load statistics, derive guarantees +6. For each row group: simplify predicate with guarantee +7. Return per-row-group expressions + +### FilterRowGroups Implementation +**File:** `cpp/src/arrow/dataset/file_parquet.cc:918-931` + +Simple wrapper: +1. Call `TestRowGroups()` +2. Select row groups where expression is satisfiable + +### TryCountRows Implementation +**File:** `cpp/src/arrow/dataset/file_parquet.cc:986-1010` + +Optimization: +1. If no field refs: count = num_rows or 0 +2. Call `TestRowGroups()` +3. Sum row counts for `literal(true)` groups +4. 
Return null if any group is not literal(true/false) + +### Thread Safety Pattern +**File:** `cpp/src/arrow/dataset/file_parquet.cc` + +Locations using `physical_schema_mutex_`: +- Line 798: `metadata()` accessor +- Line 803: `EnsureCompleteMetadata()` +- Line 923: `FilterRowGroups()` +- Line 935: `TestRowGroups()` + +### Test Patterns +**File:** `cpp/src/arrow/dataset/file_parquet_test.cc` + +Key tests to mirror: +- `CountRowsPredicatePushdown` (line 307) +- `PredicatePushdown` (line 639) +- `PredicatePushdownRowGroupFragments` (line 694) +- `CachedMetadata` (line 378) +- `MultithreadedScan` (line 436) + +--- + +## Operating Mode + +When implementing ORC predicate pushdown: + +1. **Default to analyzing** the ORC-related code and comparing against Parquet patterns +2. **Produce structured comparisons** using the framework above +3. **Work autonomously**: identify gaps, propose solutions, validate correctness +4. **Never wait** for explicit direction on what to compare +5. **Always end with actionable steps** + +Your goal is to ensure ORC predicate pushdown achieves a **high-quality, idiomatic implementation** that matches or intentionally diverges from the Parquet reference with clear justification. 
+ +--- + +## Quick Reference: File Locations + +| Purpose | Parquet | ORC | +|---------|---------|-----| +| Header | `cpp/src/arrow/dataset/file_parquet.h` | `cpp/src/arrow/dataset/file_orc.h` | +| Implementation | `cpp/src/arrow/dataset/file_parquet.cc` | `cpp/src/arrow/dataset/file_orc.cc` | +| Tests | `cpp/src/arrow/dataset/file_parquet_test.cc` | `cpp/src/arrow/dataset/file_orc_test.cc` | +| ORC Adapter | - | `cpp/src/arrow/adapters/orc/adapter.h` | +| Specification | - | `orc-predicate-pushdown.allium` | diff --git a/QUICK-START.md b/QUICK-START.md new file mode 100644 index 000000000000..ac8310756c10 --- /dev/null +++ b/QUICK-START.md @@ -0,0 +1,140 @@ +# Quick Start for Next Agent + +## CRITICAL: Read IMPLEMENTATION-GUIDE.md First + +Before starting any implementation work, read `IMPLEMENTATION-GUIDE.md` which defines: +- How to use Parquet as a reference (inspiration, not modification) +- The comparison framework for ensuring quality +- Required outputs for each implementation session +- Non-negotiable constraints + +## Task 0: Extend ORC Adapter + +**Goal:** Add column statistics access to `cpp/src/arrow/adapters/orc/adapter.h` + +**What to add:** +```cpp +// In adapter.h: +struct ColumnStatistics { + bool has_null; + int64_t num_values; + bool has_minimum; + bool has_maximum; + std::shared_ptr minimum; + std::shared_ptr maximum; + bool is_statistics_deprecated; +}; + +Result GetStripeColumnStatistics( + int64_t stripe, int64_t column); +``` + +**Implementation steps:** +1. Study liborc API for statistics access +2. Add struct and method declaration to adapter.h +3. Implement in adapter.cc (access liborc Reader's statistics) +4. Convert ORC statistics to Arrow format +5. Write unit test verifying statistics for int32/int64 columns + +**Verification:** +```bash +cmake --build . 
--target arrow_orc +ctest -R orc # All ORC tests should pass +``` + +## After Task 0: Phase 1 (Tasks 1-5) + +### Task 1: OrcSchemaManifest structures +- File: `file_orc.h` +- Add OrcSchemaManifest and OrcSchemaField classes +- Similar to Parquet's SchemaManifest + +### Task 2: BuildOrcSchemaManifest +- File: `file_orc.cc` +- Walk Arrow schema + ORC type tree +- Extract column indices from type tree + +### Task 3: GetOrcColumnIndex +- File: `file_orc.cc` +- Resolve FieldRef -> ORC column index +- Handle nested fields + +### Task 4: OrcFileFragment +- Files: `file_orc.h`, `file_orc.cc` +- Extend FileFragment with ORC-specific fields +- Add: metadata, manifest, statistics_cache + +### Task 5: StripeStatisticsCache +- File: `file_orc.cc` +- Cache structure with stripe_guarantees +- Thread-safe with mutex + +## Key Files + +- **Implementation Guide:** `IMPLEMENTATION-GUIDE.md` (READ FIRST) +- **Task list:** `task_list.json` (36 tasks) +- **Specification:** `orc-predicate-pushdown.allium` +- **Parquet Reference:** `cpp/src/arrow/dataset/file_parquet.cc` + +## Build Commands + +```bash +# Configure (if needed) +cmake -S . -B build -DARROW_ORC=ON -DARROW_DATASET=ON + +# Build ORC adapter +cmake --build build --target arrow_orc + +# Build dataset module +cmake --build build --target arrow_dataset + +# Run tests +ctest --test-dir build -R orc +``` + +## Getting Unstuck + +1. **Can't access ORC statistics?** + - Check liborc documentation: `orc/Reader.hh` + - Look at existing adapter.cc for patterns + - Alternative: access liborc directly from file_orc.cc + +2. **Don't understand expression simplification?** + - Study `file_parquet.cc` TestRowGroups function + - Read Arrow compute expression docs + - Start with simple case: literal true/false + +3. **Thread safety confusion?** + - Follow Parquet pattern: physical_schema_mutex_ + - Protect all cache reads/writes + - Make operations idempotent + +4. 
**Tests failing?** + - Start with simplest test (single field, int32, >) + - Hand-craft ORC file with known statistics + - Verify stripe filtering manually + +## Testing Strategy + +1. **Unit tests** (per task) + - Test each function in isolation + - Mock/stub dependencies + - Cover edge cases + +2. **Integration tests** (after Task 20) + - End-to-end: create ORC file -> filter -> verify results + - Measure I/O reduction + - Test with various predicates + +3. **Performance benchmarks** (Task 33) + - Compare to baseline (no filtering) + - Measure cache benefit + - Compare to Parquet performance + +## Success = All 36 Tasks Complete + +Check `task_list.json` regularly. Mark tasks "complete" only when fully verified. + +--- + +**Start here:** Read `IMPLEMENTATION-GUIDE.md` -> Task 0 -> ORC adapter statistics APIs diff --git a/c_glib/arrow-glib/basic-array.cpp b/c_glib/arrow-glib/basic-array.cpp index cf6e94738e75..bf5bf60d006d 100644 --- a/c_glib/arrow-glib/basic-array.cpp +++ b/c_glib/arrow-glib/basic-array.cpp @@ -1114,7 +1114,11 @@ garrow_array_get_null_bitmap(GArrowArray *array) auto arrow_array = garrow_array_get_raw(array); auto arrow_null_bitmap = arrow_array->null_bitmap(); - return garrow_buffer_new_raw(&arrow_null_bitmap); + if (arrow_null_bitmap) { + return garrow_buffer_new_raw(&arrow_null_bitmap); + } else { + return nullptr; + } } /** diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp index 9b77e87422d5..87c5eed530f3 100644 --- a/c_glib/arrow-glib/basic-data-type.cpp +++ b/c_glib/arrow-glib/basic-data-type.cpp @@ -1165,13 +1165,13 @@ GArrowTimestampDataType * garrow_timestamp_data_type_new(GArrowTimeUnit unit, GTimeZone *time_zone) { auto arrow_unit = garrow_time_unit_to_raw(unit); - std::string arrow_timezone; + std::string arrow_time_zone; #if GLIB_CHECK_VERSION(2, 58, 0) if (time_zone) { - arrow_timezone = g_time_zone_get_identifier(time_zone); + arrow_time_zone = g_time_zone_get_identifier(time_zone); } #endif 
- auto arrow_data_type = arrow::timestamp(arrow_unit, arrow_timezone); + auto arrow_data_type = arrow::timestamp(arrow_unit, arrow_time_zone); auto data_type = GARROW_TIMESTAMP_DATA_TYPE(g_object_new(GARROW_TYPE_TIMESTAMP_DATA_TYPE, "data-type", @@ -2645,6 +2645,28 @@ garrow_data_type_new_raw(std::shared_ptr *arrow_data_type) break; case arrow::Type::type::TIMESTAMP: type = GARROW_TYPE_TIMESTAMP_DATA_TYPE; + { + auto arrow_timestamp_data_type = + std::static_pointer_cast(*arrow_data_type); + const auto &arrow_time_zone = arrow_timestamp_data_type->timezone(); + if (!arrow_time_zone.empty()) { +#if GLIB_CHECK_VERSION(2, 68, 0) + auto time_zone = g_time_zone_new_identifier(arrow_time_zone.c_str()); +#else + auto time_zone = g_time_zone_new(arrow_time_zone.c_str()); +#endif + data_type = GARROW_DATA_TYPE(g_object_new(type, + "data-type", + arrow_data_type, + "time-zone", + time_zone, + nullptr)); + if (time_zone) { + g_time_zone_unref(time_zone); + } + return data_type; + } + } break; case arrow::Type::type::TIME32: type = GARROW_TYPE_TIME32_DATA_TYPE; diff --git a/c_glib/arrow-glib/composite-array.cpp b/c_glib/arrow-glib/composite-array.cpp index 9bc53264b729..4f31a599f510 100644 --- a/c_glib/arrow-glib/composite-array.cpp +++ b/c_glib/arrow-glib/composite-array.cpp @@ -188,6 +188,22 @@ garrow_base_list_array_get_value_offsets(GArrowArray *array, gint64 *n_offsets) return arrow_list_array->raw_value_offsets(); }; +template +GArrowBuffer * +garrow_base_list_array_get_value_offsets_buffer(GArrowArray *array) +{ + GArrowBuffer *buffer = nullptr; + g_object_get(array, "buffer1", &buffer, nullptr); + if (buffer) { + return buffer; + } + + auto arrow_array = garrow_array_get_raw(array); + auto arrow_list_array = std::static_pointer_cast(arrow_array); + auto arrow_buffer = arrow_list_array->value_offsets(); + return garrow_buffer_new_raw(&arrow_buffer); +}; + G_BEGIN_DECLS static void @@ -385,6 +401,21 @@ garrow_list_array_get_value_offsets(GArrowListArray *array, gint64 
*n_offsets) n_offsets); } +/** + * garrow_list_array_get_value_offsets_buffer: + * @array: A #GArrowListArray. + * + * Returns: (transfer full) (nullable): The value offsets buffer. + * + * Since: 24.0.0 + */ +GArrowBuffer * +garrow_list_array_get_value_offsets_buffer(GArrowListArray *array) +{ + return garrow_base_list_array_get_value_offsets_buffer( + GARROW_ARRAY(array)); +} + typedef struct GArrowLargeListArrayPrivate_ { GArrowArray *raw_values; @@ -602,6 +633,21 @@ garrow_large_list_array_get_value_offsets(GArrowLargeListArray *array, gint64 *n return reinterpret_cast(value_offsets); } +/** + * garrow_large_list_array_get_value_offsets_buffer: + * @array: A #GArrowLargeListArray. + * + * Returns: (transfer full) (nullable): The value offsets buffer. + * + * Since: 24.0.0 + */ +GArrowBuffer * +garrow_large_list_array_get_value_offsets_buffer(GArrowLargeListArray *array) +{ + return garrow_base_list_array_get_value_offsets_buffer( + GARROW_ARRAY(array)); +} + typedef struct GArrowFixedSizeListArrayPrivate_ { GArrowArray *raw_values; @@ -1415,6 +1461,21 @@ garrow_union_array_get_field(GArrowUnionArray *array, gint i) return field; } +/** + * garrow_union_array_get_n_fields + * @array: A #GArrowUnionArray. + * + * Returns: The number of fields. 
+ * + * Since: 24.0.0 + */ +gint +garrow_union_array_get_n_fields(GArrowUnionArray *array) +{ + auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); + return arrow_array->num_fields(); +} + G_DEFINE_TYPE(GArrowSparseUnionArray, garrow_sparse_union_array, GARROW_TYPE_UNION_ARRAY) static void diff --git a/c_glib/arrow-glib/composite-array.h b/c_glib/arrow-glib/composite-array.h index 117ffdf70797..930bb813acd7 100644 --- a/c_glib/arrow-glib/composite-array.h +++ b/c_glib/arrow-glib/composite-array.h @@ -68,6 +68,10 @@ GARROW_AVAILABLE_IN_2_0 const gint32 * garrow_list_array_get_value_offsets(GArrowListArray *array, gint64 *n_offsets); +GARROW_AVAILABLE_IN_24_0 +GArrowBuffer * +garrow_list_array_get_value_offsets_buffer(GArrowListArray *array); + #define GARROW_TYPE_LARGE_LIST_ARRAY (garrow_large_list_array_get_type()) GARROW_AVAILABLE_IN_0_16 G_DECLARE_DERIVABLE_TYPE( @@ -110,6 +114,10 @@ GARROW_AVAILABLE_IN_2_0 const gint64 * garrow_large_list_array_get_value_offsets(GArrowLargeListArray *array, gint64 *n_offsets); +GARROW_AVAILABLE_IN_24_0 +GArrowBuffer * +garrow_large_list_array_get_value_offsets_buffer(GArrowLargeListArray *array); + #define GARROW_TYPE_FIXED_SIZE_LIST_ARRAY (garrow_fixed_size_list_array_get_type()) GARROW_AVAILABLE_IN_23_0 G_DECLARE_DERIVABLE_TYPE(GArrowFixedSizeListArray, @@ -228,6 +236,10 @@ GARROW_AVAILABLE_IN_ALL GArrowArray * garrow_union_array_get_field(GArrowUnionArray *array, gint i); +GARROW_AVAILABLE_IN_24_0 +gint +garrow_union_array_get_n_fields(GArrowUnionArray *array); + #define GARROW_TYPE_SPARSE_UNION_ARRAY (garrow_sparse_union_array_get_type()) GARROW_AVAILABLE_IN_ALL G_DECLARE_DERIVABLE_TYPE(GArrowSparseUnionArray, diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index ca43d4e0f179..745d3e567e47 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -1274,9 +1274,71 @@ garrow_source_node_options_new_table(GArrowTable *table) return options; } 
-G_DEFINE_TYPE(GArrowFilterNodeOptions, - garrow_filter_node_options, - GARROW_TYPE_EXECUTE_NODE_OPTIONS) +enum { + PROP_EXPRESSION = 1, +}; + +struct GArrowFilterNodeOptionsPrivate +{ + GArrowExpression *expression; +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowFilterNodeOptions, + garrow_filter_node_options, + GARROW_TYPE_EXECUTE_NODE_OPTIONS) + +#define GARROW_FILTER_NODE_OPTIONS_GET_PRIVATE(object) \ + static_cast( \ + garrow_filter_node_options_get_instance_private(GARROW_FILTER_NODE_OPTIONS(object))) + +static void +garrow_filter_node_options_dispose(GObject *object) +{ + auto priv = GARROW_FILTER_NODE_OPTIONS_GET_PRIVATE(object); + + if (priv->expression) { + g_object_unref(priv->expression); + priv->expression = nullptr; + } + + G_OBJECT_CLASS(garrow_filter_node_options_parent_class)->dispose(object); +} + +static void +garrow_filter_node_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_FILTER_NODE_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_EXPRESSION: + priv->expression = GARROW_EXPRESSION(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_filter_node_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_FILTER_NODE_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_EXPRESSION: + g_value_set_object(value, priv->expression); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} static void garrow_filter_node_options_init(GArrowFilterNodeOptions *object) @@ -1286,6 +1348,28 @@ garrow_filter_node_options_init(GArrowFilterNodeOptions *object) static void garrow_filter_node_options_class_init(GArrowFilterNodeOptionsClass *klass) { + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = garrow_filter_node_options_dispose; + 
gobject_class->set_property = garrow_filter_node_options_set_property; + gobject_class->get_property = garrow_filter_node_options_get_property; + + GParamSpec *spec; + + /** + * GArrowFilterNodeOptions:expression: + * + * The expression of this filter. + * + * Since: 24.0.0 + */ + spec = g_param_spec_object( + "expression", + "Expression", + "The expression of this filter", + GARROW_TYPE_EXPRESSION, + static_cast(G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_EXPRESSION, spec); } /** @@ -1301,14 +1385,39 @@ garrow_filter_node_options_new(GArrowExpression *expression) { auto arrow_expression = garrow_expression_get_raw(expression); auto arrow_options = new arrow::acero::FilterNodeOptions(*arrow_expression); - auto options = - g_object_new(GARROW_TYPE_FILTER_NODE_OPTIONS, "options", arrow_options, NULL); + auto options = g_object_new(GARROW_TYPE_FILTER_NODE_OPTIONS, + "options", + arrow_options, + "expression", + expression, + nullptr); return GARROW_FILTER_NODE_OPTIONS(options); } -G_DEFINE_TYPE(GArrowProjectNodeOptions, - garrow_project_node_options, - GARROW_TYPE_EXECUTE_NODE_OPTIONS) +struct GArrowProjectNodeOptionsPrivate +{ + GList *expressions; +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowProjectNodeOptions, + garrow_project_node_options, + GARROW_TYPE_EXECUTE_NODE_OPTIONS) + +#define GARROW_PROJECT_NODE_OPTIONS_GET_PRIVATE(object) \ + static_cast( \ + garrow_project_node_options_get_instance_private( \ + GARROW_PROJECT_NODE_OPTIONS(object))) + +static void +garrow_project_node_options_dispose(GObject *object) +{ + auto priv = GARROW_PROJECT_NODE_OPTIONS_GET_PRIVATE(object); + + g_list_free_full(priv->expressions, g_object_unref); + priv->expressions = nullptr; + + G_OBJECT_CLASS(garrow_project_node_options_parent_class)->dispose(object); +} static void garrow_project_node_options_init(GArrowProjectNodeOptions *object) @@ -1318,6 +1427,9 @@ garrow_project_node_options_init(GArrowProjectNodeOptions *object) static 
void garrow_project_node_options_class_init(GArrowProjectNodeOptionsClass *klass) { + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = garrow_project_node_options_dispose; } /** @@ -1354,9 +1466,28 @@ garrow_project_node_options_new(GList *expressions, gchar **names, gsize n_names new arrow::acero::ProjectNodeOptions(arrow_expressions, arrow_names); auto options = g_object_new(GARROW_TYPE_PROJECT_NODE_OPTIONS, "options", arrow_options, NULL); + auto priv = GARROW_PROJECT_NODE_OPTIONS_GET_PRIVATE(options); + priv->expressions = + g_list_copy_deep(expressions, reinterpret_cast(g_object_ref), nullptr); return GARROW_PROJECT_NODE_OPTIONS(options); } +/** + * garrow_project_node_options_get_expressions: + * @options: A #GArrowProjectNodeOptions. + * + * Returns: (transfer none) (element-type GArrowExpression): Expressions + * of the @options. + * + * Since: 24.0.0 + */ +GList * +garrow_project_node_options_get_expressions(GArrowProjectNodeOptions *options) +{ + auto priv = GARROW_PROJECT_NODE_OPTIONS_GET_PRIVATE(options); + return priv->expressions; +} + typedef struct GArrowAggregationPrivate_ { gchar *function; @@ -1558,9 +1689,28 @@ garrow_aggregation_new(const gchar *function, NULL)); } -G_DEFINE_TYPE(GArrowAggregateNodeOptions, - garrow_aggregate_node_options, - GARROW_TYPE_EXECUTE_NODE_OPTIONS) +struct GArrowAggregateNodeOptionsPrivate +{ + GList *aggregations; +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowAggregateNodeOptions, + garrow_aggregate_node_options, + GARROW_TYPE_EXECUTE_NODE_OPTIONS) + +#define GARROW_AGGREGATE_NODE_OPTIONS_GET_PRIVATE(object) \ + static_cast( \ + garrow_aggregate_node_options_get_instance_private( \ + GARROW_AGGREGATE_NODE_OPTIONS(object))) + +static void +garrow_aggregate_node_options_dispose(GObject *object) +{ + auto priv = GARROW_AGGREGATE_NODE_OPTIONS_GET_PRIVATE(object); + g_list_free_full(priv->aggregations, g_object_unref); + priv->aggregations = nullptr; + 
G_OBJECT_CLASS(garrow_aggregate_node_options_parent_class)->dispose(object); +} static void garrow_aggregate_node_options_init(GArrowAggregateNodeOptions *object) @@ -1570,6 +1720,9 @@ garrow_aggregate_node_options_init(GArrowAggregateNodeOptions *object) static void garrow_aggregate_node_options_class_init(GArrowAggregateNodeOptionsClass *klass) { + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = garrow_aggregate_node_options_dispose; } /** @@ -1623,10 +1776,29 @@ garrow_aggregate_node_options_new(GList *aggregations, auto arrow_options = new arrow::acero::AggregateNodeOptions(std::move(arrow_aggregates), std::move(arrow_keys)); auto options = - g_object_new(GARROW_TYPE_AGGREGATE_NODE_OPTIONS, "options", arrow_options, NULL); + g_object_new(GARROW_TYPE_AGGREGATE_NODE_OPTIONS, "options", arrow_options, nullptr); + auto priv = GARROW_AGGREGATE_NODE_OPTIONS_GET_PRIVATE(options); + priv->aggregations = + g_list_copy_deep(aggregations, reinterpret_cast(g_object_ref), nullptr); return GARROW_AGGREGATE_NODE_OPTIONS(options); } +/** + * garrow_aggregate_node_options_get_aggregations: + * @options: A #GArrowAggregateNodeOptions. + * + * Returns: (transfer none) (element-type GArrowAggregation): Aggregations + * of the @options. 
+ * + * Since: 24.0.0 + */ +GList * +garrow_aggregate_node_options_get_aggregations(GArrowAggregateNodeOptions *options) +{ + auto priv = GARROW_AGGREGATE_NODE_OPTIONS_GET_PRIVATE(options); + return priv->aggregations; +} + typedef struct GArrowSinkNodeOptionsPrivate_ { arrow::AsyncGenerator> generator; diff --git a/c_glib/arrow-glib/compute.h b/c_glib/arrow-glib/compute.h index ff2d0d29956d..2f4153676d45 100644 --- a/c_glib/arrow-glib/compute.h +++ b/c_glib/arrow-glib/compute.h @@ -183,6 +183,10 @@ GARROW_AVAILABLE_IN_11_0 GArrowProjectNodeOptions * garrow_project_node_options_new(GList *expressions, gchar **names, gsize n_names); +GARROW_AVAILABLE_IN_24_0 +GList * +garrow_project_node_options_get_expressions(GArrowProjectNodeOptions *options); + #define GARROW_TYPE_AGGREGATION (garrow_aggregation_get_type()) GARROW_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE( @@ -218,6 +222,10 @@ garrow_aggregate_node_options_new(GList *aggregations, gsize n_keys, GError **error); +GARROW_AVAILABLE_IN_24_0 +GList * +garrow_aggregate_node_options_get_aggregations(GArrowAggregateNodeOptions *options); + #define GARROW_TYPE_SINK_NODE_OPTIONS (garrow_sink_node_options_get_type()) GARROW_AVAILABLE_IN_6_0 G_DECLARE_DERIVABLE_TYPE(GArrowSinkNodeOptions, diff --git a/c_glib/arrow-glib/expression.cpp b/c_glib/arrow-glib/expression.cpp index 9be8e1f68bc1..84cc3ace467c 100644 --- a/c_glib/arrow-glib/expression.cpp +++ b/c_glib/arrow-glib/expression.cpp @@ -42,10 +42,14 @@ G_BEGIN_DECLS * Since: 6.0.0 */ -typedef struct GArrowExpressionPrivate_ +enum { + PROP_EXPRESSION = 1, +}; + +struct GArrowExpressionPrivate { arrow::compute::Expression expression; -} GArrowExpressionPrivate; +}; G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GArrowExpression, garrow_expression, G_TYPE_OBJECT) @@ -61,6 +65,25 @@ garrow_expression_finalize(GObject *object) G_OBJECT_CLASS(garrow_expression_parent_class)->finalize(object); } +static void +garrow_expression_set_property(GObject *object, + guint prop_id, + const GValue 
*value, + GParamSpec *pspec) +{ + auto priv = GARROW_EXPRESSION_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_EXPRESSION: + priv->expression = + *static_cast(g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + static void garrow_expression_init(GArrowExpression *object) { @@ -74,6 +97,15 @@ garrow_expression_class_init(GArrowExpressionClass *klass) auto gobject_class = G_OBJECT_CLASS(klass); gobject_class->finalize = garrow_expression_finalize; + gobject_class->set_property = garrow_expression_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer( + "expression", + "Expression", + "The raw arrow::compute::Expression *", + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_EXPRESSION, spec); } /** @@ -112,7 +144,71 @@ garrow_expression_equal(GArrowExpression *expression, GArrowExpression *other_ex return priv->expression.Equals(other_priv->expression); } -G_DEFINE_TYPE(GArrowLiteralExpression, garrow_literal_expression, GARROW_TYPE_EXPRESSION) +enum { + PROP_DATUM = 1, +}; + +struct GArrowLiteralExpressionPrivate +{ + GArrowDatum *datum; +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowLiteralExpression, + garrow_literal_expression, + GARROW_TYPE_EXPRESSION) + +#define GARROW_LITERAL_EXPRESSION_GET_PRIVATE(object) \ + static_cast( \ + garrow_literal_expression_get_instance_private(GARROW_LITERAL_EXPRESSION(object))) + +static void +garrow_literal_expression_dispose(GObject *object) +{ + auto priv = GARROW_LITERAL_EXPRESSION_GET_PRIVATE(object); + + if (priv->datum) { + g_object_unref(priv->datum); + priv->datum = nullptr; + } + + G_OBJECT_CLASS(garrow_literal_expression_parent_class)->dispose(object); +} + +static void +garrow_literal_expression_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_LITERAL_EXPRESSION_GET_PRIVATE(object); + + switch (prop_id) { 
+ case PROP_DATUM: + priv->datum = GARROW_DATUM(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_literal_expression_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_LITERAL_EXPRESSION_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DATUM: + g_value_set_object(value, priv->datum); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} static void garrow_literal_expression_init(GArrowLiteralExpression *object) @@ -122,6 +218,28 @@ garrow_literal_expression_init(GArrowLiteralExpression *object) static void garrow_literal_expression_class_init(GArrowLiteralExpressionClass *klass) { + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = garrow_literal_expression_dispose; + gobject_class->set_property = garrow_literal_expression_set_property; + gobject_class->get_property = garrow_literal_expression_get_property; + + GParamSpec *spec; + + /** + * GArrowLiteralExpression:datum: + * + * The datum of this literal. 
+ * + * Since: 24.0.0 + */ + spec = g_param_spec_object( + "datum", + "Datum", + "The datum of this literal", + GARROW_TYPE_DATUM, + static_cast(G_PARAM_READWRITE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DATUM, spec); } /** @@ -137,7 +255,12 @@ garrow_literal_expression_new(GArrowDatum *datum) { auto arrow_datum = garrow_datum_get_raw(datum); auto arrow_expression = arrow::compute::literal(arrow_datum); - return GARROW_LITERAL_EXPRESSION(garrow_expression_new_raw(arrow_expression)); + return GARROW_LITERAL_EXPRESSION(garrow_expression_new_raw(arrow_expression, + "expression", + &arrow_expression, + "datum", + datum, + nullptr)); } G_DEFINE_TYPE(GArrowFieldExpression, garrow_field_expression, GARROW_TYPE_EXPRESSION) @@ -173,7 +296,29 @@ garrow_field_expression_new(const gchar *reference, GError **error) return GARROW_FIELD_EXPRESSION(garrow_expression_new_raw(arrow_expression)); } -G_DEFINE_TYPE(GArrowCallExpression, garrow_call_expression, GARROW_TYPE_EXPRESSION) +struct GArrowCallExpressionPrivate +{ + GList *arguments; +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowCallExpression, + garrow_call_expression, + GARROW_TYPE_EXPRESSION) + +#define GARROW_CALL_EXPRESSION_GET_PRIVATE(object) \ + static_cast( \ + garrow_call_expression_get_instance_private(GARROW_CALL_EXPRESSION(object))) + +static void +garrow_call_expression_dispose(GObject *object) +{ + auto priv = GARROW_CALL_EXPRESSION_GET_PRIVATE(object); + + g_list_free_full(priv->arguments, g_object_unref); + priv->arguments = nullptr; + + G_OBJECT_CLASS(garrow_call_expression_parent_class)->dispose(object); +} static void garrow_call_expression_init(GArrowCallExpression *object) @@ -183,6 +328,9 @@ garrow_call_expression_init(GArrowCallExpression *object) static void garrow_call_expression_class_init(GArrowCallExpressionClass *klass) { + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = garrow_call_expression_dispose; } /** @@ -211,13 +359,57 @@ 
garrow_call_expression_new(const gchar *function, arrow_options.reset(garrow_function_options_get_raw(options)->Copy().release()); } auto arrow_expression = arrow::compute::call(function, arrow_arguments, arrow_options); - return GARROW_CALL_EXPRESSION(garrow_expression_new_raw(arrow_expression)); + auto expression = GARROW_CALL_EXPRESSION(garrow_expression_new_raw(arrow_expression)); + auto priv = GARROW_CALL_EXPRESSION_GET_PRIVATE(expression); + priv->arguments = + g_list_copy_deep(arguments, reinterpret_cast(g_object_ref), nullptr); + return expression; +} + +/** + * garrow_call_expression_get_arguments: + * @expression: A #GArrowCallExpression. + * + * Returns: (transfer none) (element-type GArrowExpression): Arguments + * of this expression. + * + * Since: 24.0.0 + */ +GList * +garrow_call_expression_get_arguments(GArrowCallExpression *expression) +{ + auto priv = GARROW_CALL_EXPRESSION_GET_PRIVATE(expression); + return priv->arguments; } G_END_DECLS GArrowExpression * garrow_expression_new_raw(const arrow::compute::Expression &arrow_expression) +{ + return garrow_expression_new_raw(arrow_expression, + "expression", + &arrow_expression, + nullptr); +} + +GArrowExpression * +garrow_expression_new_raw(const arrow::compute::Expression &arrow_expression, + const gchar *first_property_name, + ...) 
+{ + va_list args; + va_start(args, first_property_name); + auto array = + garrow_expression_new_raw_valist(arrow_expression, first_property_name, args); + va_end(args); + return array; +} + +GArrowExpression * +garrow_expression_new_raw_valist(const arrow::compute::Expression &arrow_expression, + const gchar *first_property_name, + va_list args) { GType gtype = GARROW_TYPE_EXPRESSION; if (arrow_expression.literal()) { @@ -227,10 +419,7 @@ garrow_expression_new_raw(const arrow::compute::Expression &arrow_expression) } else if (arrow_expression.call()) { gtype = GARROW_TYPE_CALL_EXPRESSION; } - auto expression = GARROW_EXPRESSION(g_object_new(gtype, NULL)); - auto priv = GARROW_EXPRESSION_GET_PRIVATE(expression); - priv->expression = arrow_expression; - return expression; + return GARROW_EXPRESSION(g_object_new_valist(gtype, first_property_name, args)); } arrow::compute::Expression * diff --git a/c_glib/arrow-glib/expression.h b/c_glib/arrow-glib/expression.h index 5a6bfb456fc6..e690aa41b865 100644 --- a/c_glib/arrow-glib/expression.h +++ b/c_glib/arrow-glib/expression.h @@ -76,5 +76,8 @@ GArrowCallExpression * garrow_call_expression_new(const gchar *function, GList *arguments, GArrowFunctionOptions *options); +GARROW_AVAILABLE_IN_24_0 +GList * +garrow_call_expression_get_arguments(GArrowCallExpression *expression); G_END_DECLS diff --git a/c_glib/arrow-glib/expression.hpp b/c_glib/arrow-glib/expression.hpp index cc96badbe67a..90606a6fb31c 100644 --- a/c_glib/arrow-glib/expression.hpp +++ b/c_glib/arrow-glib/expression.hpp @@ -27,6 +27,18 @@ GARROW_EXTERN GArrowExpression * garrow_expression_new_raw(const arrow::compute::Expression &arrow_expression); +GARROW_EXTERN +GArrowExpression * +garrow_expression_new_raw(const arrow::compute::Expression &arrow_expression, + const gchar *first_property_name, + ...); + +GARROW_EXTERN +GArrowExpression * +garrow_expression_new_raw_valist(const arrow::compute::Expression &arrow_expression, + const gchar *first_property_name, + 
va_list args); + GARROW_EXTERN arrow::compute::Expression * garrow_expression_get_raw(GArrowExpression *expression); diff --git a/c_glib/parquet-glib/arrow-file-reader.cpp b/c_glib/parquet-glib/arrow-file-reader.cpp index 7c7d20291a54..86bf284d1236 100644 --- a/c_glib/parquet-glib/arrow-file-reader.cpp +++ b/c_glib/parquet-glib/arrow-file-reader.cpp @@ -246,8 +246,7 @@ gparquet_arrow_file_reader_read_row_group(GParquetArrowFileReader *reader, { const gchar *tag = "[parquet][arrow][file-reader][read-row-group]"; auto parquet_arrow_file_reader = gparquet_arrow_file_reader_get_raw(reader); - std::shared_ptr arrow_table; - arrow::Status status; + arrow::Result> arrow_table_result; if (column_indices) { const auto n_columns = parquet_arrow_file_reader->parquet_reader()->metadata()->num_columns(); @@ -268,14 +267,13 @@ gparquet_arrow_file_reader_read_row_group(GParquetArrowFileReader *reader, } parquet_column_indices.push_back(column_index); } - status = parquet_arrow_file_reader->ReadRowGroup(row_group_index, - parquet_column_indices, - &arrow_table); + arrow_table_result = + parquet_arrow_file_reader->ReadRowGroup(row_group_index, parquet_column_indices); } else { - status = parquet_arrow_file_reader->ReadRowGroup(row_group_index, &arrow_table); + arrow_table_result = parquet_arrow_file_reader->ReadRowGroup(row_group_index); } - if (garrow_error_check(error, status, tag)) { - return garrow_table_new_raw(&arrow_table); + if (garrow::check(error, arrow_table_result, tag)) { + return garrow_table_new_raw(&(*arrow_table_result)); } else { return NULL; } diff --git a/c_glib/vcpkg.json b/c_glib/vcpkg.json index a9b1593cebe4..e640c9044bd5 100644 --- a/c_glib/vcpkg.json +++ b/c_glib/vcpkg.json @@ -7,5 +7,5 @@ "pkgconf" ], "$comment": "We can update builtin-baseline by 'vcpkg x-update-baseline'", - "builtin-baseline": "09f6a4ef2f08252f7f4d924fd9c2d42165fb21c9" + "builtin-baseline": "40c89449f0ccce12d21f8a906639f6c2c649b9e7" } diff --git a/ci/conan/all/conanfile.py 
b/ci/conan/all/conanfile.py index 7dab8c82f699..b9999e50050d 100644 --- a/ci/conan/all/conanfile.py +++ b/ci/conan/all/conanfile.py @@ -207,9 +207,9 @@ def requirements(self): self.requires("lz4/1.9.4") if self.options.with_snappy: self.requires("snappy/1.1.9") - if self.options.get_safe("simd_level") != None or \ - self.options.get_safe("runtime_simd_level") != None: - self.requires("xsimd/13.0.0") + if self.options.get_safe("simd_level") is not None or \ + self.options.get_safe("runtime_simd_level") is not None: + self.requires("xsimd/14.0.0") if self.options.with_zlib: self.requires("zlib/[>=1.2.11 <2]") if self.options.with_zstd: diff --git a/ci/conda_env_cpp.txt b/ci/conda_env_cpp.txt index fec8488f954e..470db4f8b9da 100644 --- a/ci/conda_env_cpp.txt +++ b/ci/conda_env_cpp.txt @@ -21,7 +21,7 @@ azure-identity-cpp>=1.6.0 azure-storage-blobs-cpp>=12.10.0 azure-storage-common-cpp>=12.5.0 azure-storage-files-datalake-cpp>=12.9.0 -benchmark>=1.6.0,!=1.8.4 +benchmark>=1.6.0,!=1.8.4,<1.9.5 brotli bzip2 c-ares @@ -47,6 +47,6 @@ rapidjson re2 snappy thrift-cpp>=0.11.0 -xsimd +xsimd>=14.0 zlib zstd diff --git a/ci/conda_env_python.txt b/ci/conda_env_python.txt index eddba95a11ff..33ac193f86e8 100644 --- a/ci/conda_env_python.txt +++ b/ci/conda_env_python.txt @@ -24,6 +24,7 @@ cython>=3.1 cloudpickle fsspec hypothesis +libcst>=1.8.6 numpy>=1.16.6 pytest pytest-faulthandler diff --git a/ci/docker/alpine-linux-3.22-cpp.dockerfile b/ci/docker/alpine-linux-3.22-cpp.dockerfile index 48907e61a4a6..c3a2a58ef959 100644 --- a/ci/docker/alpine-linux-3.22-cpp.dockerfile +++ b/ci/docker/alpine-linux-3.22-cpp.dockerfile @@ -64,7 +64,6 @@ RUN apk add \ thrift-dev \ tzdata \ utf8proc-dev \ - xsimd-dev \ zlib-dev \ zstd-dev && \ rm -rf /var/cache/apk/* && \ @@ -103,4 +102,5 @@ ENV ARROW_ACERO=ON \ AWSSDK_SOURCE=BUNDLED \ google_cloud_cpp_storage_SOURCE=BUNDLED \ MUSL_LOCPATH=/usr/share/i18n/locales/musl \ - PATH=/usr/lib/ccache/bin:$PATH + PATH=/usr/lib/ccache/bin:$PATH \ + 
xsimd_SOURCE=BUNDLED diff --git a/ci/docker/cpp-jni.dockerfile b/ci/docker/cpp-jni.dockerfile index f268de12ca35..91508089c422 100644 --- a/ci/docker/cpp-jni.dockerfile +++ b/ci/docker/cpp-jni.dockerfile @@ -29,6 +29,7 @@ RUN dnf install -y \ gdb \ git \ perl-IPC-Cmd \ + perl-Time-Piece \ wget \ zip diff --git a/ci/docker/debian-12-cpp.dockerfile b/ci/docker/debian-12-cpp.dockerfile deleted file mode 100644 index 44c845bb17ef..000000000000 --- a/ci/docker/debian-12-cpp.dockerfile +++ /dev/null @@ -1,149 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -ARG arch=amd64 -FROM ${arch}/debian:12 -ARG arch - -ENV DEBIAN_FRONTEND noninteractive - -ARG llvm -RUN apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - apt-transport-https \ - ca-certificates \ - gnupg \ - lsb-release \ - wget && \ - if [ ${llvm} -ge 17 ]; then \ - wget -O /usr/share/keyrings/llvm-snapshot.asc \ - https://apt.llvm.org/llvm-snapshot.gpg.key && \ - (echo "Types: deb"; \ - echo "URIs: https://apt.llvm.org/$(lsb_release --codename --short)/"; \ - echo "Suites: llvm-toolchain-$(lsb_release --codename --short)-${llvm}"; \ - echo "Components: main"; \ - echo "Signed-By: /usr/share/keyrings/llvm-snapshot.asc") | \ - tee /etc/apt/sources.list.d/llvm.sources; \ - fi && \ - apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - autoconf \ - ccache \ - clang-${llvm} \ - cmake \ - curl \ - g++ \ - gcc \ - gdb \ - git \ - libbenchmark-dev \ - libboost-filesystem-dev \ - libboost-system-dev \ - libbrotli-dev \ - libbz2-dev \ - libc-ares-dev \ - libcurl4-openssl-dev \ - libgflags-dev \ - libgmock-dev \ - libgoogle-glog-dev \ - libgrpc++-dev \ - libidn2-dev \ - libkrb5-dev \ - libldap-dev \ - liblz4-dev \ - libnghttp2-dev \ - libprotobuf-dev \ - libprotoc-dev \ - libpsl-dev \ - libre2-dev \ - librtmp-dev \ - libsnappy-dev \ - libsqlite3-dev \ - libssh-dev \ - libssh2-1-dev \ - libssl-dev \ - libthrift-dev \ - libutf8proc-dev \ - libxml2-dev \ - libzstd-dev \ - llvm-${llvm}-dev \ - make \ - ninja-build \ - nlohmann-json3-dev \ - npm \ - patch \ - pkg-config \ - protobuf-compiler-grpc \ - python3-dev \ - python3-pip \ - python3-venv \ - rapidjson-dev \ - rsync \ - tzdata \ - zlib1g-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_minio.sh latest /usr/local - -COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_gcs_testbench.sh default - -COPY ci/scripts/install_azurite.sh 
/arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_azurite.sh - -COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin - -# Prioritize system packages and local installation. -# -# The following dependencies will be downloaded due to missing/invalid packages -# provided by the distribution: -# - opentelemetry-cpp-dev is not packaged -ENV ARROW_ACERO=ON \ - ARROW_AZURE=ON \ - ARROW_BUILD_TESTS=ON \ - ARROW_DATASET=ON \ - ARROW_DEPENDENCY_SOURCE=SYSTEM \ - ARROW_DATASET=ON \ - ARROW_FLIGHT=ON \ - ARROW_FLIGHT_SQL=ON \ - ARROW_GANDIVA=ON \ - ARROW_GCS=ON \ - ARROW_HOME=/usr/local \ - ARROW_JEMALLOC=ON \ - ARROW_ORC=ON \ - ARROW_PARQUET=ON \ - ARROW_S3=ON \ - ARROW_SUBSTRAIT=ON \ - ARROW_USE_CCACHE=ON \ - ARROW_WITH_BROTLI=ON \ - ARROW_WITH_BZ2=ON \ - ARROW_WITH_LZ4=ON \ - ARROW_WITH_OPENTELEMETRY=ON \ - ARROW_WITH_SNAPPY=ON \ - ARROW_WITH_ZLIB=ON \ - ARROW_WITH_ZSTD=ON \ - AWSSDK_SOURCE=BUNDLED \ - Azure_SOURCE=BUNDLED \ - google_cloud_cpp_storage_SOURCE=BUNDLED \ - opentelemetry_cpp_SOURCE=BUNDLED \ - ORC_SOURCE=BUNDLED \ - PATH=/usr/lib/ccache/:$PATH \ - PYTHON=python3 \ - xsimd_SOURCE=BUNDLED diff --git a/ci/docker/debian-13-cpp.dockerfile b/ci/docker/debian-13-cpp.dockerfile index ca96b4177ff0..4f0529ab50e5 100644 --- a/ci/docker/debian-13-cpp.dockerfile +++ b/ci/docker/debian-13-cpp.dockerfile @@ -42,6 +42,7 @@ RUN apt-get update -y -q && \ apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ autoconf \ + cargo \ ccache \ clang-${llvm} \ cmake \ @@ -55,31 +56,22 @@ RUN apt-get update -y -q && \ libboost-system-dev \ libbrotli-dev \ libbz2-dev \ - libc-ares-dev \ libcurl4-openssl-dev \ libgflags-dev \ libgmock-dev \ libgoogle-glog-dev \ libgrpc++-dev \ - libidn2-dev \ - libkrb5-dev \ - libldap-dev \ liblz4-dev \ - libnghttp2-dev \ + libopentelemetry-proto-dev \ libprotobuf-dev \ libprotoc-dev \ - libpsl-dev \ libre2-dev \ - librtmp-dev \ libsnappy-dev \ libsqlite3-dev \ 
- libssh-dev \ - libssh2-1-dev \ libssl-dev \ libthrift-dev \ libutf8proc-dev \ libxml2-dev \ - libxsimd-dev \ libzstd-dev \ llvm-${llvm}-dev \ make \ @@ -95,7 +87,9 @@ RUN apt-get update -y -q && \ python3-venv \ rapidjson-dev \ rsync \ + rustc \ tzdata \ + tzdata-legacy \ zlib1g-dev && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -142,4 +136,5 @@ ENV ARROW_ACERO=ON \ google_cloud_cpp_storage_SOURCE=BUNDLED \ ORC_SOURCE=BUNDLED \ PATH=/usr/lib/ccache/:$PATH \ - PYTHON=python3 + PYTHON=python3 \ + xsimd_SOURCE=BUNDLED diff --git a/ci/docker/debian-experimental-cpp.dockerfile b/ci/docker/debian-experimental-cpp.dockerfile index d37b58e23071..58b49eb70c96 100644 --- a/ci/docker/debian-experimental-cpp.dockerfile +++ b/ci/docker/debian-experimental-cpp.dockerfile @@ -73,7 +73,6 @@ RUN if [ -n "${gcc}" ]; then \ libthrift-dev \ libutf8proc-dev \ libxml2-dev \ - libxsimd-dev \ libzstd-dev \ make \ ninja-build \ @@ -143,4 +142,5 @@ ENV ARROW_ACERO=ON \ google_cloud_cpp_storage_SOURCE=BUNDLED \ ORC_SOURCE=BUNDLED \ PATH=/usr/lib/ccache/:$PATH \ - PYTHON=python3 + PYTHON=python3 \ + xsimd_SOURCE=BUNDLED diff --git a/ci/docker/fedora-42-cpp.dockerfile b/ci/docker/fedora-42-cpp.dockerfile index cabb066fec3c..b5235f2616bf 100644 --- a/ci/docker/fedora-42-cpp.dockerfile +++ b/ci/docker/fedora-42-cpp.dockerfile @@ -65,7 +65,6 @@ RUN dnf update -y && \ utf8proc-devel \ wget \ which \ - xsimd-devel \ zlib-devel COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ @@ -109,4 +108,5 @@ ENV ARROW_ACERO=ON \ PARQUET_BUILD_EXAMPLES=ON \ PARQUET_BUILD_EXECUTABLES=ON \ PATH=/usr/lib/ccache/:$PATH \ - PYARROW_TEST_GANDIVA=OFF + PYARROW_TEST_GANDIVA=OFF \ + xsimd_SOURCE=BUNDLED diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index b9f7c716e520..52090f8bb82a 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -31,11 +31,9 @@ RUN apt-get update -y && \ lsb-release && \ gpg --keyserver keyserver.ubuntu.com 
\ --recv-key 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 && \ - gpg --export 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 | \ - gpg --no-default-keyring \ - --keyring /usr/share/keyrings/cran.gpg \ - --import - && \ - echo "deb [signed-by=/usr/share/keyrings/cran.gpg] https://cloud.r-project.org/bin/linux/$(lsb_release -is | tr 'A-Z' 'a-z') $(lsb_release -cs)-cran40/" | \ + gpg --armor --export 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 | \ + tee /usr/share/keyrings/cran.asc && \ + echo "deb [signed-by=/usr/share/keyrings/cran.asc] https://cloud.r-project.org/bin/linux/$(lsb_release -is | tr 'A-Z' 'a-z') $(lsb_release -cs)-cran40/" | \ tee /etc/apt/sources.list.d/cran.list && \ if [ -f /etc/apt/sources.list.d/debian.sources ]; then \ sed -i \ diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index ffdd0d44f5f7..4ced75bce559 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -26,7 +26,7 @@ ENV LINUX_WHEEL_KIND='manylinux' ENV LINUX_WHEEL_VERSION=${manylinux} # Install basic dependencies -RUN dnf install -y git flex curl autoconf zip perl-IPC-Cmd wget +RUN dnf install -y git flex curl autoconf zip perl-IPC-Cmd perl-Time-Piece wget # A system Python is required for Ninja and vcpkg in this Dockerfile. # On manylinux_2_28 base images, no system Python is installed. 
@@ -113,10 +113,5 @@ RUN PYTHON_ROOT=$(find /opt/python -name cp${PYTHON_VERSION/./}-${PYTHON_ABI_TAG SHELL ["/bin/bash", "-i", "-c"] ENTRYPOINT ["/bin/bash", "-i", "-c"] -# Remove once there are released Cython wheels for 3.13 free-threaded available -RUN if [ "${python_abi_tag}" = "cp313t" ]; then \ - pip install cython --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" --prefer-binary ; \ - fi - COPY python/requirements-wheel-build.txt /arrow/python/ RUN pip install -r /arrow/python/requirements-wheel-build.txt diff --git a/ci/docker/python-wheel-windows-vs2022-base.dockerfile b/ci/docker/python-wheel-windows-vs2022-base.dockerfile index e63b8fc99455..e4e2eaef82f0 100644 --- a/ci/docker/python-wheel-windows-vs2022-base.dockerfile +++ b/ci/docker/python-wheel-windows-vs2022-base.dockerfile @@ -89,14 +89,15 @@ RUN ` # See https://docs.python.org/dev/using/windows.html#python-install-manager and # https://www.python.org/ftp/python/pymanager/ RUN ` - $pymanager_url = 'https://www.python.org/ftp/python/pymanager/python-manager-25.0.msix'; ` - Invoke-WebRequest -Uri $pymanager_url -OutFile 'C:\Windows\pymanager.msix'; ` - Add-AppxPackage C:\Windows\pymanager.msix + $pymanager_url = 'https://www.python.org/ftp/python/pymanager/python-manager-25.0.msi'; ` + Invoke-WebRequest -Uri $pymanager_url -OutFile 'C:\Windows\pymanager.msi'; ` + Start-Process msiexec.exe -Wait -ArgumentList '/i C:\Windows\pymanager.msi /quiet /norestart'; ` + Remove-Item C:\Windows\pymanager.msi SHELL ["cmd", "/S", "/C"] # Install CMake and other tools -ARG cmake=3.31.2 +ARG cmake=3.31.9 RUN choco install --no-progress -r -y cmake --version=%cmake% --installargs 'ADD_CMAKE_TO_PATH=System' RUN choco install --no-progress -r -y git gzip ninja wget @@ -136,9 +137,10 @@ RUN vcpkg install ` --clean-after-build ` --x-install-root=%VCPKG_ROOT%\installed ` --x-manifest-root=arrow/ci/vcpkg ` - --x-feature=flight` - --x-feature=gcs` - --x-feature=json` - 
--x-feature=orc` - --x-feature=parquet` + --x-feature=azure ` + --x-feature=flight ` + --x-feature=gcs ` + --x-feature=json ` + --x-feature=orc ` + --x-feature=parquet ` --x-feature=s3 diff --git a/ci/docker/python-wheel-windows-vs2022.dockerfile b/ci/docker/python-wheel-windows-vs2022.dockerfile index d4d5e57cd2c0..e25ebef156c6 100644 --- a/ci/docker/python-wheel-windows-vs2022.dockerfile +++ b/ci/docker/python-wheel-windows-vs2022.dockerfile @@ -24,14 +24,16 @@ FROM ${base} # Define the full version number otherwise choco falls back to patch number 0 (3.10 => 3.10.0) ARG python=3.10 -ARG python_variant=default -ENV PYTHON_VERSION=${python} -ENV PYTHON_VARIANT=${python_variant} -RUN pymanager install --version %PYTHON_VERSION% --variant %PYTHON_VARIANT% +ARG python_variant_suffix="" +ENV PYTHON_VERSION=${python}${python_variant_suffix} + +RUN pymanager install %PYTHON_VERSION% RUN py -%PYTHON_VERSION% -m pip install -U pip setuptools COPY python/requirements-wheel-build.txt C:/arrow/python/ RUN py -%PYTHON_VERSION% -m pip install -r C:/arrow/python/requirements-wheel-build.txt +ENV PYTHON_CMD="py -${python}${python_variant_suffix}" + ENV PYTHON=${python} diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index 4f3acb207bb4..4cb0ce1450cc 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,7 +18,7 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=23.0.0.9000 +pkgver=23.0.1.9000 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index 0ad59bc308f1..88239a0bd1e7 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -59,6 +59,7 @@ case "$(uname)" in ;; Darwin) n_jobs=$(sysctl -n hw.ncpu) + exclude_tests+=("arrow-flight-sql-odbc-test") # TODO: https://github.com/apache/arrow/issues/40410 exclude_tests+=("arrow-s3fs-test") ;; @@ -182,6 
+183,15 @@ if [ "${ARROW_FUZZING}" == "ON" ]; then # Some fuzz regression files may trigger huge memory allocations, # let the allocator return null instead of aborting. export ASAN_OPTIONS="$ASAN_OPTIONS allocator_may_return_null=1" + export ARROW_FUZZING_VERBOSITY=1 + # Run golden IPC integration files: these should ideally load without errors, + # though some very old ones carry invalid data (such as decimal values + # larger than their advertised precision). + # shellcheck disable=SC2046 + "${binary_output_dir}/arrow-ipc-stream-fuzz" $(find "${ARROW_TEST_DATA}"/arrow-ipc-stream/integration -name "*.stream") + # shellcheck disable=SC2046 + "${binary_output_dir}/arrow-ipc-file-fuzz" $(find "${ARROW_TEST_DATA}"/arrow-ipc-stream/integration -name "*.arrow_file") + # Run known crash files "${binary_output_dir}/arrow-ipc-stream-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-stream/crash-* "${binary_output_dir}/arrow-ipc-stream-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-stream/*-testcase-* "${binary_output_dir}/arrow-ipc-file-fuzz" "${ARROW_TEST_DATA}"/arrow-ipc-file/*-testcase-* diff --git a/ci/scripts/install_dask.sh b/ci/scripts/install_dask.sh index 8967e2681d9b..a6ccc2a2611a 100755 --- a/ci/scripts/install_dask.sh +++ b/ci/scripts/install_dask.sh @@ -28,7 +28,6 @@ dask=$1 if [ "${dask}" = "upstream_devel" ]; then pip install "dask[dataframe] @ git+https://github.com/dask/dask.git" - pip install -U git+https://github.com/dask-contrib/dask-expr.git elif [ "${dask}" = "latest" ]; then pip install "dask[dataframe]" else diff --git a/ci/scripts/python_benchmark.sh b/ci/scripts/python_benchmark.sh deleted file mode 100755 index f2f320370bc5..000000000000 --- a/ci/scripts/python_benchmark.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Check the ASV benchmarking setup. -# Unfortunately this won't ensure that all benchmarks succeed -# (see https://github.com/airspeed-velocity/asv/issues/449) -source deactivate -conda create -y -q -n pyarrow_asv python=$PYTHON_VERSION -conda activate pyarrow_asv -pip install -q git+https://github.com/pitrou/asv.git@customize_commands - -export PYARROW_WITH_PARQUET=1 -export PYARROW_WITH_ORC=0 -export PYARROW_WITH_GANDIVA=0 - -pushd $ARROW_PYTHON_DIR -# Workaround for https://github.com/airspeed-velocity/asv/issues/631 -DEFAULT_BRANCH=$(git rev-parse --abbrev-ref origin/HEAD | sed s@origin/@@) -git fetch --depth=100 origin $DEFAULT_BRANCH:$DEFAULT_BRANCH -# Generate machine information (mandatory) -asv machine --yes -# Run benchmarks on the changeset being tested -asv run --no-pull --show-stderr --quick HEAD^! 
-popd # $ARROW_PYTHON_DIR diff --git a/ci/scripts/python_build.bat b/ci/scripts/python_build.bat index 417cc0d5dd0b..06f5a637223a 100644 --- a/ci/scripts/python_build.bat +++ b/ci/scripts/python_build.bat @@ -40,6 +40,7 @@ ccache -sv echo "=== Building Arrow C++ libraries ===" set ARROW_ACERO=ON +set ARROW_AZURE=ON set ARROW_DATASET=ON set ARROW_FLIGHT=OFF set ARROW_GANDIVA=OFF @@ -67,6 +68,7 @@ pushd %CPP_BUILD_DIR% cmake ^ -DARROW_ACERO=%ARROW_ACERO% ^ + -DARROW_AZURE=%ARROW_AZURE% ^ -DARROW_BUILD_SHARED=ON ^ -DARROW_BUILD_STATIC=OFF ^ -DARROW_BUILD_TESTS=OFF ^ @@ -114,6 +116,7 @@ set PYARROW_BUILD_VERBOSE=1 set PYARROW_BUNDLE_ARROW_CPP=ON set PYARROW_CMAKE_GENERATOR=%CMAKE_GENERATOR% set PYARROW_WITH_ACERO=%ARROW_ACERO% +set PYARROW_WITH_AZURE=%ARROW_AZURE% set PYARROW_WITH_DATASET=%ARROW_DATASET% set PYARROW_WITH_FLIGHT=%ARROW_FLIGHT% set PYARROW_WITH_GANDIVA=%ARROW_GANDIVA% diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh index e0c64521cdd1..36dc35a2de8b 100755 --- a/ci/scripts/python_build.sh +++ b/ci/scripts/python_build.sh @@ -81,6 +81,7 @@ export PYARROW_PARALLEL=${n_jobs} : "${CMAKE_PREFIX_PATH:=${ARROW_HOME}}" export CMAKE_PREFIX_PATH export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} +export DYLD_LIBRARY_PATH=${ARROW_HOME}/lib${DYLD_LIBRARY_PATH:+:${DYLD_LIBRARY_PATH}} # https://github.com/apache/arrow/issues/41429 # TODO: We want to out-of-source build. This is a workaround. 
We copy @@ -115,6 +116,7 @@ if [ "${BUILD_DOCS_PYTHON}" == "ON" ]; then export ARROW_CPP_DOXYGEN_XML=${build_dir}/cpp/apidoc/xml pushd "${build_dir}" sphinx-build \ + -j auto \ -b html \ "${python_build_dir}/docs/source" \ "${build_dir}/docs" diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh index f6b9b0d7caba..962501d7b5e5 100755 --- a/ci/scripts/python_test.sh +++ b/ci/scripts/python_test.sh @@ -70,3 +70,8 @@ export PYARROW_TEST_S3 # Testing PyArrow pytest -r s ${PYTEST_ARGS} --pyargs pyarrow + +# Testing RST documentation examples (if PYTEST_RST_ARGS is set) +if [ -n "${PYTEST_RST_ARGS}" ]; then + pytest ${PYTEST_RST_ARGS} ${arrow_dir}/docs/source/python +fi diff --git a/cpp/cmake_modules/mimalloc-1138.patch b/ci/scripts/python_test_type_annotations.sh old mode 100644 new mode 100755 similarity index 56% rename from cpp/cmake_modules/mimalloc-1138.patch rename to ci/scripts/python_test_type_annotations.sh index 1ffa4bffbbaf..c1a051b1e56d --- a/cpp/cmake_modules/mimalloc-1138.patch +++ b/ci/scripts/python_test_type_annotations.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash +# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -15,19 +17,22 @@ # specific language governing permissions and limitations # under the License. -Fix for https://github.com/microsoft/mimalloc/issues/1138 +set -ex +pyarrow_dir=${1} + +if [ -n "${ARROW_PYTHON_VENV:-}" ]; then + # shellcheck source=/dev/null + . "${ARROW_PYTHON_VENV}/bin/activate" +fi + +# Install library stubs. Note some libraries contain their own type hints so they need to be installed. 
+pip install fsspec pandas-stubs scipy-stubs types-cffi types-psutil types-requests types-python-dateutil + +# Install type checkers +pip install mypy pyright ty -diff --git a/src/arena.c b/src/arena.c -index b26f4442..d7e99b55 100644 ---- a/src/arena.c -+++ b/src/arena.c -@@ -797,6 +797,9 @@ mi_page_t* _mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t bloc - else { - page = mi_arenas_page_singleton_alloc(heap, block_size, block_alignment); - } -+ if mi_unlikely(page == NULL) { -+ return NULL; -+ } - // mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); - mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); - mi_assert_internal(_mi_ptr_page(page)==page); +# Run type checkers +cd "${pyarrow_dir}" +mypy +pyright +ty check diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index bd61154430e0..0990a842e949 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -46,19 +46,9 @@ else exit 1 fi -echo "=== (${PYTHON_VERSION}) Install Python build dependencies ===" -export PIP_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])') - -# Remove once there are released Cython wheels for 3.13 free-threaded available -FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" -if [[ $FREE_THREADED_BUILD == "True" ]]; then - pip install cython --pre --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" --prefer-binary -fi - pip install \ --force-reinstall \ --only-binary=:all: \ - --target $PIP_SITE_PACKAGES \ --upgrade \ -r ${source_dir}/python/requirements-wheel-build.txt pip install "delocate>=0.10.3" @@ -177,7 +167,7 @@ export CMAKE_PREFIX_PATH=${build_dir}/install export SETUPTOOLS_SCM_PRETEND_VERSION=${PYARROW_VERSION} pushd ${source_dir}/python -python setup.py bdist_wheel +python -m build --sdist --wheel . 
--no-isolation popd echo "=== (${PYTHON_VERSION}) Show dynamic libraries the wheel depend on ===" diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py index 84fcaba42e69..153a70eb4069 100644 --- a/ci/scripts/python_wheel_validate_contents.py +++ b/ci/scripts/python_wheel_validate_contents.py @@ -33,8 +33,12 @@ def validate_wheel(path): ) ] assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}" + for filename in ('LICENSE.txt', 'NOTICE.txt'): + assert any(info.filename.split("/")[-1] == filename + for info in f.filelist), \ + f"{filename} is missing from the wheel." print(f"The wheel: {wheels[0]} seems valid.") - + # TODO(GH-32609): Validate some docstrings were generated and added. def main(): parser = argparse.ArgumentParser() diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index b4b7fed99fd4..e10766ef37e9 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -37,6 +37,7 @@ del /s /q C:\arrow\python\pyarrow\*.so.* echo "=== (%PYTHON%) Building Arrow C++ libraries ===" set ARROW_ACERO=ON +set ARROW_AZURE=ON set ARROW_DATASET=ON set ARROW_FLIGHT=ON set ARROW_GANDIVA=OFF @@ -67,6 +68,7 @@ mkdir C:\arrow-build pushd C:\arrow-build cmake ^ -DARROW_ACERO=%ARROW_ACERO% ^ + -DARROW_AZURE=%ARROW_AZURE% ^ -DARROW_BUILD_SHARED=ON ^ -DARROW_BUILD_STATIC=OFF ^ -DARROW_BUILD_TESTS=OFF ^ @@ -117,6 +119,7 @@ set PYARROW_BUNDLE_ARROW_CPP=ON set PYARROW_CMAKE_GENERATOR=%CMAKE_GENERATOR% set PYARROW_CMAKE_OPTIONS="-DCMAKE_INTERPROCEDURAL_OPTIMIZATION=%CMAKE_INTERPROCEDURAL_OPTIMIZATION%" set PYARROW_WITH_ACERO=%ARROW_ACERO% +set PYARROW_WITH_AZURE=%ARROW_AZURE% set PYARROW_WITH_DATASET=%ARROW_DATASET% set PYARROW_WITH_FLIGHT=%ARROW_FLIGHT% set PYARROW_WITH_GANDIVA=%ARROW_GANDIVA% @@ -133,7 +136,7 @@ set CMAKE_PREFIX_PATH=C:\arrow-dist pushd C:\arrow\python @REM Build wheel -%PYTHON_CMD% setup.py bdist_wheel ||
exit /B 1 +%PYTHON_CMD% -m build --sdist --wheel . --no-isolation || exit /B 1 @REM Repair the wheel with delvewheel @REM diff --git a/ci/scripts/python_wheel_windows_test.bat b/ci/scripts/python_wheel_windows_test.bat index a686215b93da..1e9cacac8bfa 100755 --- a/ci/scripts/python_wheel_windows_test.bat +++ b/ci/scripts/python_wheel_windows_test.bat @@ -18,6 +18,7 @@ @echo on set PYARROW_TEST_ACERO=ON +set PYARROW_TEST_AZURE=ON set PYARROW_TEST_CYTHON=ON set PYARROW_TEST_DATASET=ON set PYARROW_TEST_FLIGHT=ON @@ -43,6 +44,7 @@ py -0p @REM Test that the modules are importable %PYTHON_CMD% -c "import pyarrow" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow._azurefs" || exit /B 1 %PYTHON_CMD% -c "import pyarrow._gcsfs" || exit /B 1 %PYTHON_CMD% -c "import pyarrow._hdfs" || exit /B 1 %PYTHON_CMD% -c "import pyarrow._s3fs" || exit /B 1 diff --git a/ci/scripts/python_wheel_xlinux_build.sh b/ci/scripts/python_wheel_xlinux_build.sh index a3fbeb3c0b3c..ceebbc5ad019 100755 --- a/ci/scripts/python_wheel_xlinux_build.sh +++ b/ci/scripts/python_wheel_xlinux_build.sh @@ -167,7 +167,7 @@ export ARROW_HOME=/tmp/arrow-dist export CMAKE_PREFIX_PATH=/tmp/arrow-dist pushd /arrow/python -python setup.py bdist_wheel +python -m build --sdist --wheel . 
--no-isolation echo "=== Strip symbols from wheel ===" mkdir -p dist/temp-fix-wheel diff --git a/ci/vcpkg/ports.patch b/ci/vcpkg/ports.patch index 27e97a5b241f..bef472d9cba5 100644 --- a/ci/vcpkg/ports.patch +++ b/ci/vcpkg/ports.patch @@ -66,15 +66,14 @@ index 0000000000..25568e70cd + } + diff --git a/ports/orc/portfile.cmake b/ports/orc/portfile.cmake -index 77ebf41ec3..4d065594a7 100644 +index 278bc17a1c..d47d859360 100644 --- a/ports/orc/portfile.cmake +++ b/ports/orc/portfile.cmake -@@ -6,6 +6,8 @@ vcpkg_from_github( - REF "v${VERSION}" - SHA512 eabee16a6e984452a8cb715d0524041b20dd1bd88d78bb32534db93e5dbdd786aa4df8c05975406cb0728241eb3025a506c4fefb8c334ef0d8a27e6cb920d44c - HEAD_REF master -+ PATCHES -+ orc-fix-exception-propagation.diff +@@ -9,6 +9,7 @@ vcpkg_from_github( + PATCHES + external-project.diff + tools-build.diff ++ orc-fix-exception-propagation.diff ) - - file(REMOVE "${SOURCE_PATH}/cmake_modules/FindGTest.cmake") + file(GLOB modules "${SOURCE_PATH}/cmake_modules/Find*.cmake") + file(REMOVE ${modules} "${SOURCE_PATH}/c++/libs/libhdfspp/libhdfspp.tar.gz") diff --git a/claude-progress.txt b/claude-progress.txt new file mode 100644 index 000000000000..b1efcb8f289d --- /dev/null +++ b/claude-progress.txt @@ -0,0 +1,46 @@ +# Claude Progress Log + +## Session 1 - 2026-02-20 + +### Task 0: Extend ORC adapter with column statistics APIs + +**Status**: Implementation complete, awaiting verification + +**Changes made**: +1. Added `OrcColumnStatistics` struct in adapter.h + - Provides Arrow-native interface for ORC statistics + - Fields: has_null, num_values, has_minimum, has_maximum, minimum, maximum + +2. Added public methods to ORCFileReader: + - `GetColumnStatistics(int column_index)` - file-level statistics + - `GetStripeColumnStatistics(int64_t stripe_index, int column_index)` - stripe-level statistics + - `GetORCType()` - exposes ORC type tree for column ID mapping + +3. 
Implemented in ORCFileReader::Impl: + - `GetColumnStatistics()` - wraps reader_->getStatistics() + - `GetStripeColumnStatistics()` - wraps reader_->getStripeStatistics() + - `GetORCType()` - wraps reader_->getType() + - `ConvertColumnStatistics()` - converts liborc statistics to Arrow Scalars + * Supports IntegerColumnStatistics -> Int64Scalar + * Supports DoubleColumnStatistics -> DoubleScalar + * Supports StringColumnStatistics -> StringScalar + +**Verification needed**: +- Build environment has configuration issues (missing Protobuf, RapidJSON) +- Code review complete - no syntax errors found +- Compilation verification pending proper build environment + +**Files modified**: +- cpp/src/arrow/adapters/orc/adapter.h +- cpp/src/arrow/adapters/orc/adapter.cc + +**Commit status**: +- Local commit created: b36d1ed9df +- Branch: task-0-column-statistics-apis +- Push blocked: Network proxy issue (403 tunnel failed) + +**Next steps**: +- Push branch to remote when network access available +- Create PR and merge +- Verify compilation in clean build environment +- Task 0.5: Implement stripe-selective record batch generation diff --git a/compose.yaml b/compose.yaml index 31bc5c81b95c..c799059fe254 100644 --- a/compose.yaml +++ b/compose.yaml @@ -441,7 +441,9 @@ services: ARROW_HOME: /arrow ARROW_DEPENDENCY_SOURCE: BUNDLED LIBARROW_MINIMAL: "false" - ARROW_MIMALLOC: "ON" + # explicitly enable GCS when we build libarrow so that binary libarrow + # users get more fully-featured builds + ARROW_GCS: "ON" volumes: *ubuntu-volumes command: &cpp-static-command /bin/bash -c " @@ -1389,7 +1391,6 @@ services: args: base: ${REPO}:python-wheel-windows-vs2022-base-vcpkg-${VCPKG}-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} python: ${PYTHON} - python_variant: default context: . dockerfile: ci/docker/python-wheel-windows-vs2022.dockerfile # This should make the pushed images reusable, but the image gets rebuilt. 
@@ -1406,7 +1407,7 @@ services: args: base: ${REPO}:python-wheel-windows-vs2022-base-vcpkg-${VCPKG}-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} python: ${PYTHON} - python_variant: freethreaded + python_variant_suffix: t context: . dockerfile: ci/docker/python-wheel-windows-vs2022.dockerfile # This should make the pushed images reusable, but the image gets rebuilt. @@ -1531,13 +1532,15 @@ services: BUILD_DOCS_CPP: "ON" BUILD_DOCS_PYTHON: "ON" PYTEST_ARGS: "--doctest-modules --doctest-cython" + PYTEST_RST_ARGS: "--doctest-glob=*.rst" volumes: *conda-volumes command: ["/arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && pip install -e /arrow/dev/archery[numpydoc] && archery numpydoc --allow-rule GL10,PR01,PR03,PR04,PR05,PR10,RT03,YD01 && - /arrow/ci/scripts/python_test.sh /arrow"] + /arrow/ci/scripts/python_test.sh /arrow && + /arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"] conda-python-dask: # Possible $DASK parameters: @@ -1747,7 +1750,7 @@ services: cache_from: - ${REPO}:r-rstudio-r-base-4.2-focal-revdepcheck args: - base: rstudio/r-base:4.2-focal + base: posit/r-base:4.2-focal r_dev: ${ARROW_R_DEV} tz: ${TZ} shm_size: *shm-size diff --git a/cpp/Brewfile b/cpp/Brewfile index 4c42607568c4..811712516bf7 100644 --- a/cpp/Brewfile +++ b/cpp/Brewfile @@ -28,6 +28,7 @@ brew "git" brew "glog" brew "googletest" brew "grpc" +brew "libiodbc" brew "llvm" brew "lz4" brew "mimalloc" diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index e2904db0de61..c3499f6b0061 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -315,6 +315,17 @@ "displayName": "Debug build with tests and Flight SQL", "cacheVariables": {} }, + { + "name": "ninja-debug-flight-sql-odbc", + "inherits": [ + "features-flight-sql", + "base-debug" + ], + "displayName": "Debug build with tests and Flight SQL ODBC", + "cacheVariables": { + "ARROW_FLIGHT_SQL_ODBC": "ON" + } + }, { "name": "ninja-debug-gandiva", "inherits": [ @@ -511,6 
+522,17 @@ "displayName": "Release build with Flight SQL", "cacheVariables": {} }, + { + "name": "ninja-release-flight-sql-odbc", + "inherits": [ + "features-flight-sql", + "base-release" + ], + "displayName": "Release build with Flight SQL ODBC", + "cacheVariables": { + "ARROW_FLIGHT_SQL_ODBC": "ON" + } + }, { "name": "ninja-release-gandiva", "inherits": [ diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 0f6674c7143e..5d34ff50e35c 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -107,8 +107,8 @@ macro(tsort_bool_option_dependencies) endmacro() macro(resolve_option_dependencies) - # Arrow Flight SQL ODBC is available only for Windows for now. - if(NOT WIN32) + # Arrow Flight SQL ODBC is available only for Windows and macOS for now. + if(NOT WIN32 AND NOT APPLE) set(ARROW_FLIGHT_SQL_ODBC OFF) endif() if(MSVC_TOOLCHAIN) diff --git a/cpp/cmake_modules/Findutf8proc.cmake b/cpp/cmake_modules/Findutf8proc.cmake index 75d459d0ec74..75485427222b 100644 --- a/cpp/cmake_modules/Findutf8proc.cmake +++ b/cpp/cmake_modules/Findutf8proc.cmake @@ -32,7 +32,6 @@ if(ARROW_VCPKG) endif() find_package(utf8proc NAMES unofficial-utf8proc ${find_package_args}) if(utf8proc_FOUND) - add_library(utf8proc::utf8proc ALIAS utf8proc) return() endif() endif() diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index f4ff0bded3d4..bbd74284f520 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -54,26 +54,27 @@ if(ARROW_CPU_FLAG STREQUAL "x86") # sets available, but they are not set by MSVC (unlike other compilers). 
# See https://github.com/AcademySoftwareFoundation/OpenImageIO/issues/4265 add_definitions(-D__SSE2__ -D__SSE4_1__ -D__SSE4_2__) - set(ARROW_AVX2_FLAG "/arch:AVX2") + set(ARROW_AVX2_FLAGS "/arch:AVX2") # MSVC has no specific flag for BMI2, it seems to be enabled with AVX2 - set(ARROW_BMI2_FLAG "/arch:AVX2") + set(ARROW_BMI2_FLAGS "/arch:AVX2") set(ARROW_AVX512_FLAG "/arch:AVX512") set(CXX_SUPPORTS_SSE4_2 TRUE) else() set(ARROW_SSE4_2_FLAG "-msse4.2") - set(ARROW_AVX2_FLAG "-march=haswell") + set(ARROW_AVX2_FLAGS "-march=haswell") set(ARROW_BMI2_FLAG "-mbmi2") # skylake-avx512 consists of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ set(ARROW_AVX512_FLAG "-march=skylake-avx512") # Append the avx2/avx512 subset option also, fix issue ARROW-9877 for homebrew-cpp - set(ARROW_AVX2_FLAG "${ARROW_AVX2_FLAG} -mavx2") + list(APPEND ARROW_AVX2_FLAGS "-mavx2") set(ARROW_AVX512_FLAG "${ARROW_AVX512_FLAG} -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw") check_cxx_compiler_flag(${ARROW_SSE4_2_FLAG} CXX_SUPPORTS_SSE4_2) endif() if(CMAKE_SIZEOF_VOID_P EQUAL 8) # Check for AVX extensions on 64-bit systems only, as 32-bit support seems iffy - check_cxx_compiler_flag(${ARROW_AVX2_FLAG} CXX_SUPPORTS_AVX2) + list(JOIN ARROW_AVX2_FLAGS " " ARROW_AVX2_FLAGS_COMMAND_LINE) + check_cxx_compiler_flag("${ARROW_AVX2_FLAGS_COMMAND_LINE}" CXX_SUPPORTS_AVX2) if(MINGW) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782 message(STATUS "Disable AVX512 support on MINGW for now") @@ -494,7 +495,8 @@ if(ARROW_CPU_FLAG STREQUAL "x86") if(NOT CXX_SUPPORTS_AVX2) message(FATAL_ERROR "AVX2 required but compiler doesn't support it.") endif() - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_AVX2_FLAG}") + list(JOIN ARROW_AVX2_FLAGS " " ARROW_AVX2_FLAGS_COMMAND_LINE) + string(APPEND CXX_COMMON_FLAGS " ${ARROW_AVX2_FLAGS_COMMAND_LINE}") add_definitions(-DARROW_HAVE_AVX2 -DARROW_HAVE_BMI2 -DARROW_HAVE_SSE4_2) elseif(ARROW_SIMD_LEVEL STREQUAL "SSE4_2") if(NOT CXX_SUPPORTS_SSE4_2) diff 
--git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index df937cc14cb7..e84b2accb8b2 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -813,6 +813,13 @@ else() ) endif() +if(DEFINED ENV{ARROW_WIL_URL}) + set(ARROW_WIL_URL "$ENV{ARROW_WIL_URL}") +else() + set_urls(ARROW_WIL_URL + "https://github.com/microsoft/wil/archive/${ARROW_WIL_BUILD_VERSION}.tar.gz") +endif() + if(DEFINED ENV{ARROW_XSIMD_URL}) set(XSIMD_SOURCE_URL "$ENV{ARROW_XSIMD_URL}") else() @@ -2343,13 +2350,6 @@ if(ARROW_MIMALLOC) set(MIMALLOC_C_FLAGS "${MIMALLOC_C_FLAGS} -DERROR_COMMITMENT_MINIMUM=635") endif() - set(MIMALLOC_PATCH_COMMAND "") - if(${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG") - find_program(PATCH patch REQUIRED) - set(MIMALLOC_PATCH_COMMAND ${PATCH} -p1 -i - ${CMAKE_CURRENT_LIST_DIR}/mimalloc-1138.patch) - endif() - set(MIMALLOC_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_C_FLAGS=${MIMALLOC_C_FLAGS}" @@ -2367,7 +2367,6 @@ if(ARROW_MIMALLOC) ${EP_COMMON_OPTIONS} URL ${MIMALLOC_SOURCE_URL} URL_HASH "SHA256=${ARROW_MIMALLOC_BUILD_SHA256_CHECKSUM}" - PATCH_COMMAND ${MIMALLOC_PATCH_COMMAND} CMAKE_ARGS ${MIMALLOC_CMAKE_ARGS} BUILD_BYPRODUCTS "${MIMALLOC_STATIC_LIB}") @@ -2650,7 +2649,7 @@ if(ARROW_USE_XSIMD) IS_RUNTIME_DEPENDENCY FALSE REQUIRED_VERSION - "13.0.0") + "14.0.0") if(xsimd_SOURCE STREQUAL "BUNDLED") set(ARROW_XSIMD arrow::xsimd) @@ -4060,6 +4059,21 @@ endif() function(build_azure_sdk) message(STATUS "Building Azure SDK for C++ from source") + + # On Windows, Azure SDK's WinHTTP transport requires WIL (Windows Implementation Libraries). + # Fetch WIL before Azure SDK so the WIL::WIL target is available. 
+ if(WIN32) + message(STATUS "Fetching WIL (Windows Implementation Libraries) for Azure SDK") + fetchcontent_declare(wil + ${FC_DECLARE_COMMON_OPTIONS} OVERRIDE_FIND_PACKAGE + URL ${ARROW_WIL_URL} + URL_HASH "SHA256=${ARROW_WIL_BUILD_SHA256_CHECKSUM}") + prepare_fetchcontent() + set(WIL_BUILD_PACKAGING OFF) + set(WIL_BUILD_TESTS OFF) + fetchcontent_makeavailable(wil) + endif() + fetchcontent_declare(azure_sdk ${FC_DECLARE_COMMON_OPTIONS} URL ${ARROW_AZURE_SDK_URL} diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt index 7dcf6a92bdfb..b55b4066bc1c 100644 --- a/cpp/examples/parquet/CMakeLists.txt +++ b/cpp/examples/parquet/CMakeLists.txt @@ -43,8 +43,8 @@ endif() if(UNIX) foreach(FILE ${PARQUET_EXAMPLES_WARNING_SUPPRESSIONS}) set_property(SOURCE ${FILE} - APPEND_STRING - PROPERTY COMPILE_FLAGS "-Wno-unused-variable") + APPEND + PROPERTY COMPILE_OPTIONS "-Wno-unused-variable") endforeach() endif() diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index df9b783d5314..6e9d76a61e05 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -322,22 +322,24 @@ endfunction() macro(append_runtime_avx2_src SRCS SRC) if(ARROW_HAVE_RUNTIME_AVX2) list(APPEND ${SRCS} ${SRC}) - set_source_files_properties(${SRC} PROPERTIES COMPILE_FLAGS ${ARROW_AVX2_FLAG}) + set_source_files_properties(${SRC} PROPERTIES COMPILE_OPTIONS "${ARROW_AVX2_FLAGS}") endif() endmacro() macro(append_runtime_avx2_bmi2_src SRCS SRC) if(ARROW_HAVE_RUNTIME_AVX2 AND ARROW_HAVE_RUNTIME_BMI2) list(APPEND ${SRCS} ${SRC}) - set_source_files_properties(${SRC} PROPERTIES COMPILE_FLAGS - "${ARROW_AVX2_FLAG} ${ARROW_BMI2_FLAG}") + set_source_files_properties(${SRC} + PROPERTIES COMPILE_OPTIONS + "${ARROW_AVX2_FLAGS};${ARROW_BMI2_FLAG}") endif() endmacro() macro(append_runtime_avx512_src SRCS SRC) if(ARROW_HAVE_RUNTIME_AVX512) list(APPEND ${SRCS} ${SRC}) - set_source_files_properties(${SRC} PROPERTIES COMPILE_FLAGS ${ARROW_AVX512_FLAG}) + 
separate_arguments(AVX512_FLAG_LIST NATIVE_COMMAND "${ARROW_AVX512_FLAG}") + set_source_files_properties(${SRC} PROPERTIES COMPILE_OPTIONS "${AVX512_FLAG_LIST}") endif() endmacro() @@ -364,6 +366,7 @@ set(ARROW_SRCS extension_type.cc extension/bool8.cc extension/json.cc + extension/parquet_variant.cc extension/uuid.cc pretty_print.cc record_batch.cc @@ -912,8 +915,8 @@ if(ARROW_FILESYSTEM) # Suppress documentation warnings from google-cloud-cpp headers if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|AppleClang") set_source_files_properties(filesystem/gcsfs.cc filesystem/gcsfs_internal.cc - PROPERTIES COMPILE_FLAGS - "-Wno-documentation -Wno-documentation-deprecated-sync" + PROPERTIES COMPILE_OPTIONS + "-Wno-documentation;-Wno-documentation-deprecated-sync" ) endif() endif() diff --git a/cpp/src/arrow/acero/aggregate_benchmark.cc b/cpp/src/arrow/acero/aggregate_benchmark.cc index 2e9cccd80d99..171d19830ad5 100644 --- a/cpp/src/arrow/acero/aggregate_benchmark.cc +++ b/cpp/src/arrow/acero/aggregate_benchmark.cc @@ -17,6 +17,7 @@ #include "benchmark/benchmark.h" +#include #include #include #include @@ -269,7 +270,7 @@ struct SumBitmapVectorizeUnroll : public Summer { local.total += SUM_SHIFT(5); local.total += SUM_SHIFT(6); local.total += SUM_SHIFT(7); - local.valid_count += bit_util::kBytePopcount[valid_byte]; + local.valid_count += std::popcount(valid_byte); } else { // No nulls local.total += values[i + 0] + values[i + 1] + values[i + 2] + values[i + 3] + diff --git a/cpp/src/arrow/acero/bloom_filter_test.cc b/cpp/src/arrow/acero/bloom_filter_test.cc index 30cafd120cae..62071cfcf19b 100644 --- a/cpp/src/arrow/acero/bloom_filter_test.cc +++ b/cpp/src/arrow/acero/bloom_filter_test.cc @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -407,14 +408,14 @@ void TestBloomLarge(BloomFilterBuildStrategy strategy, int64_t num_build, uint64_t num_negatives = 0ULL; for (int iword = 0; iword < next_batch_size / 64; ++iword) { uint64_t word = 
reinterpret_cast(result_bit_vector.data())[iword]; - num_negatives += ARROW_POPCOUNT64(~word); + num_negatives += std::popcount(~word); } if (next_batch_size % 64 > 0) { uint64_t word = reinterpret_cast( result_bit_vector.data())[next_batch_size / 64]; uint64_t mask = (1ULL << (next_batch_size % 64)) - 1; word |= ~mask; - num_negatives += ARROW_POPCOUNT64(~word); + num_negatives += std::popcount(~word); } if (i < num_build) { num_negatives_build += num_negatives; diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 97632e0ca090..9b2ebc33e158 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -17,6 +17,7 @@ #include #include // std::upper_bound +#include #include #include #include @@ -666,7 +667,7 @@ void SwissTableMerge::MergePartition(SwissTable* target, const SwissTable* sourc // For each non-empty source slot... constexpr uint64_t kHighBitOfEachByte = 0x8080808080808080ULL; int num_full_slots = SwissTable::kSlotsPerBlock - - static_cast(ARROW_POPCOUNT64(block & kHighBitOfEachByte)); + static_cast(std::popcount(block & kHighBitOfEachByte)); for (int local_slot_id = 0; local_slot_id < num_full_slots; ++local_slot_id) { // Read group id and hash for this slot. 
// @@ -722,7 +723,7 @@ inline bool SwissTableMerge::InsertNewGroup(SwissTable* target, uint32_t group_i return false; } int local_slot_id = SwissTable::kSlotsPerBlock - - static_cast(ARROW_POPCOUNT64(block & kHighBitOfEachByte)); + static_cast(std::popcount(block & kHighBitOfEachByte)); uint32_t global_slot_id = SwissTable::global_slot_id(block_id, local_slot_id); target->insert_into_empty_slot(global_slot_id, hash, group_id); return true; diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 51cca497485c..9c376500fa60 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -33,6 +33,7 @@ #include "arrow/io/interfaces.h" #include "arrow/memory_pool.h" #include "arrow/record_batch.h" +#include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/table_builder.h" @@ -548,6 +549,92 @@ class ORCFileReader::Impl { return NextStripeReader(batch_size, empty_vec); } + Result> GetColumnStatistics(int column_index) { + ORC_BEGIN_CATCH_NOT_OK; + const liborc::Statistics* file_stats = reader_->getStatistics(); + if (!file_stats) { + return Status::IOError("No file statistics available"); + } + return ConvertColumnStatistics(file_stats->getColumnStatistics(column_index)); + ORC_END_CATCH_NOT_OK; + } + + Result> GetStripeColumnStatistics( + int64_t stripe_index, int column_index) { + ORC_BEGIN_CATCH_NOT_OK; + const liborc::Statistics* stripe_stats = + reader_->getStripeStatistics(static_cast(stripe_index)); + if (!stripe_stats) { + return Status::IOError("No stripe statistics available for stripe ", + stripe_index); + } + return ConvertColumnStatistics(stripe_stats->getColumnStatistics(column_index)); + ORC_END_CATCH_NOT_OK; + } + + const void* GetORCType() { + return static_cast(&reader_->getType()); + } + + Result> ConvertColumnStatistics( + const liborc::ColumnStatistics* orc_stats) { + if (!orc_stats) { + return Status::IOError("Column statistics not 
available"); + } + + auto stats = std::make_shared(); + stats->has_null = orc_stats->hasNull(); + stats->num_values = orc_stats->getNumberOfValues(); + + // Try to extract min/max based on the column type + const liborc::IntegerColumnStatistics* int_stats = + dynamic_cast(orc_stats); + if (int_stats) { + stats->has_minimum = int_stats->hasMinimum(); + stats->has_maximum = int_stats->hasMaximum(); + if (stats->has_minimum) { + stats->minimum = std::make_shared(int_stats->getMinimum()); + } + if (stats->has_maximum) { + stats->maximum = std::make_shared(int_stats->getMaximum()); + } + return stats; + } + + const liborc::DoubleColumnStatistics* double_stats = + dynamic_cast(orc_stats); + if (double_stats) { + stats->has_minimum = double_stats->hasMinimum(); + stats->has_maximum = double_stats->hasMaximum(); + if (stats->has_minimum) { + stats->minimum = std::make_shared(double_stats->getMinimum()); + } + if (stats->has_maximum) { + stats->maximum = std::make_shared(double_stats->getMaximum()); + } + return stats; + } + + const liborc::StringColumnStatistics* string_stats = + dynamic_cast(orc_stats); + if (string_stats) { + stats->has_minimum = string_stats->hasMinimum(); + stats->has_maximum = string_stats->hasMaximum(); + if (stats->has_minimum) { + stats->minimum = std::make_shared(string_stats->getMinimum()); + } + if (stats->has_maximum) { + stats->maximum = std::make_shared(string_stats->getMaximum()); + } + return stats; + } + + // For other types, return statistics without min/max + stats->has_minimum = false; + stats->has_maximum = false; + return stats; + } + private: MemoryPool* pool_; std::unique_ptr reader_; @@ -573,6 +660,18 @@ Result> ORCFileReader::ReadMetadata() { return impl_->ReadMetadata(); } +Result> ORCFileReader::GetColumnStatistics( + int column_index) { + return impl_->GetColumnStatistics(column_index); +} + +Result> ORCFileReader::GetStripeColumnStatistics( + int64_t stripe_index, int column_index) { + return 
impl_->GetStripeColumnStatistics(stripe_index, column_index); +} + +const void* ORCFileReader::GetORCType() { return impl_->GetORCType(); } + Result> ORCFileReader::ReadSchema() { return impl_->ReadSchema(); } diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h index 4ffff81f355f..d96f13a1a79f 100644 --- a/cpp/src/arrow/adapters/orc/adapter.h +++ b/cpp/src/arrow/adapters/orc/adapter.h @@ -47,6 +47,23 @@ struct StripeInformation { int64_t first_row_id; }; +/// \brief Column statistics from ORC file +/// \details Wraps ORC library's column statistics with Arrow-native interface +struct ARROW_EXPORT OrcColumnStatistics { + /// \brief Whether the column has null values + bool has_null; + /// \brief Number of values in the column (0 means all nulls) + uint64_t num_values; + /// \brief Whether minimum value is available + bool has_minimum; + /// \brief Whether maximum value is available + bool has_maximum; + /// \brief Minimum value as Arrow Scalar (nullptr if not available) + std::shared_ptr minimum; + /// \brief Maximum value as Arrow Scalar (nullptr if not available) + std::shared_ptr maximum; +}; + /// \class ORCFileReader /// \brief Read an Arrow Table or RecordBatch from an ORC file. 
class ARROW_EXPORT ORCFileReader { @@ -267,6 +284,28 @@ class ARROW_EXPORT ORCFileReader { /// \return A KeyValueMetadata object containing the ORC metadata Result> ReadMetadata(); + /// \brief Get file-level column statistics + /// + /// \param[in] column_index the column index to get statistics for + /// \return the column statistics + Result> GetColumnStatistics(int column_index); + + /// \brief Get stripe-level column statistics + /// + /// \param[in] stripe_index the stripe index + /// \param[in] column_index the column index to get statistics for + /// \return the column statistics + Result> GetStripeColumnStatistics( + int64_t stripe_index, int column_index); + + /// \brief Get the ORC type tree for column ID mapping + /// + /// This is needed for building schema manifests that map Arrow schema fields + /// to ORC physical column indices. + /// + /// \return pointer to the ORC Type object (owned by the ORC reader) + const void* GetORCType(); + private: class Impl; std::unique_ptr impl_; diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index dc82488f6a36..64ea3fd71a73 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -41,6 +41,7 @@ #include "arrow/array/builder_primitive.h" #include "arrow/array/builder_run_end.h" #include "arrow/array/builder_time.h" +#include "arrow/array/concatenate.h" #include "arrow/array/data.h" #include "arrow/array/util.h" #include "arrow/buffer.h" @@ -997,6 +998,123 @@ TEST_F(TestArray, TestAppendArraySlice) { } } +class TestBuilderAppendArraySlice : public TestArray { + public: + virtual void AssertResult(const Array& expected, const Array& actual) { + AssertArraysEqual(expected, actual, true); + } + + void CheckAppendArraySlice(const std::shared_ptr& type) { + auto rag = random::RandomArrayGenerator(0xdeadbeef); + const int64_t total_length = 100; + + for (auto null_probability : {0.0, 0.1, 0.5, 1.0}) { + auto array = rag.ArrayOf(type, total_length, 
null_probability); + + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool_, type, &builder)); + + // Slice the array into multiple pieces + ArrayVector slices; + std::vector offsets = {0, 10, 10, 25, 60, 100}; + for (size_t i = 0; i < offsets.size() - 1; ++i) { + int64_t start = offsets[i]; + int64_t length = offsets[i + 1] - offsets[i]; + auto slice = array->Slice(start, length); + slices.push_back(slice); + + ArraySpan span(*slice->data()); + ASSERT_OK(builder->AppendArraySlice(span, 0, slice->length())); + } + + std::shared_ptr actual; + ASSERT_OK(builder->Finish(&actual)); + ASSERT_OK(actual->ValidateFull()); + + ASSERT_OK_AND_ASSIGN(auto expected, Concatenate(slices, pool_)); + AssertResult(*expected, *actual); + } + } + + void CheckAppendArraySlice(const std::vector>& types) { + for (const auto& type : types) { + ARROW_SCOPED_TRACE("type = ", type->ToString()); + CheckAppendArraySlice(type); + } + } +}; + +TEST_F(TestBuilderAppendArraySlice, Primitives) { + CheckAppendArraySlice(PrimitiveTypes()); +} + +TEST_F(TestBuilderAppendArraySlice, Temporals) { CheckAppendArraySlice(TemporalTypes()); } + +TEST_F(TestBuilderAppendArraySlice, Intervals) { CheckAppendArraySlice(IntervalTypes()); } + +TEST_F(TestBuilderAppendArraySlice, Durations) { CheckAppendArraySlice(DurationTypes()); } + +TEST_F(TestBuilderAppendArraySlice, Decimals) { + CheckAppendArraySlice( + {decimal32(7, 2), decimal64(12, 2), decimal128(10, 2), decimal256(10, 2)}); +} + +TEST_F(TestBuilderAppendArraySlice, Nested) { + CheckAppendArraySlice({list(int32()), large_list(int32()), list_view(int32()), + large_list_view(int32()), fixed_size_list(int32(), 3), + struct_({field("a", int32()), field("b", utf8())}), + sparse_union({field("a", int32()), field("b", utf8())}), + dense_union({field("a", int32()), field("b", utf8())})}); +} + +TEST_F(TestBuilderAppendArraySlice, FixedSizeBinary) { + CheckAppendArraySlice(fixed_size_binary(10)); +} + +TEST_F(TestBuilderAppendArraySlice, Float16) { 
CheckAppendArraySlice(float16()); } + +TEST_F(TestBuilderAppendArraySlice, RunEndEncoded) { + CheckAppendArraySlice(run_end_encoded(int32(), utf8())); + CheckAppendArraySlice(run_end_encoded(int32(), int64())); +} + +// Dictionary types require a custom AssertResult because DictionaryBuilder +// re-encodes values based on discovery order. This can change both the +// dictionary and the indices, causing standard physical equality checks to fail. +// +// Example: Slicing values ["b", "a"] from an array with dictionary ["a", "b"] +// (indices [1, 0]) and appending them to a fresh builder results in a new +// dictionary ["b", "a"] (indices [0, 1]). Both represent the same logical +// data but differ physically. +class TestBuilderAppendArraySliceDictionary : public TestBuilderAppendArraySlice { + public: + void AssertResult(const Array& expected, const Array& actual) override { + const auto& expected_dict = internal::checked_cast(expected); + const auto& actual_dict = internal::checked_cast(actual); + const auto& expected_values = *expected_dict.dictionary(); + const auto& actual_values = *actual_dict.dictionary(); + + ASSERT_EQ(expected.length(), actual.length()); + for (int64_t i = 0; i < expected.length(); ++i) { + if (expected.IsNull(i)) { + ASSERT_TRUE(actual.IsNull(i)); + } else { + ASSERT_FALSE(actual.IsNull(i)); + ASSERT_OK_AND_ASSIGN(auto expected_val, + expected_values.GetScalar(expected_dict.GetValueIndex(i))); + ASSERT_OK_AND_ASSIGN(auto actual_val, + actual_values.GetScalar(actual_dict.GetValueIndex(i))); + AssertScalarsEqual(*expected_val, *actual_val); + } + } + } +}; + +TEST_F(TestBuilderAppendArraySliceDictionary, Dictionary) { + CheckAppendArraySlice(dictionary(int8(), utf8())); + CheckAppendArraySlice(dictionary(int32(), utf8())); +} + // GH-39976: Test out-of-line data size calculation in // BinaryViewBuilder::AppendArraySlice. 
TEST_F(TestArray, TestBinaryViewAppendArraySlice) { diff --git a/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h b/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h index 960ba59892ff..b4840061ae75 100644 --- a/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h +++ b/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common_internal.h" @@ -594,8 +595,7 @@ struct PowerChecked { } // left to right O(logn) power with overflow checks bool overflow = false; - uint64_t bitmask = - 1ULL << (63 - bit_util::CountLeadingZeros(static_cast(exp))); + uint64_t bitmask = 1ULL << (63 - std::countl_zero(static_cast(exp))); T pow = 1; while (bitmask) { overflow |= MultiplyWithOverflow(pow, pow, &pow); diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index ed50025ef5fd..3ab7ff065b28 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -277,8 +277,16 @@ template concept CBooleanConcept = std::same_as; // XXX: Ideally we want to have std::floating_point = true. +// Some older standard library implementations (e.g., macOS 11.x libc++) have partial +// C++20 concepts support with std::same_as but lack std::floating_point. 
+#if defined(__cpp_lib_concepts) && __cpp_lib_concepts >= 202002L template concept CFloatingPointConcept = std::floating_point || std::same_as; +#else +template +concept CFloatingPointConcept = + std::is_floating_point_v || std::same_as; +#endif template concept CDecimalConcept = std::same_as || std::same_as || diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 1fb0df56bb97..e9c65aff1ce1 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -162,7 +162,8 @@ Result ListSliceOutputType(const ListSliceOptions& opts, "`stop` being set."); } if (opts.step < 1) { - return Status::Invalid("`step` must be >= 1, got: ", opts.step); + return Status::Invalid("`step` must be greater than or equal to 1, got: ", + opts.step); } const auto length = ListSliceLength(opts.start, opts.step, *stop); return fixed_size_list(value_type, static_cast(length)); @@ -183,14 +184,15 @@ struct ListSlice { const auto* list_type = checked_cast(list_array.type); // Pre-conditions - if (opts.start < 0 || (opts.stop.has_value() && opts.start >= opts.stop.value())) { - // TODO(ARROW-18281): support start == stop which should give empty lists - return Status::Invalid("`start`(", opts.start, - ") should be greater than 0 and smaller than `stop`(", - ToString(opts.stop), ")"); + if (opts.start < 0 || (opts.stop.has_value() && opts.start > opts.stop.value())) { + return Status::Invalid( + "`start`(", opts.start, + ") should be greater than or equal to 0 and not greater than `stop`(", + ToString(opts.stop), ")"); } if (opts.step < 1) { - return Status::Invalid("`step` must be >= 1, got: ", opts.step); + return Status::Invalid("`step` must be greater than or equal to 1, got: ", + opts.step); } auto* pool = ctx->memory_pool(); diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index f199f56aa2f0..b5a68d12cb0c 100644 
--- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -176,6 +176,12 @@ TEST(TestScalarNested, ListSliceVariableOutput) { auto input = ArrayFromJSON(fixed_size_list(int32(), 1), "[[1]]"); auto expected = ArrayFromJSON(list(int32()), "[[1]]"); CheckScalarUnary("list_slice", input, expected, &args); + + args.start = 0; + args.stop = 0; + auto input_empty = ArrayFromJSON(list(int32()), "[[1, 2, 3], [4, 5], null]"); + auto expected_empty = ArrayFromJSON(list(int32()), "[[], [], null]"); + CheckScalarUnary("list_slice", input_empty, expected_empty, &args); } TEST(TestScalarNested, ListSliceFixedOutput) { @@ -315,7 +321,8 @@ TEST(TestScalarNested, ListSliceBadParameters) { EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr( - "`start`(-1) should be greater than 0 and smaller than `stop`(1)"), + "`start`(-1) should be greater than or equal to 0 and not greater than " + "`stop`(1)"), CallFunction("list_slice", {input}, &args)); // start greater than stop args.start = 1; @@ -323,14 +330,8 @@ TEST(TestScalarNested, ListSliceBadParameters) { EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr( - "`start`(1) should be greater than 0 and smaller than `stop`(0)"), - CallFunction("list_slice", {input}, &args)); - // start same as stop - args.stop = args.start; - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, - ::testing::HasSubstr( - "`start`(1) should be greater than 0 and smaller than `stop`(1)"), + "`start`(1) should be greater than or equal to 0 and not greater than " + "`stop`(0)"), CallFunction("list_slice", {input}, &args)); // stop not set and FixedSizeList requested with variable sized input args.stop = std::nullopt; @@ -343,9 +344,9 @@ TEST(TestScalarNested, ListSliceBadParameters) { args.start = 0; args.stop = 2; args.step = 0; - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, - ::testing::HasSubstr("`step` must be >= 1, got: 0"), - CallFunction("list_slice", {input}, &args)); + 
EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("`step` must be greater than or equal to 1, got: 0"), + CallFunction("list_slice", {input}, &args)); } TEST(TestScalarNested, StructField) { diff --git a/cpp/src/arrow/compute/kernels/vector_array_sort.cc b/cpp/src/arrow/compute/kernels/vector_array_sort.cc index 950de47733b5..6e7068f6ecf6 100644 --- a/cpp/src/arrow/compute/kernels/vector_array_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_array_sort.cc @@ -183,10 +183,8 @@ class ArrayCompareSorter { const ArraySortOptions& options, ExecContext* ctx) { const auto& dict_array = checked_cast(array); - // TODO: These methods should probably return a const&? They seem capable. - // https://github.com/apache/arrow/issues/35437 - auto dict_values = dict_array.dictionary(); - auto dict_indices = dict_array.indices(); + const auto& dict_values = dict_array.dictionary(); + const auto& dict_indices = dict_array.indices(); // Algorithm: // 1) Use the Rank function to get an exactly-equivalent-order array @@ -237,7 +235,6 @@ class ArrayCompareSorter { RankOptions rank_options(SortOrder::Ascending, NullPlacement::AtEnd, RankOptions::Dense); - // XXX Should this support Type::NA? auto data = array->data(); std::shared_ptr null_bitmap; if (array->null_count() > 0) { diff --git a/cpp/src/arrow/compute/kernels/vector_sort_test.cc b/cpp/src/arrow/compute/kernels/vector_sort_test.cc index 90f8eb7a56b9..e18fcf37716e 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort_test.cc @@ -437,6 +437,27 @@ TEST(ArraySortIndicesFunction, AllNullDictionaryArray) { } } +TEST(ArraySortIndicesFunction, NullTypeDictionaryArray) { + // Test that dictionaries with Type::NA (null type) values can be sorted. + // All values in a null-type dictionary are logically null, so sorting + // should just arrange indices based on null placement, preserving order. 
+ for (const auto& index_type : all_dictionary_index_types()) { + ARROW_SCOPED_TRACE("index_type = ", index_type->ToString()); + auto dict_type = dictionary(index_type, null()); + auto dict_arr = DictArrayFromJSON(dict_type, "[null, 0, 0, null]", "[null]"); + + for (auto null_placement : AllNullPlacements()) { + ArraySortOptions options{SortOrder::Ascending, null_placement}; + // All nulls, so output should be identity permutation + auto expected = ArrayFromJSON(uint64(), "[0, 1, 2, 3]"); + ASSERT_OK_AND_ASSIGN(auto actual, + CallFunction("array_sort_indices", {dict_arr}, &options)); + ValidateOutput(actual); + AssertDatumsEqual(expected, actual, /*verbose=*/true); + } + } +} + Result> DecodeDictionary(const Array& array) { const auto& dict_array = checked_cast(array); ARROW_ASSIGN_OR_RAISE(auto decoded_datum, diff --git a/cpp/src/arrow/compute/key_hash_internal.cc b/cpp/src/arrow/compute/key_hash_internal.cc index a0002efb3faf..4608a742e15d 100644 --- a/cpp/src/arrow/compute/key_hash_internal.cc +++ b/cpp/src/arrow/compute/key_hash_internal.cc @@ -20,6 +20,7 @@ #include #include +#include #include #include "arrow/compute/light_array_internal.h" @@ -357,7 +358,7 @@ void Hashing32::HashInt(bool combine_hashes, uint32_t num_keys, uint64_t key_len void Hashing32::HashFixed(int64_t hardware_flags, bool combine_hashes, uint32_t num_keys, uint64_t key_length, const uint8_t* keys, uint32_t* hashes, uint32_t* temp_hashes_for_combine) { - if (ARROW_POPCOUNT64(key_length) == 1 && key_length <= sizeof(uint64_t)) { + if (std::popcount(key_length) == 1 && key_length <= sizeof(uint64_t)) { HashInt(combine_hashes, num_keys, key_length, keys, hashes); return; } @@ -809,7 +810,7 @@ void Hashing64::HashInt(bool combine_hashes, uint32_t num_keys, uint64_t key_len void Hashing64::HashFixed(bool combine_hashes, uint32_t num_keys, uint64_t key_length, const uint8_t* keys, uint64_t* hashes) { - if (ARROW_POPCOUNT64(key_length) == 1 && key_length <= sizeof(uint64_t)) { + if 
(std::popcount(key_length) == 1 && key_length <= sizeof(uint64_t)) { HashInt(combine_hashes, num_keys, key_length, keys, hashes); return; } diff --git a/cpp/src/arrow/compute/key_map_internal.cc b/cpp/src/arrow/compute/key_map_internal.cc index 4a2405e754e6..353449cf1646 100644 --- a/cpp/src/arrow/compute/key_map_internal.cc +++ b/cpp/src/arrow/compute/key_map_internal.cc @@ -18,6 +18,7 @@ #include "arrow/compute/key_map_internal.h" #include +#include #include #include "arrow/util/bit_util.h" @@ -27,7 +28,6 @@ namespace arrow { -using bit_util::CountLeadingZeros; using internal::CpuInfo; namespace compute { @@ -91,7 +91,7 @@ inline void SwissTable::search_block(uint64_t block, int stamp, int start_slot, // Now if we or with the highest bits of the block and scan zero bits in reverse, we get // 8x slot index that we were looking for. This formula works in all three cases a), b) // and c). - *out_slot = static_cast(CountLeadingZeros(matches | block_high_bits) >> 3); + *out_slot = static_cast(std::countl_zero(matches | block_high_bits) >> 3); } template @@ -204,8 +204,8 @@ void SwissTable::init_slot_ids_for_new_keys(uint32_t num_ids, const uint16_t* id int num_block_bytes = num_block_bytes_from_num_groupid_bits(num_groupid_bits); if (log_blocks_ == 0) { uint64_t block = *reinterpret_cast(blocks_->mutable_data()); - uint32_t empty_slot = static_cast( - kSlotsPerBlock - ARROW_POPCOUNT64(block & kHighBitOfEachByte)); + uint32_t empty_slot = + static_cast(kSlotsPerBlock - std::popcount(block & kHighBitOfEachByte)); for (uint32_t i = 0; i < num_ids; ++i) { int id = ids[i]; slot_ids[id] = empty_slot; @@ -224,7 +224,7 @@ void SwissTable::init_slot_ids_for_new_keys(uint32_t num_ids, const uint16_t* id } iblock = (iblock + 1) & ((1 << log_blocks_) - 1); } - uint32_t empty_slot = static_cast(kSlotsPerBlock - ARROW_POPCOUNT64(block)); + uint32_t empty_slot = static_cast(kSlotsPerBlock - std::popcount(block)); slot_ids[id] = global_slot_id(iblock, empty_slot); } } @@ -684,7 
+684,7 @@ Status SwissTable::grow_double() { mutable_block_data(blocks_new->mutable_data(), 2 * i, block_size_after); uint64_t block = *reinterpret_cast(block_base); - uint32_t full_slots = CountLeadingZeros(block & kHighBitOfEachByte) >> 3; + uint32_t full_slots = std::countl_zero(block & kHighBitOfEachByte) >> 3; uint32_t full_slots_new[2]; full_slots_new[0] = full_slots_new[1] = 0; util::SafeStore(double_block_base_new, kHighBitOfEachByte); @@ -722,7 +722,7 @@ Status SwissTable::grow_double() { // How many full slots in this block const uint8_t* block_base = block_data(i, block_size_before); uint64_t block = util::SafeLoadAs(block_base); - uint32_t full_slots = CountLeadingZeros(block & kHighBitOfEachByte) >> 3; + uint32_t full_slots = std::countl_zero(block & kHighBitOfEachByte) >> 3; for (uint32_t j = 0; j < full_slots; ++j) { uint32_t slot_id = global_slot_id(i, j); @@ -741,13 +741,13 @@ Status SwissTable::grow_double() { mutable_block_data(blocks_new->mutable_data(), block_id_new, block_size_after); uint64_t block_new = util::SafeLoadAs(block_base_new); int full_slots_new = - static_cast(CountLeadingZeros(block_new & kHighBitOfEachByte) >> 3); + static_cast(std::countl_zero(block_new & kHighBitOfEachByte) >> 3); while (full_slots_new == kSlotsPerBlock) { block_id_new = (block_id_new + 1) & ((1 << log_blocks_after) - 1); block_base_new = blocks_new->mutable_data() + block_id_new * block_size_after; block_new = util::SafeLoadAs(block_base_new); full_slots_new = - static_cast(CountLeadingZeros(block_new & kHighBitOfEachByte) >> 3); + static_cast(std::countl_zero(block_new & kHighBitOfEachByte) >> 3); } hashes_new[block_id_new * kSlotsPerBlock + full_slots_new] = hash; diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc index d62333af3700..a342e5a6b1bf 100644 --- a/cpp/src/arrow/compute/row/grouper.cc +++ b/cpp/src/arrow/compute/row/grouper.cc @@ -341,43 +341,44 @@ struct GrouperImpl : public Grouper { impl->ctx_ = ctx; for 
(size_t i = 0; i < key_types.size(); ++i) { - // TODO(wesm): eliminate this probably unneeded shared_ptr copy - std::shared_ptr key = key_types[i].GetSharedPtr(); + const auto& key_type = key_types[i]; - if (key->id() == Type::BOOL) { + if (key_type.id() == Type::BOOL) { impl->encoders_[i] = std::make_unique(); continue; } - if (key->id() == Type::DICTIONARY) { - impl->encoders_[i] = - std::make_unique(key, ctx->memory_pool()); + if (key_type.id() == Type::DICTIONARY) { + impl->encoders_[i] = std::make_unique( + key_type.GetSharedPtr(), ctx->memory_pool()); continue; } - if (is_fixed_width(key->id())) { - impl->encoders_[i] = std::make_unique(key); + if (is_fixed_width(key_type.id())) { + impl->encoders_[i] = + std::make_unique(key_type.GetSharedPtr()); continue; } - if (is_binary_like(key->id())) { - impl->encoders_[i] = - std::make_unique>(key); + if (is_binary_like(key_type.id())) { + impl->encoders_[i] = std::make_unique>( + key_type.GetSharedPtr()); continue; } - if (is_large_binary_like(key->id())) { + if (is_large_binary_like(key_type.id())) { impl->encoders_[i] = - std::make_unique>(key); + std::make_unique>( + key_type.GetSharedPtr()); continue; } - if (key->id() == Type::NA) { + if (key_type.id() == Type::NA) { impl->encoders_[i] = std::make_unique(); continue; } - return Status::NotImplemented("Keys of type ", *key); + return Status::NotImplemented("Keys of type ", *key_type); } return impl; diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc index 6af5458ea9e0..39d0bb0c631c 100644 --- a/cpp/src/arrow/compute/row/row_internal.cc +++ b/cpp/src/arrow/compute/row/row_internal.cc @@ -17,6 +17,8 @@ #include "arrow/compute/row/row_internal.h" +#include + #include "arrow/compute/util.h" #include "arrow/util/logging_internal.h" @@ -89,9 +91,9 @@ void RowTableMetadata::FromColumnMetadataVector( std::sort( column_order.begin(), column_order.end(), [&cols](uint32_t left, uint32_t right) { bool is_left_pow2 = - 
!cols[left].is_fixed_length || ARROW_POPCOUNT64(cols[left].fixed_length) <= 1; - bool is_right_pow2 = !cols[right].is_fixed_length || - ARROW_POPCOUNT64(cols[right].fixed_length) <= 1; + !cols[left].is_fixed_length || std::popcount(cols[left].fixed_length) <= 1; + bool is_right_pow2 = + !cols[right].is_fixed_length || std::popcount(cols[right].fixed_length) <= 1; bool is_left_fixedlen = cols[left].is_fixed_length; bool is_right_fixedlen = cols[right].is_fixed_length; uint32_t width_left = @@ -127,7 +129,7 @@ void RowTableMetadata::FromColumnMetadataVector( for (uint32_t i = 0; i < num_cols; ++i) { const KeyColumnMetadata& col = cols[column_order[i]]; if (col.is_fixed_length && col.fixed_length != 0 && - ARROW_POPCOUNT64(col.fixed_length) != 1) { + std::popcount(col.fixed_length) != 1) { offset_within_row += RowTableMetadata::padding_for_alignment_within_row( offset_within_row, string_alignment, col); } diff --git a/cpp/src/arrow/compute/row/row_internal.h b/cpp/src/arrow/compute/row/row_internal.h index 219fcbc51f4d..1c1ed5ca7cdf 100644 --- a/cpp/src/arrow/compute/row/row_internal.h +++ b/cpp/src/arrow/compute/row/row_internal.h @@ -16,6 +16,7 @@ // under the License. #pragma once +#include #include #include @@ -85,7 +86,7 @@ struct ARROW_COMPUTE_EXPORT RowTableMetadata { /// Alignment must be a power of 2. static inline uint32_t padding_for_alignment_within_row(uint32_t offset, int required_alignment) { - ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1); + ARROW_DCHECK(std::popcount(static_cast(required_alignment)) == 1); return static_cast((-static_cast(offset)) & (required_alignment - 1)); } @@ -94,8 +95,7 @@ struct ARROW_COMPUTE_EXPORT RowTableMetadata { /// choosing required alignment based on the data type of that column. 
static inline uint32_t padding_for_alignment_within_row( uint32_t offset, int string_alignment, const KeyColumnMetadata& col_metadata) { - if (!col_metadata.is_fixed_length || - ARROW_POPCOUNT64(col_metadata.fixed_length) <= 1) { + if (!col_metadata.is_fixed_length || std::popcount(col_metadata.fixed_length) <= 1) { return 0; } else { return padding_for_alignment_within_row(offset, string_alignment); @@ -106,7 +106,7 @@ struct ARROW_COMPUTE_EXPORT RowTableMetadata { /// Alignment must be a power of 2. static inline offset_type padding_for_alignment_row(offset_type row_offset, int required_alignment) { - ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1); + ARROW_DCHECK(std::popcount(static_cast(required_alignment)) == 1); return (-row_offset) & (required_alignment - 1); } diff --git a/cpp/src/arrow/compute/util.cc b/cpp/src/arrow/compute/util.cc index b90b3a64056b..28bbfb7072bc 100644 --- a/cpp/src/arrow/compute/util.cc +++ b/cpp/src/arrow/compute/util.cc @@ -17,12 +17,13 @@ #include "arrow/compute/util.h" +#include + #include "arrow/util/logging.h" #include "arrow/util/ubsan.h" namespace arrow { -using bit_util::CountTrailingZeros; using internal::CpuInfo; namespace util { @@ -65,7 +66,7 @@ inline void bits_to_indexes_helper(uint64_t word, uint16_t base_index, int* num_ uint16_t* indexes) { int n = *num_indexes; while (word) { - indexes[n++] = base_index + static_cast(CountTrailingZeros(word)); + indexes[n++] = base_index + static_cast(std::countr_zero(word)); word &= word - 1; } *num_indexes = n; @@ -75,7 +76,7 @@ inline void bits_filter_indexes_helper(uint64_t word, const uint16_t* input_inde int* num_indexes, uint16_t* indexes) { int n = *num_indexes; while (word) { - indexes[n++] = input_indexes[CountTrailingZeros(word)]; + indexes[n++] = input_indexes[std::countr_zero(word)]; word &= word - 1; } *num_indexes = n; diff --git a/cpp/src/arrow/compute/util_avx2.cc b/cpp/src/arrow/compute/util_avx2.cc index a554e0463f06..9e1b7e4c0f08 100644 --- 
a/cpp/src/arrow/compute/util_avx2.cc +++ b/cpp/src/arrow/compute/util_avx2.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +#include #include #include "arrow/compute/util.h" @@ -54,7 +55,7 @@ void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, int* num_ _pext_u64(mask, _pdep_u64(word, kEachByteIs1) * 0xff) + base; *reinterpret_cast(byte_indexes + num_indexes_loop) = byte_indexes_next; base += incr; - num_indexes_loop += static_cast(arrow::bit_util::PopCount(word & 0xff)); + num_indexes_loop += static_cast(std::popcount(word & 0xff)); word >>= 8; } // Unpack indexes to 16-bits and either add the base of i * 64 or shuffle input @@ -144,7 +145,7 @@ void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits, kByteSequence_0_8_1_9_2_10_3_11, kByteSequence_4_12_5_13_6_14_7_15)); _mm256_storeu_si256((__m256i*)(indexes + num_indexes), output); - num_indexes += static_cast(arrow::bit_util::PopCount(word & 0xffff)); + num_indexes += static_cast(std::popcount(word & 0xffff)); word >>= 16; ++loop_id; } diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index ec31d4b1ceb4..bb59d02cd206 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -105,31 +105,45 @@ Status PresizeBuilder(const BlockParser& parser, BuilderType* builder) { } } +///////////////////////////////////////////////////////////////////////// +// Shared Tries cache to avoid rebuilding them for each decoder instance + +struct TrieCache { + Trie null_trie; + Trie true_trie; + Trie false_trie; + + static Result> Make(const ConvertOptions& options) { + auto cache = std::make_shared(); + RETURN_NOT_OK(InitializeTrie(options.null_values, &cache->null_trie)); + RETURN_NOT_OK(InitializeTrie(options.true_values, &cache->true_trie)); + RETURN_NOT_OK(InitializeTrie(options.false_values, &cache->false_trie)); + return cache; + } +}; + 
///////////////////////////////////////////////////////////////////////// // Per-type value decoders struct ValueDecoder { explicit ValueDecoder(const std::shared_ptr& type, - const ConvertOptions& options) - : type_(type), options_(options) {} + const ConvertOptions& options, const TrieCache* trie_cache) + : type_(type), options_(options), trie_cache_(trie_cache) {} - Status Initialize() { - // TODO no need to build a separate Trie for each instance - return InitializeTrie(options_.null_values, &null_trie_); - } + Status Initialize() { return Status::OK(); } bool IsNull(const uint8_t* data, uint32_t size, bool quoted) { if (quoted && !options_.quoted_strings_can_be_null) { return false; } - return null_trie_.Find(std::string_view(reinterpret_cast(data), size)) >= - 0; + return trie_cache_->null_trie.Find( + std::string_view(reinterpret_cast(data), size)) >= 0; } protected: - Trie null_trie_; const std::shared_ptr type_; const ConvertOptions& options_; + const TrieCache* trie_cache_; }; // @@ -140,8 +154,9 @@ struct FixedSizeBinaryValueDecoder : public ValueDecoder { using value_type = const uint8_t*; explicit FixedSizeBinaryValueDecoder(const std::shared_ptr& type, - const ConvertOptions& options) - : ValueDecoder(type, options), + const ConvertOptions& options, + const TrieCache* trie_cache) + : ValueDecoder(type, options, trie_cache), byte_width_(checked_cast(*type).byte_width()) {} Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) { @@ -207,8 +222,8 @@ struct NumericValueDecoder : public ValueDecoder { using value_type = typename T::c_type; NumericValueDecoder(const std::shared_ptr& type, - const ConvertOptions& options) - : ValueDecoder(type, options), + const ConvertOptions& options, const TrieCache* trie_cache) + : ValueDecoder(type, options, trie_cache), concrete_type_(checked_cast(*type)), string_converter_(MakeStringConverter(options)) {} @@ -236,31 +251,20 @@ struct BooleanValueDecoder : public ValueDecoder { using 
ValueDecoder::ValueDecoder; - Status Initialize() { - // TODO no need to build separate Tries for each instance - RETURN_NOT_OK(InitializeTrie(options_.true_values, &true_trie_)); - RETURN_NOT_OK(InitializeTrie(options_.false_values, &false_trie_)); - return ValueDecoder::Initialize(); - } - Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) { // XXX should quoted values be allowed at all? - if (false_trie_.Find(std::string_view(reinterpret_cast(data), size)) >= - 0) { + if (trie_cache_->false_trie.Find( + std::string_view(reinterpret_cast(data), size)) >= 0) { *out = false; return Status::OK(); } - if (ARROW_PREDICT_TRUE(true_trie_.Find(std::string_view( + if (ARROW_PREDICT_TRUE(trie_cache_->true_trie.Find(std::string_view( reinterpret_cast(data), size)) >= 0)) { *out = true; return Status::OK(); } return GenericConversionError(type_, data, size); } - - protected: - Trie true_trie_; - Trie false_trie_; }; // @@ -271,8 +275,8 @@ struct DecimalValueDecoder : public ValueDecoder { using value_type = Decimal128; explicit DecimalValueDecoder(const std::shared_ptr& type, - const ConvertOptions& options) - : ValueDecoder(type, options), + const ConvertOptions& options, const TrieCache* trie_cache) + : ValueDecoder(type, options, trie_cache), decimal_type_(internal::checked_cast(*type_)), type_precision_(decimal_type_.precision()), type_scale_(decimal_type_.scale()) {} @@ -310,8 +314,10 @@ struct CustomDecimalPointValueDecoder : public ValueDecoder { using value_type = typename WrappedDecoder::value_type; explicit CustomDecimalPointValueDecoder(const std::shared_ptr& type, - const ConvertOptions& options) - : ValueDecoder(type, options), wrapped_decoder_(type, options) {} + const ConvertOptions& options, + const TrieCache* trie_cache) + : ValueDecoder(type, options, trie_cache), + wrapped_decoder_(type, options, trie_cache) {} Status Initialize() { RETURN_NOT_OK(wrapped_decoder_.Initialize()); @@ -321,7 +327,7 @@ struct 
CustomDecimalPointValueDecoder : public ValueDecoder { mapping_[options_.decimal_point] = '.'; mapping_['.'] = options_.decimal_point; // error out on standard decimal point temp_.resize(30); - return Status::OK(); + return ValueDecoder::Initialize(); } Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) { @@ -357,8 +363,9 @@ struct InlineISO8601ValueDecoder : public ValueDecoder { using value_type = int64_t; explicit InlineISO8601ValueDecoder(const std::shared_ptr& type, - const ConvertOptions& options) - : ValueDecoder(type, options), + const ConvertOptions& options, + const TrieCache* trie_cache) + : ValueDecoder(type, options, trie_cache), unit_(checked_cast(*type_).unit()), expect_timezone_(!checked_cast(*type_).timezone().empty()) { } @@ -396,8 +403,9 @@ struct SingleParserTimestampValueDecoder : public ValueDecoder { using value_type = int64_t; explicit SingleParserTimestampValueDecoder(const std::shared_ptr& type, - const ConvertOptions& options) - : ValueDecoder(type, options), + const ConvertOptions& options, + const TrieCache* trie_cache) + : ValueDecoder(type, options, trie_cache), unit_(checked_cast(*type_).unit()), expect_timezone_(!checked_cast(*type_).timezone().empty()), parser_(*options_.timestamp_parsers[0]) {} @@ -436,8 +444,9 @@ struct MultipleParsersTimestampValueDecoder : public ValueDecoder { using value_type = int64_t; explicit MultipleParsersTimestampValueDecoder(const std::shared_ptr& type, - const ConvertOptions& options) - : ValueDecoder(type, options), + const ConvertOptions& options, + const TrieCache* trie_cache) + : ValueDecoder(type, options, trie_cache), unit_(checked_cast(*type_).unit()), expect_timezone_(!checked_cast(*type_).timezone().empty()), parsers_(GetParsers(options_)) {} @@ -477,8 +486,9 @@ struct DurationValueDecoder : public ValueDecoder { using value_type = int64_t; explicit DurationValueDecoder(const std::shared_ptr& type, - const ConvertOptions& options) - : ValueDecoder(type, options), 
+ const ConvertOptions& options, + const TrieCache* trie_cache) + : ValueDecoder(type, options, trie_cache), concrete_type_(checked_cast(*type)), string_converter_() {} @@ -517,7 +527,8 @@ class NullConverter : public ConcreteConverter { public: NullConverter(const std::shared_ptr& type, const ConvertOptions& options, MemoryPool* pool) - : ConcreteConverter(type, options, pool), decoder_(type_, options_) {} + : ConcreteConverter(type, options, pool), + decoder_(type_, options_, static_cast(trie_cache_.get())) {} Result> Convert(const BlockParser& parser, int32_t col_index) override { @@ -551,7 +562,8 @@ class PrimitiveConverter : public ConcreteConverter { public: PrimitiveConverter(const std::shared_ptr& type, const ConvertOptions& options, MemoryPool* pool) - : ConcreteConverter(type, options, pool), decoder_(type_, options_) {} + : ConcreteConverter(type, options, pool), + decoder_(type_, options_, static_cast(trie_cache_.get())) {} Result> Convert(const BlockParser& parser, int32_t col_index) override { @@ -593,7 +605,8 @@ class TypedDictionaryConverter : public ConcreteDictionaryConverter { TypedDictionaryConverter(const std::shared_ptr& value_type, const ConvertOptions& options, MemoryPool* pool) : ConcreteDictionaryConverter(value_type, options, pool), - decoder_(value_type, options_) {} + decoder_(value_type, options_, static_cast(trie_cache_.get())) { + } Result> Convert(const BlockParser& parser, int32_t col_index) override { @@ -684,7 +697,13 @@ std::shared_ptr MakeRealConverter(const std::shared_ptr Converter::Converter(const std::shared_ptr& type, const ConvertOptions& options, MemoryPool* pool) - : options_(options), pool_(pool), type_(type) {} + : options_(options), pool_(pool), type_(type) { + // Build shared Trie cache (errors handled in Initialize()) + auto maybe_cache = TrieCache::Make(options); + if (maybe_cache.ok()) { + trie_cache_ = std::static_pointer_cast(*std::move(maybe_cache)); + } +} DictionaryConverter::DictionaryConverter(const 
std::shared_ptr& value_type, const ConvertOptions& options, MemoryPool* pool) diff --git a/cpp/src/arrow/csv/converter.h b/cpp/src/arrow/csv/converter.h index 639f692f26a1..c6254bd7ca1f 100644 --- a/cpp/src/arrow/csv/converter.h +++ b/cpp/src/arrow/csv/converter.h @@ -57,6 +57,9 @@ class ARROW_EXPORT Converter { const ConvertOptions& options_; MemoryPool* pool_; std::shared_ptr type_; + // Opaque TrieCache pointer. TrieCache destructor is called via control block. + // https://en.cppreference.com/w/cpp/memory/shared_ptr + std::shared_ptr trie_cache_; }; class ARROW_EXPORT DictionaryConverter : public Converter { diff --git a/cpp/src/arrow/csv/writer.cc b/cpp/src/arrow/csv/writer.cc index 5d14fe4b9b10..2db0dba2de71 100644 --- a/cpp/src/arrow/csv/writer.cc +++ b/cpp/src/arrow/csv/writer.cc @@ -541,7 +541,7 @@ class CSVWriterImpl : public ipc::RecordBatchWriter { for (auto maybe_slice : iterator) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr slice, maybe_slice); RETURN_NOT_OK(TranslateMinimalBatch(*slice)); - RETURN_NOT_OK(sink_->Write(data_buffer_)); + RETURN_NOT_OK(FlushToSink()); stats_.num_record_batches++; } return Status::OK(); @@ -554,7 +554,7 @@ class CSVWriterImpl : public ipc::RecordBatchWriter { RETURN_NOT_OK(reader.ReadNext(&batch)); while (batch != nullptr) { RETURN_NOT_OK(TranslateMinimalBatch(*batch)); - RETURN_NOT_OK(sink_->Write(data_buffer_)); + RETURN_NOT_OK(FlushToSink()); RETURN_NOT_OK(reader.ReadNext(&batch)); stats_.num_record_batches++; } @@ -590,6 +590,13 @@ class CSVWriterImpl : public ipc::RecordBatchWriter { return Status::OK(); } + // GH-36889: Flush buffer to sink and clear it to avoid stale content + // being written again if the next batch is empty. 
+ Status FlushToSink() { + RETURN_NOT_OK(sink_->Write(data_buffer_)); + return data_buffer_->Resize(0, /*shrink_to_fit=*/false); + } + int64_t CalculateHeaderSize(QuotingStyle quoting_style) const { int64_t header_length = 0; for (int col = 0; col < schema_->num_fields(); col++) { @@ -654,7 +661,7 @@ class CSVWriterImpl : public ipc::RecordBatchWriter { next += options_.eol.size(); DCHECK_EQ(reinterpret_cast(next), data_buffer_->data() + data_buffer_->size()); - return sink_->Write(data_buffer_); + return FlushToSink(); } Status TranslateMinimalBatch(const RecordBatch& batch) { diff --git a/cpp/src/arrow/csv/writer_test.cc b/cpp/src/arrow/csv/writer_test.cc index 783d7631ab36..ce4d8ab16d01 100644 --- a/cpp/src/arrow/csv/writer_test.cc +++ b/cpp/src/arrow/csv/writer_test.cc @@ -28,6 +28,7 @@ #include "arrow/ipc/writer.h" #include "arrow/record_batch.h" #include "arrow/result.h" +#include "arrow/table.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" #include "arrow/type.h" @@ -405,5 +406,36 @@ INSTANTIATE_TEST_SUITE_P( "\n2016-02-29 10:42:23-0700,2016-02-29 17:42:23Z\n"))); #endif +TEST(TestWriteCSV, EmptyBatchShouldNotPolluteOutput) { + auto schema = arrow::schema({field("col1", utf8())}); + auto empty_batch = RecordBatchFromJSON(schema, "[]"); + auto batch_a = RecordBatchFromJSON(schema, R"([{"col1": "a"}])"); + auto batch_b = RecordBatchFromJSON(schema, R"([{"col1": "b"}])"); + + struct TestParam { + std::shared_ptr table; + std::string expected_output; + }; + + std::vector test_params = { + // Empty batch in the beginning + {Table::FromRecordBatches(schema, {empty_batch, batch_a, batch_b}).ValueOrDie(), + "\"col1\"\n\"a\"\n\"b\"\n"}, + // Empty batch in the middle + {Table::FromRecordBatches(schema, {batch_a, empty_batch, batch_b}).ValueOrDie(), + "\"col1\"\n\"a\"\n\"b\"\n"}, + // Empty batch in the end + {Table::FromRecordBatches(schema, {batch_a, batch_b, empty_batch}).ValueOrDie(), + "\"col1\"\n\"a\"\n\"b\"\n"}, + }; + + for (const 
auto& param : test_params) { + ASSERT_OK_AND_ASSIGN(auto out, io::BufferOutputStream::Create()); + ASSERT_OK(WriteCSV(*param.table, WriteOptions::Defaults(), out.get())); + ASSERT_OK_AND_ASSIGN(auto buffer, out->Finish()); + EXPECT_EQ(buffer->ToString(), param.expected_output); + } +} + } // namespace csv } // namespace arrow diff --git a/cpp/src/arrow/extension/meson.build b/cpp/src/arrow/extension/meson.build index 663ebba4d4a3..480c457fadc6 100644 --- a/cpp/src/arrow/extension/meson.build +++ b/cpp/src/arrow/extension/meson.build @@ -32,5 +32,12 @@ exc = executable( test('arrow-canonical-extensions-test', exc) install_headers( - ['bool8.h', 'fixed_shape_tensor.h', 'json.h', 'opaque.h', 'uuid.h'], + [ + 'bool8.h', + 'fixed_shape_tensor.h', + 'json.h', + 'opaque.h', + 'parquet_variant.h', + 'uuid.h', + ], ) diff --git a/cpp/src/parquet/arrow/variant_internal.cc b/cpp/src/arrow/extension/parquet_variant.cc similarity index 84% rename from cpp/src/parquet/arrow/variant_internal.cc rename to cpp/src/arrow/extension/parquet_variant.cc index 87f88efaac75..95aa5a0eb68e 100644 --- a/cpp/src/parquet/arrow/variant_internal.cc +++ b/cpp/src/arrow/extension/parquet_variant.cc @@ -15,28 +15,19 @@ // specific language governing permissions and limitations // under the License. 
-#include "parquet/arrow/variant_internal.h" +#include "arrow/extension/parquet_variant.h" #include #include "arrow/extension_type.h" #include "arrow/result.h" #include "arrow/status.h" -#include "arrow/type_fwd.h" #include "arrow/util/logging_internal.h" -namespace parquet::arrow { +namespace arrow::extension { -using ::arrow::Array; -using ::arrow::ArrayData; -using ::arrow::DataType; -using ::arrow::ExtensionType; -using ::arrow::Result; -using ::arrow::Type; - -VariantExtensionType::VariantExtensionType( - const std::shared_ptr<::arrow::DataType>& storage_type) - : ::arrow::ExtensionType(storage_type) { +VariantExtensionType::VariantExtensionType(const std::shared_ptr& storage_type) + : ExtensionType(storage_type) { // GH-45948: Shredded variants will need to handle an optional shredded_value as // well as value_ becoming optional. @@ -66,14 +57,13 @@ std::string VariantExtensionType::Serialize() const { return ""; } std::shared_ptr VariantExtensionType::MakeArray( std::shared_ptr data) const { DCHECK_EQ(data->type->id(), Type::EXTENSION); - DCHECK_EQ("parquet.variant", - ::arrow::internal::checked_cast(*data->type) - .extension_name()); + DCHECK_EQ("arrow.parquet.variant", + internal::checked_cast(*data->type).extension_name()); return std::make_shared(data); } namespace { -bool IsBinaryField(const std::shared_ptr<::arrow::Field> field) { +bool IsBinaryField(const std::shared_ptr field) { return field->type()->storage_id() == Type::BINARY || field->type()->storage_id() == Type::LARGE_BINARY; } @@ -116,8 +106,8 @@ bool VariantExtensionType::IsSupportedStorageType( Result> VariantExtensionType::Make( std::shared_ptr storage_type) { if (!IsSupportedStorageType(storage_type)) { - return ::arrow::Status::Invalid("Invalid storage type for VariantExtensionType: ", - storage_type->ToString()); + return Status::Invalid("Invalid storage type for VariantExtensionType: ", + storage_type->ToString()); } return std::make_shared(std::move(storage_type)); @@ -130,4 +120,4 @@ 
std::shared_ptr variant(std::shared_ptr storage_type) { return VariantExtensionType::Make(std::move(storage_type)).ValueOrDie(); } -} // namespace parquet::arrow +} // namespace arrow::extension diff --git a/cpp/src/parquet/arrow/variant_internal.h b/cpp/src/arrow/extension/parquet_variant.h similarity index 56% rename from cpp/src/parquet/arrow/variant_internal.h rename to cpp/src/arrow/extension/parquet_variant.h index d0b77c72c619..be90923f14e6 100644 --- a/cpp/src/parquet/arrow/variant_internal.h +++ b/cpp/src/arrow/extension/parquet_variant.h @@ -17,17 +17,16 @@ #pragma once -#include #include #include "arrow/extension_type.h" -#include "parquet/platform.h" +#include "arrow/util/visibility.h" -namespace parquet::arrow { +namespace arrow::extension { -class PARQUET_EXPORT VariantArray : public ::arrow::ExtensionArray { +class ARROW_EXPORT VariantArray : public ExtensionArray { public: - using ::arrow::ExtensionArray::ExtensionArray; + using ExtensionArray::ExtensionArray; }; /// EXPERIMENTAL: Variant is not yet fully supported. 
@@ -46,41 +45,37 @@ class PARQUET_EXPORT VariantArray : public ::arrow::ExtensionArray { /// /// To read more about variant shredding, see the variant shredding spec at /// https://github.com/apache/parquet-format/blob/master/VariantShredding.md -class PARQUET_EXPORT VariantExtensionType : public ::arrow::ExtensionType { +class ARROW_EXPORT VariantExtensionType : public ExtensionType { public: - explicit VariantExtensionType(const std::shared_ptr<::arrow::DataType>& storage_type); + explicit VariantExtensionType(const std::shared_ptr& storage_type); - std::string extension_name() const override { return "parquet.variant"; } + std::string extension_name() const override { return "arrow.parquet.variant"; } - bool ExtensionEquals(const ::arrow::ExtensionType& other) const override; + bool ExtensionEquals(const ExtensionType& other) const override; - ::arrow::Result> Deserialize( - std::shared_ptr<::arrow::DataType> storage_type, + Result> Deserialize( + std::shared_ptr storage_type, const std::string& serialized_data) const override; std::string Serialize() const override; - std::shared_ptr<::arrow::Array> MakeArray( - std::shared_ptr<::arrow::ArrayData> data) const override; + std::shared_ptr MakeArray(std::shared_ptr data) const override; - static ::arrow::Result> Make( - std::shared_ptr<::arrow::DataType> storage_type); + static Result> Make(std::shared_ptr storage_type); - static bool IsSupportedStorageType( - const std::shared_ptr<::arrow::DataType>& storage_type); + static bool IsSupportedStorageType(const std::shared_ptr& storage_type); - std::shared_ptr<::arrow::Field> metadata() const { return metadata_; } + std::shared_ptr metadata() const { return metadata_; } - std::shared_ptr<::arrow::Field> value() const { return value_; } + std::shared_ptr value() const { return value_; } private: // TODO GH-45948 added shredded_value - std::shared_ptr<::arrow::Field> metadata_; - std::shared_ptr<::arrow::Field> value_; + std::shared_ptr metadata_; + 
std::shared_ptr value_; }; /// \brief Return a VariantExtensionType instance. -PARQUET_EXPORT std::shared_ptr<::arrow::DataType> variant( - std::shared_ptr<::arrow::DataType> storage_type); +ARROW_EXPORT std::shared_ptr variant(std::shared_ptr storage_type); -} // namespace parquet::arrow +} // namespace arrow::extension diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 6580476d38c8..7aa3e58c1d3b 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -38,6 +38,7 @@ #include #include "arrow/buffer.h" +#include "arrow/config.h" #include "arrow/filesystem/path_util.h" #include "arrow/filesystem/util_internal.h" #include "arrow/io/util_internal.h" @@ -303,6 +304,10 @@ Status ExceptionToStatus(const Azure::Core::RequestFailedException& exception, return Status::IOError(std::forward(prefix_args)..., " Azure Error: [", exception.ErrorCode, "] ", exception.what()); } + +std::string BuildApplicationId() { + return "azpartner-arrow/" + GetBuildInfo().version_string; +} } // namespace std::string AzureOptions::AccountBlobUrl(const std::string& account_name) const { @@ -386,9 +391,12 @@ Result> AzureOptions::MakeBlobServiceC return Status::Invalid("AzureOptions::blob_storage_scheme must be http or https: ", blob_storage_scheme); } + Blobs::BlobClientOptions client_options; + client_options.Telemetry.ApplicationId = BuildApplicationId(); switch (credential_kind_) { case CredentialKind::kAnonymous: - return std::make_unique(AccountBlobUrl(account_name)); + return std::make_unique(AccountBlobUrl(account_name), + client_options); case CredentialKind::kDefault: if (!token_credential_) { token_credential_ = std::make_shared(); @@ -399,14 +407,14 @@ Result> AzureOptions::MakeBlobServiceC case CredentialKind::kCLI: case CredentialKind::kWorkloadIdentity: case CredentialKind::kEnvironment: - return std::make_unique(AccountBlobUrl(account_name), - token_credential_); + return std::make_unique( + 
AccountBlobUrl(account_name), token_credential_, client_options); case CredentialKind::kStorageSharedKey: - return std::make_unique(AccountBlobUrl(account_name), - storage_shared_key_credential_); + return std::make_unique( + AccountBlobUrl(account_name), storage_shared_key_credential_, client_options); case CredentialKind::kSASToken: - return std::make_unique(AccountBlobUrl(account_name) + - sas_token_); + return std::make_unique( + AccountBlobUrl(account_name) + sas_token_, client_options); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); } @@ -420,10 +428,12 @@ AzureOptions::MakeDataLakeServiceClient() const { return Status::Invalid("AzureOptions::dfs_storage_scheme must be http or https: ", dfs_storage_scheme); } + DataLake::DataLakeClientOptions client_options; + client_options.Telemetry.ApplicationId = BuildApplicationId(); switch (credential_kind_) { case CredentialKind::kAnonymous: return std::make_unique( - AccountDfsUrl(account_name)); + AccountDfsUrl(account_name), client_options); case CredentialKind::kDefault: if (!token_credential_) { token_credential_ = std::make_shared(); @@ -435,13 +445,13 @@ AzureOptions::MakeDataLakeServiceClient() const { case CredentialKind::kWorkloadIdentity: case CredentialKind::kEnvironment: return std::make_unique( - AccountDfsUrl(account_name), token_credential_); + AccountDfsUrl(account_name), token_credential_, client_options); case CredentialKind::kStorageSharedKey: return std::make_unique( - AccountDfsUrl(account_name), storage_shared_key_credential_); + AccountDfsUrl(account_name), storage_shared_key_credential_, client_options); case CredentialKind::kSASToken: return std::make_unique( - AccountBlobUrl(account_name) + sas_token_); + AccountBlobUrl(account_name) + sas_token_, client_options); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); } @@ -967,10 +977,36 @@ Status StageBlock(Blobs::BlockBlobClient* block_blob_client, const std::string& 
return Status::OK(); } +// Usually if the first page is empty it means there are no results. This was assumed in +// several places in AzureFilesystem. The Azure docs do not guarantee this and we have +// evidence (GH-49043) that there can be subsequent non-empty pages. +// Applying `SkipStartingEmptyPages` on a paged response corrects this assumption. +void SkipStartingEmptyPages(Blobs::ListBlobContainersPagedResponse& paged_response) { + while (paged_response.HasPage() && paged_response.BlobContainers.empty()) { + paged_response.MoveToNextPage(); + } +} +void SkipStartingEmptyPages(Blobs::ListBlobsPagedResponse& paged_response) { + while (paged_response.HasPage() && paged_response.Blobs.size() == 0) { + paged_response.MoveToNextPage(); + } +} +void SkipStartingEmptyPages(Blobs::ListBlobsByHierarchyPagedResponse& paged_response) { + while (paged_response.HasPage() && paged_response.Blobs.empty() && + paged_response.BlobPrefixes.empty()) { + paged_response.MoveToNextPage(); + } +} +void SkipStartingEmptyPages(DataLake::ListPathsPagedResponse& paged_response) { + while (paged_response.HasPage() && paged_response.Paths.empty()) { + paged_response.MoveToNextPage(); + } +} + /// Writes will be buffered up to this size (in bytes) before actually uploading them. static constexpr int64_t kBlockUploadSizeBytes = 10 * 1024 * 1024; /// The maximum size of a block in Azure Blob (as per docs). -static constexpr int64_t kMaxBlockSizeBytes = 4UL * 1024 * 1024 * 1024; +static constexpr int64_t kMaxBlockSizeBytes = 4LL * 1024 * 1024 * 1024; /// This output stream, similar to other arrow OutputStreams, is not thread-safe. class ObjectAppendStream final : public io::OutputStream { @@ -1362,7 +1398,7 @@ Result CheckIfHierarchicalNamespaceIsEnabled( // without hierarchical namespace enabled. directory_client.GetAccessControlList(); return HNSSupport::kEnabled; - } catch (std::out_of_range& exception) { + } catch (const std::out_of_range&) { // Azurite issue detected. 
DCHECK(IsDfsEmulator(options)); return HNSSupport::kDisabled; @@ -1805,12 +1841,14 @@ class AzureFileSystem::Impl { try { FileInfo info{location.all}; auto list_response = container_client.ListBlobsByHierarchy(kDelimiter, options); + SkipStartingEmptyPages(list_response); // Since PageSizeHint=1, we expect at most one entry in either Blobs or // BlobPrefixes. A BlobPrefix always ends with kDelimiter ("/"), so we can // distinguish between a directory and a file by checking if we received a // prefix or a blob. // This strategy allows us to implement GetFileInfo with just 1 blob storage // operation in almost every case. + if (!list_response.BlobPrefixes.empty()) { // Ensure the returned BlobPrefixes[0] string doesn't contain more characters than // the requested Prefix. For instance, if we request with Prefix="dir/abra" and @@ -1847,6 +1885,7 @@ class AzureFileSystem::Impl { // whether the path is a directory. options.Prefix = internal::EnsureTrailingSlash(location.path); auto list_with_trailing_slash_response = container_client.ListBlobs(options); + SkipStartingEmptyPages(list_with_trailing_slash_response); if (!list_with_trailing_slash_response.Blobs.empty()) { info.set_type(FileType::Directory); return info; @@ -1909,6 +1948,7 @@ class AzureFileSystem::Impl { try { auto container_list_response = blob_service_client_->ListBlobContainers(options, context); + SkipStartingEmptyPages(container_list_response); for (; container_list_response.HasPage(); container_list_response.MoveToNextPage(context)) { for (const auto& container : container_list_response.BlobContainers) { @@ -1950,6 +1990,7 @@ class AzureFileSystem::Impl { auto base_path_depth = internal::GetAbstractPathDepth(base_location.path); try { auto list_response = directory_client.ListPaths(select.recursive, options, context); + SkipStartingEmptyPages(list_response); for (; list_response.HasPage(); list_response.MoveToNextPage(context)) { if (list_response.Paths.empty()) { continue; @@ -2040,6 +2081,7 @@ class 
AzureFileSystem::Impl { try { auto list_response = container_client.ListBlobsByHierarchy(/*delimiter=*/"/", options, context); + SkipStartingEmptyPages(list_response); for (; list_response.HasPage(); list_response.MoveToNextPage(context)) { if (list_response.Blobs.empty() && list_response.BlobPrefixes.empty()) { continue; @@ -2442,6 +2484,7 @@ class AzureFileSystem::Impl { bool found_dir_marker_blob = false; try { auto list_response = container_client.ListBlobs(options); + SkipStartingEmptyPages(list_response); if (list_response.Blobs.empty()) { if (require_dir_to_exist) { return PathNotFound(location); @@ -2499,7 +2542,7 @@ class AzureFileSystem::Impl { try { auto delete_result = deferred_response.GetResponse(); success = delete_result.Value.Deleted; - } catch (const Core::RequestFailedException& exception) { + } catch (const Core::RequestFailedException&) { success = false; } if (!success) { @@ -2575,6 +2618,7 @@ class AzureFileSystem::Impl { auto directory_client = adlfs_client.GetDirectoryClient(location.path); try { auto list_response = directory_client.ListPaths(false); + SkipStartingEmptyPages(list_response); for (; list_response.HasPage(); list_response.MoveToNextPage()) { for (const auto& path : list_response.Paths) { if (path.IsDirectory) { @@ -2899,6 +2943,7 @@ class AzureFileSystem::Impl { list_blobs_options.PageSizeHint = 1; try { auto dest_list_response = dest_container_client.ListBlobs(list_blobs_options); + SkipStartingEmptyPages(dest_list_response); dest_is_empty = dest_list_response.Blobs.empty(); if (!dest_is_empty) { return NotEmpty(dest); @@ -2952,6 +2997,7 @@ class AzureFileSystem::Impl { list_blobs_options.PageSizeHint = 1; try { auto src_list_response = src_container_client.ListBlobs(list_blobs_options); + SkipStartingEmptyPages(src_list_response); if (!src_list_response.Blobs.empty()) { // Reminder: dest is used here because we're semantically replacing dest // with src. By deleting src if it's empty just like dest. 
@@ -3218,6 +3264,8 @@ class AzureFileSystem::Impl { std::atomic LeaseGuard::latest_known_expiry_time_ = SteadyClock::time_point{SteadyClock::duration::zero()}; +AzureFileSystem::~AzureFileSystem() = default; + AzureFileSystem::AzureFileSystem(std::unique_ptr&& impl) : FileSystem(impl->io_context()), impl_(std::move(impl)) { default_async_is_sync_ = false; diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index ee0956afdd7a..ae374d487b1a 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -251,7 +251,7 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem { void ForceCachedHierarchicalNamespaceSupport(int hns_support); public: - ~AzureFileSystem() override = default; + ~AzureFileSystem() override; static Result> Make( const AzureOptions& options, const io::IOContext& = io::default_io_context()); diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index f75fd970a1ee..0c15f6f18444 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -119,7 +119,6 @@ #include "arrow/util/string.h" #include "arrow/util/task_group.h" #include "arrow/util/thread_pool.h" -#include "arrow/util/value_parsing.h" namespace arrow::fs { @@ -3579,9 +3578,10 @@ S3GlobalOptions S3GlobalOptions::Defaults() { log_level = S3LogLevel::Off; } - value = arrow::internal::GetEnvVar("ARROW_S3_THREADS").ValueOr("1"); - if (uint32_t u; ::arrow::internal::ParseUnsigned(value.data(), value.size(), &u)) { - num_event_loop_threads = u; + auto maybe_num_threads = + arrow::internal::GetEnvVarInteger("ARROW_S3_THREADS", /*min_value=*/1); + if (maybe_num_threads.ok()) { + num_event_loop_threads = static_cast(*maybe_num_threads); } return S3GlobalOptions{log_level, num_event_loop_threads}; } diff --git a/cpp/src/arrow/flight/cookie_internal.cc b/cpp/src/arrow/flight/cookie_internal.cc index 99fa8b238ddc..df09a77afb72 100644 --- a/cpp/src/arrow/flight/cookie_internal.cc +++ 
b/cpp/src/arrow/flight/cookie_internal.cc @@ -64,6 +64,11 @@ size_t CaseInsensitiveHash::operator()(const std::string& key) const { return std::hash{}(upper_string); } +bool CaseInsensitiveEqual::operator()(const std::string& lhs, + const std::string& rhs) const { + return strcasecmp(lhs.c_str(), rhs.c_str()) == 0; +} + Cookie Cookie::Parse(std::string_view cookie_header_value) { // Parse the cookie string. If the cookie has an expiration, record it. // If the cookie has a max-age, calculate the current time + max_age and set that as diff --git a/cpp/src/arrow/flight/cookie_internal.h b/cpp/src/arrow/flight/cookie_internal.h index 62c0390c585b..98b936edb338 100644 --- a/cpp/src/arrow/flight/cookie_internal.h +++ b/cpp/src/arrow/flight/cookie_internal.h @@ -41,6 +41,12 @@ class ARROW_FLIGHT_EXPORT CaseInsensitiveComparator { bool operator()(const std::string& t1, const std::string& t2) const; }; +/// \brief Case insensitive equality comparator for use by unordered cookie map. +class ARROW_FLIGHT_EXPORT CaseInsensitiveEqual { + public: + bool operator()(const std::string& lhs, const std::string& rhs) const; +}; + /// \brief Case insensitive hasher for use by cookie caching map. Cookies are not /// case-sensitive. class ARROW_FLIGHT_EXPORT CaseInsensitiveHash { @@ -117,7 +123,7 @@ class ARROW_FLIGHT_EXPORT CookieCache { // Mutex must be used to protect cookie cache. 
std::mutex mutex_; - std::unordered_map + std::unordered_map cookies; }; diff --git a/cpp/src/arrow/flight/sql/CMakeLists.txt b/cpp/src/arrow/flight/sql/CMakeLists.txt index 9695e0c9917a..2299bdfe0a3b 100644 --- a/cpp/src/arrow/flight/sql/CMakeLists.txt +++ b/cpp/src/arrow/flight/sql/CMakeLists.txt @@ -95,7 +95,7 @@ endif() if(MSVC) # Suppress warnings caused by Protobuf (casts) - set_source_files_properties(protocol_internal.cc PROPERTIES COMPILE_FLAGS "/wd4267") + set_source_files_properties(protocol_internal.cc PROPERTIES COMPILE_OPTIONS "/wd4267") endif() foreach(LIB_TARGET ${ARROW_FLIGHT_SQL_LIBRARIES}) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_FLIGHT_SQL_EXPORTING) diff --git a/cpp/src/arrow/flight/sql/odbc/CMakeLists.txt b/cpp/src/arrow/flight/sql/odbc/CMakeLists.txt index c18a8e5de95b..39040c45024d 100644 --- a/cpp/src/arrow/flight/sql/odbc/CMakeLists.txt +++ b/cpp/src/arrow/flight/sql/odbc/CMakeLists.txt @@ -106,6 +106,8 @@ if(ARROW_FLIGHT_SQL_ODBC_INSTALLER) set(CPACK_PACKAGE_VERSION_PATCH ${ODBC_PACKAGE_VERSION_PATCH}) set(CPACK_PACKAGE_NAME ${ODBC_PACKAGE_NAME}) + # Make sure the MSI name contains only hyphens, not spaces + string(REPLACE " " "-" CPACK_PACKAGE_NAME "${CPACK_PACKAGE_NAME}") set(CPACK_PACKAGE_VENDOR ${ODBC_PACKAGE_VENDOR}) set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Apache Arrow Flight SQL ODBC Driver") set(CPACK_PACKAGE_CONTACT "dev@arrow.apache.org") diff --git a/cpp/src/arrow/flight/sql/odbc/README.md b/cpp/src/arrow/flight/sql/odbc/README.md index 8c2d9705a1dc..a8f3bc727f73 100644 --- a/cpp/src/arrow/flight/sql/odbc/README.md +++ b/cpp/src/arrow/flight/sql/odbc/README.md @@ -47,7 +47,7 @@ should show as an available ODBC driver in the x64 ODBC Driver Manager. 3. `cd` to `build` folder. 4. Run `cpack`. -If the generation is successful, you will find `Apache Arrow Flight SQL ODBC--win64.msi` generated under the `build` folder. 
+If the generation is successful, you will find `Apache-Arrow-Flight-SQL-ODBC--win64.msi` generated under the `build` folder. ## Steps to Enable Logging diff --git a/cpp/src/arrow/flight/sql/odbc/install/mac/install_odbc.sh b/cpp/src/arrow/flight/sql/odbc/install/mac/install_odbc.sh new file mode 100755 index 000000000000..069c534c2973 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/install/mac/install_odbc.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Used by macOS ODBC installer script `install_odbc_ini.sh` and macOS ODBC testing + +set -euo pipefail + +# Admin privilege is needed to add ODBC driver registration +if [ $EUID -ne 0 ]; then + echo "Please run this script with sudo" + exit 1 +fi + +ODBC_64BIT="$1" + +if [[ -z "$ODBC_64BIT" ]]; then + echo "error: 64-bit driver is not specified. Call format: install_odbc abs_path_to_64_bit_driver" + exit 1 +fi + +if [ ! 
-f "$ODBC_64BIT" ]; then + echo "64-bit driver can not be found: $ODBC_64BIT" + echo "Call format: install_odbc abs_path_to_64_bit_driver" + exit 1 +fi + +USER_ODBCINST_FILE="$HOME/Library/ODBC/odbcinst.ini" +DRIVER_NAME="Apache Arrow Flight SQL ODBC Driver" + +mkdir -p "$HOME"/Library/ODBC + +touch "$USER_ODBCINST_FILE" + +if grep -q "^\[$DRIVER_NAME\]" "$USER_ODBCINST_FILE"; then + echo "Driver [$DRIVER_NAME] already exists in odbcinst.ini" +else + echo "Adding [$DRIVER_NAME] to odbcinst.ini..." + echo " +[$DRIVER_NAME] +Description=An ODBC Driver for Apache Arrow Flight SQL +Driver=$ODBC_64BIT +" >>"$USER_ODBCINST_FILE" +fi + +# Check if [ODBC Drivers] section exists +if grep -q '^\[ODBC Drivers\]' "$USER_ODBCINST_FILE"; then + # Section exists: check if driver entry exists + if ! grep -q "^${DRIVER_NAME}=" "$USER_ODBCINST_FILE"; then + # Driver entry does not exist, add under [ODBC Drivers] + sed -i '' "/^\[ODBC Drivers\]/a\\ +${DRIVER_NAME}=Installed +" "$USER_ODBCINST_FILE" + fi +else + # Section doesn't exist, append both section and driver entry at end + { + echo "" + echo "[ODBC Drivers]" + echo "${DRIVER_NAME}=Installed" + } >>"$USER_ODBCINST_FILE" +fi diff --git a/cpp/src/arrow/flight/sql/odbc/odbc_api.cc b/cpp/src/arrow/flight/sql/odbc/odbc_api.cc index b50c7db609f6..5676b9b05ed9 100644 --- a/cpp/src/arrow/flight/sql/odbc/odbc_api.cc +++ b/cpp/src/arrow/flight/sql/odbc/odbc_api.cc @@ -855,7 +855,7 @@ SQLRETURN SQLDriverConnect(SQLHDBC conn, SQLHWND window_handle, } #else // Attempt connection without loading DSN window on macOS/Linux - connection->Connect(dsn, properties, missing_properties); + connection->Connect(dsn_value, properties, missing_properties); #endif // Copy connection string to out_connection_string after connection attempt return ODBC::GetStringAttribute(true, connection_string, false, out_connection_string, diff --git a/cpp/src/arrow/flight/sql/odbc/odbc_impl/CMakeLists.txt b/cpp/src/arrow/flight/sql/odbc/odbc_impl/CMakeLists.txt index 
a1042cde97b0..e58558258df0 100644 --- a/cpp/src/arrow/flight/sql/odbc/odbc_impl/CMakeLists.txt +++ b/cpp/src/arrow/flight/sql/odbc/odbc_impl/CMakeLists.txt @@ -45,10 +45,10 @@ add_library(arrow_odbc_spi_impl config/connection_string_parser.h diagnostics.cc diagnostics.h - error_codes.h encoding.cc encoding.h encoding_utils.h + error_codes.h exceptions.cc exceptions.h flight_sql_auth_method.cc @@ -130,9 +130,18 @@ if(WIN32) system_dsn.h) endif() -target_link_libraries(arrow_odbc_spi_impl - PUBLIC arrow_flight_sql_shared arrow_compute_shared Boost::locale - ${ODBCINST}) +if(APPLE) + target_include_directories(arrow_odbc_spi_impl SYSTEM BEFORE PUBLIC ${ODBC_INCLUDE_DIR}) + target_link_libraries(arrow_odbc_spi_impl + PUBLIC arrow_flight_sql_shared arrow_compute_shared Boost::locale + iodbc) +else() + find_package(ODBC REQUIRED) + target_include_directories(arrow_odbc_spi_impl PUBLIC ${ODBC_INCLUDE_DIR}) + target_link_libraries(arrow_odbc_spi_impl + PUBLIC arrow_flight_sql_shared arrow_compute_shared Boost::locale + ${ODBCINST}) +endif() set_target_properties(arrow_odbc_spi_impl PROPERTIES ARCHIVE_OUTPUT_DIRECTORY diff --git a/cpp/src/arrow/flight/sql/odbc/odbc_impl/address_info.cc b/cpp/src/arrow/flight/sql/odbc/odbc_impl/address_info.cc index 5ee6674c3c26..7bdb4d58cf82 100644 --- a/cpp/src/arrow/flight/sql/odbc/odbc_impl/address_info.cc +++ b/cpp/src/arrow/flight/sql/odbc/odbc_impl/address_info.cc @@ -16,6 +16,7 @@ // under the License. 
#include "arrow/flight/sql/odbc/odbc_impl/address_info.h" +#include namespace driver { @@ -34,7 +35,7 @@ bool AddressInfo::GetAddressInfo(const std::string& host, char* host_name_info, } error = getnameinfo(addrinfo_result_->ai_addr, addrinfo_result_->ai_addrlen, - host_name_info, static_cast(max_host), NULL, 0, 0); + host_name_info, static_cast(max_host), NULL, 0, 0); return error == 0; } diff --git a/cpp/src/arrow/flight/sql/odbc/odbc_impl/config/configuration.h b/cpp/src/arrow/flight/sql/odbc/odbc_impl/config/configuration.h index 9b59f346b29c..66a267639976 100644 --- a/cpp/src/arrow/flight/sql/odbc/odbc_impl/config/configuration.h +++ b/cpp/src/arrow/flight/sql/odbc/odbc_impl/config/configuration.h @@ -22,8 +22,10 @@ #include "arrow/flight/sql/odbc/odbc_impl/platform.h" #include "arrow/flight/sql/odbc/odbc_impl/spi/connection.h" +#if defined _WIN32 // winuser.h needs to be included after windows.h, which is defined in platform.h -#include +# include +#endif namespace arrow::flight::sql::odbc { namespace config { diff --git a/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_connection.cc b/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_connection.cc index 422c45fc0590..8b2b564d8db8 100644 --- a/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_connection.cc +++ b/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_connection.cc @@ -157,9 +157,6 @@ void FlightSqlConnection::Connect(const ConnPropertyMap& properties, client_options_ = BuildFlightClientOptions(properties, missing_attr, flight_ssl_configs); - const std::shared_ptr& cookie_factory = GetCookieFactory(); - client_options_.middleware.push_back(cookie_factory); - std::unique_ptr flight_client; ThrowIfNotOK(FlightClient::Connect(location, client_options_).Value(&flight_client)); PopulateMetadataSettings(properties); diff --git a/cpp/src/arrow/flight/sql/odbc/odbc_impl/odbc_handle.h b/cpp/src/arrow/flight/sql/odbc/odbc_impl/odbc_handle.h index b3fd6e371a26..9dd8fe37baf6 100644 --- 
a/cpp/src/arrow/flight/sql/odbc/odbc_impl/odbc_handle.h +++ b/cpp/src/arrow/flight/sql/odbc/odbc_impl/odbc_handle.h @@ -46,7 +46,18 @@ class ODBCHandle { try { GetDiagnostics().Clear(); rc = function(); - } catch (const arrow::flight::sql::odbc::DriverException& ex) { + } catch (const arrow::flight::sql::odbc::AuthenticationException& ex) { + GetDiagnostics().AddError(arrow::flight::sql::odbc::DriverException( + ex.GetMessageText(), ex.GetSqlState(), ex.GetNativeError())); + } catch (const arrow::flight::sql::odbc::NullWithoutIndicatorException& ex) { + GetDiagnostics().AddError(arrow::flight::sql::odbc::DriverException( + ex.GetMessageText(), ex.GetSqlState(), ex.GetNativeError())); + } + // on mac, DriverException doesn't catch the subclass exceptions hence we added + // the following above. + // GH-48278 TODO investigate if there is a way to catch all the subclass exceptions + // under DriverException + catch (const arrow::flight::sql::odbc::DriverException& ex) { GetDiagnostics().AddError(ex); } catch (const std::bad_alloc&) { GetDiagnostics().AddError(arrow::flight::sql::odbc::DriverException( diff --git a/cpp/src/arrow/flight/sql/odbc/tests/CMakeLists.txt b/cpp/src/arrow/flight/sql/odbc/tests/CMakeLists.txt index 5485ef9b4d49..ef0c7271ec23 100644 --- a/cpp/src/arrow/flight/sql/odbc/tests/CMakeLists.txt +++ b/cpp/src/arrow/flight/sql/odbc/tests/CMakeLists.txt @@ -15,11 +15,6 @@ # specific language governing permissions and limitations # under the License. -add_custom_target(tests) - -find_package(ODBC REQUIRED) -include_directories(${ODBC_INCLUDE_DIRS}) - find_package(SQLite3Alt REQUIRED) set(ARROW_FLIGHT_SQL_MOCK_SERVER_SRCS @@ -54,5 +49,8 @@ add_arrow_test(flight_sql_odbc_test ${SQLite3_LIBRARIES} arrow_odbc_spi_impl) +find_package(ODBC REQUIRED) +target_link_libraries(arrow-flight-sql-odbc-test PRIVATE ODBC::ODBC) + # Disable unity build due to sqlite_sql_info.cc conflict with sql.h and sqlext.h headers. 
set_target_properties(arrow-flight-sql-odbc-test PROPERTIES UNITY_BUILD OFF) diff --git a/cpp/src/arrow/flight/sql/odbc/tests/connection_test.cc b/cpp/src/arrow/flight/sql/odbc/tests/connection_test.cc index b1081bc1d6af..3ca4a50ef769 100644 --- a/cpp/src/arrow/flight/sql/odbc/tests/connection_test.cc +++ b/cpp/src/arrow/flight/sql/odbc/tests/connection_test.cc @@ -442,7 +442,7 @@ TEST_F(ConnectionRemoteTest, TestSQLDriverConnectInvalidUid) { arrow::util::UTF8ToWideString(connect_str)); std::vector connect_str0(wconnect_str.begin(), wconnect_str.end()); - SQLWCHAR out_str[kOdbcBufferSize]; + SQLWCHAR out_str[kOdbcBufferSize] = {0}; SQLSMALLINT out_str_len; // Connecting to ODBC server. diff --git a/cpp/src/arrow/flight/sql/odbc/tests/odbc_test_suite.cc b/cpp/src/arrow/flight/sql/odbc/tests/odbc_test_suite.cc index 3f12e35c6d64..470a68b3beb3 100644 --- a/cpp/src/arrow/flight/sql/odbc/tests/odbc_test_suite.cc +++ b/cpp/src/arrow/flight/sql/odbc/tests/odbc_test_suite.cc @@ -130,9 +130,9 @@ std::wstring ODBCRemoteTestBase::GetQueryAllDataTypes() { CAST(true AS BOOLEAN) AS bit_true, --Character types - 'Z' AS c_char, '你' AS c_wchar, + 'Z' AS c_char, _utf8'你' AS c_wchar, - '你好' AS c_wvarchar, + _utf8'你好' AS c_wvarchar, 'XYZ' AS c_varchar, @@ -245,7 +245,7 @@ std::string ODBCMockTestBase::GetConnectionString() { std::string connect_str( "driver={Apache Arrow Flight SQL ODBC Driver};HOST=localhost;port=" + std::to_string(port) + ";token=" + std::string(kTestToken) + - ";useEncryption=false;"); + ";useEncryption=false;UseWideChar=true;"); return connect_str; } diff --git a/cpp/src/arrow/flight/sql/odbc/tests/odbc_test_suite.h b/cpp/src/arrow/flight/sql/odbc/tests/odbc_test_suite.h index 7dd77d8fa62d..3115cd627547 100644 --- a/cpp/src/arrow/flight/sql/odbc/tests/odbc_test_suite.h +++ b/cpp/src/arrow/flight/sql/odbc/tests/odbc_test_suite.h @@ -216,8 +216,8 @@ bool CompareConnPropertyMap(Connection::ConnPropertyMap map1, std::string GetOdbcErrorMessage(SQLSMALLINT handle_type, 
SQLHANDLE handle); static constexpr std::string_view kErrorState01004 = "01004"; -static constexpr std::string_view kErrorState01S07 = "01S07"; static constexpr std::string_view kErrorState01S02 = "01S02"; +static constexpr std::string_view kErrorState01S07 = "01S07"; static constexpr std::string_view kErrorState07009 = "07009"; static constexpr std::string_view kErrorState08003 = "08003"; static constexpr std::string_view kErrorState22002 = "22002"; @@ -236,7 +236,10 @@ static constexpr std::string_view kErrorStateHY106 = "HY106"; static constexpr std::string_view kErrorStateHY114 = "HY114"; static constexpr std::string_view kErrorStateHY118 = "HY118"; static constexpr std::string_view kErrorStateHYC00 = "HYC00"; +static constexpr std::string_view kErrorStateS1002 = "S1002"; static constexpr std::string_view kErrorStateS1004 = "S1004"; +static constexpr std::string_view kErrorStateS1010 = "S1010"; +static constexpr std::string_view kErrorStateS1090 = "S1090"; /// Verify ODBC Error State void VerifyOdbcErrorState(SQLSMALLINT handle_type, SQLHANDLE handle, diff --git a/cpp/src/arrow/flight/sql/odbc/tests/statement_attr_test.cc b/cpp/src/arrow/flight/sql/odbc/tests/statement_attr_test.cc index 5b6821430a11..0a4e99d33a6f 100644 --- a/cpp/src/arrow/flight/sql/odbc/tests/statement_attr_test.cc +++ b/cpp/src/arrow/flight/sql/odbc/tests/statement_attr_test.cc @@ -63,6 +63,8 @@ void GetStmtAttr(SQLHSTMT statement, SQLINTEGER attribute, SQLPOINTER* value) { SQLGetStmtAttr(statement, attribute, value, SQL_IS_POINTER, &string_length)); } +#if defined(SQL_ATTR_ASYNC_STMT_EVENT) || defined(SQL_ATTR_ASYNC_STMT_PCALLBACK) || \ + defined(SQL_ATTR_ASYNC_STMT_PCONTEXT) // Validate error return value and code void ValidateGetStmtAttrErrorCode(SQLHSTMT statement, SQLINTEGER attribute, std::string_view error_code) { @@ -74,6 +76,8 @@ void ValidateGetStmtAttrErrorCode(SQLHSTMT statement, SQLINTEGER attribute, VerifyOdbcErrorState(SQL_HANDLE_STMT, statement, error_code); } +#endif // 
SQL_ATTR_ASYNC_STMT_EVENT || SQL_ATTR_ASYNC_STMT_PCALLBACK || + // SQL_ATTR_ASYNC_STMT_PCONTEXT // Validate return value for call to SQLSetStmtAttr with SQLULEN void ValidateSetStmtAttr(SQLHSTMT statement, SQLINTEGER attribute, SQLULEN new_value) { diff --git a/cpp/src/arrow/integration/CMakeLists.txt b/cpp/src/arrow/integration/CMakeLists.txt index fd239ff2ab42..267d0adf11bf 100644 --- a/cpp/src/arrow/integration/CMakeLists.txt +++ b/cpp/src/arrow/integration/CMakeLists.txt @@ -33,7 +33,9 @@ elseif(ARROW_BUILD_INTEGRATION) add_dependencies(arrow-json-integration-test arrow arrow_testing) add_dependencies(arrow-integration arrow-json-integration-test) +endif() +if(ARROW_BUILD_INTEGRATION) add_arrow_lib(arrow_c_data_integration SOURCES c_data_integration_internal.cc diff --git a/cpp/src/arrow/io/CMakeLists.txt b/cpp/src/arrow/io/CMakeLists.txt index 623fcde413d8..f37844026dfc 100644 --- a/cpp/src/arrow/io/CMakeLists.txt +++ b/cpp/src/arrow/io/CMakeLists.txt @@ -45,8 +45,9 @@ add_arrow_test(memory_test PREFIX "arrow-io") add_arrow_benchmark(file_benchmark PREFIX "arrow-io") -if(NOT (${ARROW_SIMD_LEVEL} STREQUAL "NONE") AND NOT (${ARROW_SIMD_LEVEL} STREQUAL "NEON" - )) +if(DEFINED ARROW_SIMD_LEVEL + AND NOT (ARROW_SIMD_LEVEL STREQUAL "NONE") + AND NOT (ARROW_SIMD_LEVEL STREQUAL "NEON")) # This benchmark either requires SSE4.2 or ARMV8 SIMD to be enabled add_arrow_benchmark(memory_benchmark PREFIX "arrow-io") endif() diff --git a/cpp/src/arrow/io/interfaces.cc b/cpp/src/arrow/io/interfaces.cc index 12c124ce213f..cdd2470b629c 100644 --- a/cpp/src/arrow/io/interfaces.cc +++ b/cpp/src/arrow/io/interfaces.cc @@ -390,23 +390,15 @@ namespace { constexpr int kDefaultNumIoThreads = 8; std::shared_ptr MakeIOThreadPool() { - int threads = 0; - auto maybe_env_var = ::arrow::internal::GetEnvVar("ARROW_IO_THREADS"); - if (maybe_env_var.ok()) { - auto str = *std::move(maybe_env_var); - if (!str.empty()) { - try { - threads = std::stoi(str); - } catch (...) 
{ - } - if (threads <= 0) { - ARROW_LOG(WARNING) - << "ARROW_IO_THREADS does not contain a valid number of threads " - "(should be an integer > 0)"; - } - } + int threads = kDefaultNumIoThreads; + auto maybe_num_threads = ::arrow::internal::GetEnvVarInteger( + "ARROW_IO_THREADS", /*min_value=*/1, /*max_value=*/std::numeric_limits::max()); + if (maybe_num_threads.ok()) { + threads = static_cast(*maybe_num_threads); + } else if (!maybe_num_threads.status().IsKeyError()) { + maybe_num_threads.status().Warn(); } - auto maybe_pool = ThreadPool::MakeEternal(threads > 0 ? threads : kDefaultNumIoThreads); + auto maybe_pool = ThreadPool::MakeEternal(threads); if (!maybe_pool.ok()) { maybe_pool.status().Abort("Failed to create global IO thread pool"); } diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index f6c6f342a099..6aceaa7f4480 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -57,6 +57,9 @@ using internal::checked_cast; class ExtensionType; namespace ipc { + +using internal::kArrowMagicBytes; + namespace feather { namespace { @@ -787,8 +790,8 @@ Result> Reader::Open( // IPC Read options are ignored for ReaderV1 RETURN_NOT_OK(result->Open(source)); return result; - } else if (memcmp(buffer->data(), internal::kArrowMagicBytes, - strlen(internal::kArrowMagicBytes)) == 0) { + } else if (std::string_view(buffer->data_as(), kArrowMagicBytes.size()) == + kArrowMagicBytes) { std::shared_ptr result = std::make_shared(); RETURN_NOT_OK(result->Open(source, options)); return result; diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 8be09956f102..c21eb913c389 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -363,9 +364,13 @@ Result> ReadMessage(std::shared_ptr metadata, } } -Result> ReadMessage(int64_t offset, int32_t metadata_length, - io::RandomAccessFile* file, - const 
FieldsLoaderFunction& fields_loader) { +// Common helper for the two ReadMessage overloads that take a file + offset. +// When body_length is provided, metadata and body are read in a single IO. +// When body_length is absent, metadata is read first, then the body is read +// separately. +static Result> ReadMessageInternal( + int64_t offset, int32_t metadata_length, std::optional body_length, + io::RandomAccessFile* file, const FieldsLoaderFunction& fields_loader) { std::unique_ptr result; auto listener = std::make_shared(&result); MessageDecoder decoder(listener); @@ -375,15 +380,18 @@ Result> ReadMessage(int64_t offset, int32_t metadata_le decoder.next_required_size()); } - // TODO(GH-48846): we should take a body_length just like ReadMessageAsync - // and read metadata + body in one go. - ARROW_ASSIGN_OR_RAISE(auto metadata, file->ReadAt(offset, metadata_length)); + // When body_length is known, read metadata + body in one IO call. + // Otherwise, read only metadata first. + ARROW_ASSIGN_OR_RAISE(std::shared_ptr metadata, + file->ReadAt(offset, metadata_length + body_length.value_or(0))); + if (metadata->size() < metadata_length) { return Status::Invalid("Expected to read ", metadata_length, " metadata bytes at offset ", offset, " but got ", metadata->size()); } - ARROW_RETURN_NOT_OK(decoder.Consume(metadata)); + + ARROW_RETURN_NOT_OK(decoder.Consume(SliceBuffer(metadata, 0, metadata_length))); switch (decoder.state()) { case MessageDecoder::State::INITIAL: @@ -398,14 +406,23 @@ Result> ReadMessage(int64_t offset, int32_t metadata_le case MessageDecoder::State::BODY: { std::shared_ptr body; if (fields_loader) { + // Selective field loading: allocate a body buffer and read only the + // requested field ranges into it. 
ARROW_ASSIGN_OR_RAISE( body, AllocateBuffer(decoder.next_required_size(), default_memory_pool())); RETURN_NOT_OK(ReadFieldsSubset(offset, metadata_length, file, fields_loader, - metadata, decoder.next_required_size(), body)); + SliceBuffer(metadata, 0, metadata_length), + decoder.next_required_size(), body)); + } else if (body_length.has_value()) { + // Body was already read as part of the combined IO; just slice it out. + body = SliceBuffer(metadata, metadata_length, + std::min(*body_length, metadata->size() - metadata_length)); } else { + // Body length was unknown; do a separate IO to read the body. ARROW_ASSIGN_OR_RAISE( body, file->ReadAt(offset + metadata_length, decoder.next_required_size())); } + if (body->size() < decoder.next_required_size()) { return Status::IOError("Expected to be able to read ", decoder.next_required_size(), @@ -421,6 +438,21 @@ Result> ReadMessage(int64_t offset, int32_t metadata_le } } +Result> ReadMessage(int64_t offset, int32_t metadata_length, + io::RandomAccessFile* file, + const FieldsLoaderFunction& fields_loader) { + return ReadMessageInternal(offset, metadata_length, /*body_length=*/std::nullopt, file, + fields_loader); +} + +Result> ReadMessage(const int64_t offset, + const int32_t metadata_length, + const int64_t body_length, + io::RandomAccessFile* file) { + return ReadMessageInternal(offset, metadata_length, body_length, file, + /*fields_loader=*/{}); +} + Future> ReadMessageAsync(int64_t offset, int32_t metadata_length, int64_t body_length, io::RandomAccessFile* file, diff --git a/cpp/src/arrow/ipc/message.h b/cpp/src/arrow/ipc/message.h index 1cd72ce993ed..df80b0eba252 100644 --- a/cpp/src/arrow/ipc/message.h +++ b/cpp/src/arrow/ipc/message.h @@ -449,7 +449,7 @@ class ARROW_EXPORT MessageReader { // org::apache::arrow::flatbuf::RecordBatch*) using FieldsLoaderFunction = std::function; -/// \brief Read encapsulated RPC message from position in file +/// \brief Read encapsulated IPC message from position in file /// /// 
Read a length-prefixed message flatbuffer starting at the indicated file /// offset. If the message has a body with non-zero length, it will also be @@ -469,7 +469,27 @@ Result> ReadMessage( const int64_t offset, const int32_t metadata_length, io::RandomAccessFile* file, const FieldsLoaderFunction& fields_loader = {}); -/// \brief Read encapsulated RPC message from cached buffers +/// \brief Read encapsulated IPC message from position in file +/// +/// Read a length-prefixed message flatbuffer starting at the indicated file +/// offset. +/// +/// The metadata_length includes at least the length prefix and the flatbuffer +/// +/// \param[in] offset the position in the file where the message starts. The +/// first 4 bytes after the offset are the message length +/// \param[in] metadata_length the total number of bytes to read from file +/// \param[in] body_length the number of bytes for the message body +/// \param[in] file the seekable file interface to read from +/// \return the message read + +ARROW_EXPORT +Result> ReadMessage(const int64_t offset, + const int32_t metadata_length, + const int64_t body_length, + io::RandomAccessFile* file); + +/// \brief Read encapsulated IPC message from cached buffers /// /// The buffers should contain an entire message. Partial reads are not handled. 
/// diff --git a/cpp/src/arrow/ipc/metadata_internal.h b/cpp/src/arrow/ipc/metadata_internal.h index 914ce3efe69d..2a9574d84a10 100644 --- a/cpp/src/arrow/ipc/metadata_internal.h +++ b/cpp/src/arrow/ipc/metadata_internal.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -78,7 +79,7 @@ flatbuf::MetadataVersion MetadataVersionToFlatbuffer(MetadataVersion version); // Whether the type has a validity bitmap in the given IPC version bool HasValidityBitmap(Type::type type_id, MetadataVersion version); -static constexpr const char* kArrowMagicBytes = "ARROW1"; +constexpr const std::string_view kArrowMagicBytes = "ARROW1"; struct FieldMetadata { int64_t length; diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 315d8bd07d9b..15cf0258b2ee 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -552,9 +552,15 @@ class TestIpcRoundTrip : public ::testing::TestWithParam, ASSERT_OK(WriteRecordBatch(*batch, buffer_offset, mmap_.get(), &metadata_length, &body_length, options_)); - ASSERT_OK_AND_ASSIGN(std::unique_ptr message, + ASSERT_OK_AND_ASSIGN(std::unique_ptr message1, ReadMessage(0, metadata_length, mmap_.get())); - ASSERT_EQ(expected_version, message->metadata_version()); + ASSERT_EQ(expected_version, message1->metadata_version()); + + ASSERT_OK_AND_ASSIGN(auto message2, + ReadMessage(0, metadata_length, body_length, mmap_.get())); + ASSERT_EQ(expected_version, message2->metadata_version()); + + ASSERT_TRUE(message1->Equals(*message2)); } }; @@ -613,6 +619,27 @@ TEST(TestReadMessage, CorruptedSmallInput) { ASSERT_EQ(nullptr, message); } +TEST(TestReadMessage, ReadBodyWithLength) { + // Test the optimized ReadMessage(offset, meta_len, body_len, file) overload + std::shared_ptr batch; + ASSERT_OK(MakeIntRecordBatch(&batch)); + + ASSERT_OK_AND_ASSIGN(auto stream, io::BufferOutputStream::Create(0)); + int32_t metadata_length; + int64_t body_length; + 
ASSERT_OK(WriteRecordBatch(*batch, 0, stream.get(), &metadata_length, &body_length, + IpcWriteOptions::Defaults())); + + ASSERT_OK_AND_ASSIGN(auto buffer, stream->Finish()); + io::BufferReader reader(buffer); + + ASSERT_OK_AND_ASSIGN(auto message, + ReadMessage(0, metadata_length, body_length, &reader)); + + ASSERT_EQ(body_length, message->body_length()); + ASSERT_TRUE(message->Verify()); +} + TEST(TestMetadata, GetMetadataVersion) { ASSERT_EQ(MetadataVersion::V1, ipc::internal::GetMetadataVersion( flatbuf::MetadataVersion::MetadataVersion_V1)); @@ -1094,7 +1121,7 @@ TEST_F(RecursionLimits, ReadLimit) { &schema)); ASSERT_OK_AND_ASSIGN(std::unique_ptr message, - ReadMessage(0, metadata_length, mmap_.get())); + ReadMessage(0, metadata_length, body_length, mmap_.get())); io::BufferReader reader(message->body()); @@ -1119,7 +1146,7 @@ TEST_F(RecursionLimits, StressLimit) { &schema)); ASSERT_OK_AND_ASSIGN(std::unique_ptr message, - ReadMessage(0, metadata_length, mmap_.get())); + ReadMessage(0, metadata_length, body_length, mmap_.get())); DictionaryMemo empty_memo; @@ -1252,40 +1279,55 @@ struct FileGeneratorWriterHelper : public FileWriterHelper { Status ReadBatches(const IpcReadOptions& options, RecordBatchVector* out_batches, ReadStats* out_stats = nullptr, MetadataVector* out_metadata_list = nullptr) override { - std::shared_ptr buf_reader; - if (kCoalesce) { - // Use a non-zero-copy enabled BufferReader so we can test paths properly - buf_reader = std::make_shared(buffer_); - } else { - buf_reader = std::make_shared(buffer_); - } - AsyncGenerator> generator; + // The generator doesn't track stats. 
+ EXPECT_EQ(nullptr, out_stats); - { - auto fut = RecordBatchFileReader::OpenAsync(buf_reader, footer_offset_, options); - // Do NOT assert OK since some tests check whether this fails properly - EXPECT_FINISHES(fut); - ARROW_ASSIGN_OR_RAISE(auto reader, fut.result()); - EXPECT_EQ(num_batches_written_, reader->num_record_batches()); - // Generator will keep reader alive internally - ARROW_ASSIGN_OR_RAISE(generator, reader->GetRecordBatchGenerator(kCoalesce)); - } + auto read_batches = [&](bool pre_buffer) -> Result { + std::shared_ptr buf_reader; + if (kCoalesce) { + // Use a non-zero-copy enabled BufferReader so we can test paths properly + buf_reader = std::make_shared(buffer_); + } else { + buf_reader = std::make_shared(buffer_); + } + AsyncGenerator> generator; + + { + auto fut = RecordBatchFileReader::OpenAsync(buf_reader, footer_offset_, options); + ARROW_ASSIGN_OR_RAISE(auto reader, fut.result()); + EXPECT_EQ(num_batches_written_, reader->num_record_batches()); + if (pre_buffer) { + RETURN_NOT_OK(reader->PreBufferMetadata(/*indices=*/{})); + } + // Generator will keep reader alive internally + ARROW_ASSIGN_OR_RAISE(generator, reader->GetRecordBatchGenerator(kCoalesce)); + } + + // Generator is async-reentrant + std::vector>> futures; + for (int i = 0; i < num_batches_written_; ++i) { + futures.push_back(generator()); + } + auto fut = generator(); + ARROW_ASSIGN_OR_RAISE(auto final_batch, fut.result()); + EXPECT_EQ(nullptr, final_batch); + + RecordBatchVector batches; + for (auto& future : futures) { + ARROW_ASSIGN_OR_RAISE(auto batch, future.result()); + EXPECT_NE(nullptr, batch); + batches.push_back(batch); + } + return batches; + }; - // Generator is async-reentrant - std::vector>> futures; + ARROW_ASSIGN_OR_RAISE(*out_batches, read_batches(/*pre_buffer=*/false)); + // Also read with pre-buffered metadata, and check the results are equal + ARROW_ASSIGN_OR_RAISE(auto batches_pre_buffered, read_batches(/*pre_buffer=*/true)); for (int i = 0; i < 
num_batches_written_; ++i) { - futures.push_back(generator()); - } - auto fut = generator(); - EXPECT_FINISHES_OK_AND_EQ(nullptr, fut); - for (auto& future : futures) { - EXPECT_FINISHES_OK_AND_ASSIGN(auto batch, future); - out_batches->push_back(batch); + AssertBatchesEqual(*batches_pre_buffered[i], *(*out_batches)[i], + /*check_metadata=*/true); } - - // The generator doesn't track stats. - EXPECT_EQ(nullptr, out_stats); - return Status::OK(); } }; @@ -3003,25 +3045,56 @@ void GetReadRecordBatchReadRanges( auto read_ranges = tracked->get_read_ranges(); - // there are 3 read IOs before reading body: - // 1) read magic and footer length IO - // 2) read footer IO - // 3) read record batch metadata IO - EXPECT_EQ(read_ranges.size(), 3 + expected_body_read_lengths.size()); - const int32_t magic_size = static_cast(strlen(ipc::internal::kArrowMagicBytes)); + const int32_t magic_size = static_cast(ipc::internal::kArrowMagicBytes.size()); // read magic and footer length IO auto file_end_size = magic_size + sizeof(int32_t); auto footer_length_offset = buffer->size() - file_end_size; auto footer_length = bit_util::FromLittleEndian( util::SafeLoadAs(buffer->data() + footer_length_offset)); + + // there are at least 2 read IOs before reading body: + // 1) read magic and footer length IO + // 2) footer IO + EXPECT_GE(read_ranges.size(), 2); + + // read magic and footer length IO EXPECT_EQ(read_ranges[0].length, file_end_size); // read footer IO EXPECT_EQ(read_ranges[1].length, footer_length); - // read record batch metadata. The exact size is tricky to determine but it doesn't - // matter for this test and it should be smaller than the footer. 
- EXPECT_LE(read_ranges[2].length, footer_length); - for (uint32_t i = 0; i < expected_body_read_lengths.size(); i++) { - EXPECT_EQ(read_ranges[3 + i].length, expected_body_read_lengths[i]); + + if (included_fields.empty()) { + // When no fields are explicitly included, the reader optimizes by + // reading metadata and the entire body in a single IO. + // Thus, there are exactly 3 read IOs in total: + // 1) magic and footer length + // 2) footer + // 3) record batch metadata + body + EXPECT_EQ(read_ranges.size(), 3); + + int64_t total_body = 0; + for (auto len : expected_body_read_lengths) total_body += len; + + // In the optimized path (included_fields is empty), the 3rd read operation + // fetches both the message metadata (flatbuffer) and the entire message body + // in one contiguous block. Therefore, its length must at least exceed the + // total body length by the size of the metadata. + EXPECT_GT(read_ranges[2].length, total_body); + EXPECT_LE(read_ranges[2].length, total_body + footer_length); + } else { + // When fields are filtered, we see 3 initial reads followed by N body reads + // (one for each field/buffer range): + // 1) magic and footer length + // 2) footer + // 3) record batch metadata + // 4) individual body buffer reads + EXPECT_EQ(read_ranges.size(), 3 + expected_body_read_lengths.size()); + + // read record batch metadata. The exact size is tricky to determine but it doesn't + // matter for this test and it should be smaller than the footer. 
+ EXPECT_LE(read_ranges[2].length, footer_length); + for (uint32_t i = 0; i < expected_body_read_lengths.size(); i++) { + EXPECT_EQ(read_ranges[3 + i].length, expected_body_read_lengths[i]); + } } } @@ -3171,7 +3244,9 @@ class PreBufferingTest : public ::testing::TestWithParam { metadata_reads++; } } - ASSERT_EQ(metadata_reads, reader_->num_record_batches() - num_indices_pre_buffered); + // With ReadMessage optimization, non-prebuffered reads verify metadata and body + // in a single large read, so we no longer see small metadata-only reads here. + ASSERT_EQ(metadata_reads, 0); ASSERT_EQ(data_reads, reader_->num_record_batches()); } diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 8e125fc5ede7..991d238240f3 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,7 @@ #include "arrow/util/compression.h" #include "arrow/util/endian.h" #include "arrow/util/fuzz_internal.h" +#include "arrow/util/int_util_overflow.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging_internal.h" #include "arrow/util/parallel.h" @@ -72,6 +74,7 @@ namespace arrow { namespace flatbuf = org::apache::arrow::flatbuf; +using internal::AddWithOverflow; using internal::checked_cast; using internal::checked_pointer_cast; @@ -177,14 +180,16 @@ class ArrayLoader { explicit ArrayLoader(const flatbuf::RecordBatch* metadata, MetadataVersion metadata_version, const IpcReadOptions& options, - int64_t file_offset) + int64_t file_offset, int64_t file_length) : metadata_(metadata), metadata_version_(metadata_version), file_(nullptr), file_offset_(file_offset), + file_length_(file_length), max_recursion_depth_(options.max_recursion_depth) {} Status ReadBuffer(int64_t offset, int64_t length, std::shared_ptr* out) { + // This construct permits overriding GetBuffer at compile time if (skip_io_) { return Status::OK(); } @@ -194,7 +199,10 @@ class 
ArrayLoader { if (length < 0) { return Status::Invalid("Negative length for reading buffer ", buffer_index_); } - // This construct permits overriding GetBuffer at compile time + auto read_end = AddWithOverflow({offset, length}); + if (!read_end.has_value() || (file_length_.has_value() && read_end > file_length_)) { + return Status::Invalid("Buffer ", buffer_index_, " exceeds IPC file area"); + } if (!bit_util::IsMultipleOf8(offset)) { return Status::Invalid("Buffer ", buffer_index_, " did not start on 8-byte aligned offset: ", offset); @@ -202,6 +210,9 @@ class ArrayLoader { if (file_) { return file_->ReadAt(offset, length).Value(out); } else { + if (!AddWithOverflow({read_end.value(), file_offset_}).has_value()) { + return Status::Invalid("Buffer ", buffer_index_, " exceeds IPC file area"); + } read_request_.RequestRange(offset + file_offset_, length, out); return Status::OK(); } @@ -235,7 +246,7 @@ class ArrayLoader { } Status GetBuffer(int buffer_index, std::shared_ptr* out) { - auto buffers = metadata_->buffers(); + auto* buffers = metadata_->buffers(); CHECK_FLATBUFFERS_NOT_NULL(buffers, "RecordBatch.buffers"); if (buffer_index >= static_cast(buffers->size())) { return Status::IOError("buffer_index out of range."); @@ -252,7 +263,9 @@ class ArrayLoader { Result GetVariadicCount(int i) { auto* variadic_counts = metadata_->variadicBufferCounts(); + auto* buffers = metadata_->buffers(); CHECK_FLATBUFFERS_NOT_NULL(variadic_counts, "RecordBatch.variadicBufferCounts"); + CHECK_FLATBUFFERS_NOT_NULL(buffers, "RecordBatch.buffers"); if (i >= static_cast(variadic_counts->size())) { return Status::IOError("variadic_count_index out of range."); } @@ -262,8 +275,7 @@ class ArrayLoader { } // Detect an excessive variadic buffer count to avoid potential memory blowup // (GH-48900). 
- const auto max_buffer_count = - static_cast(metadata_->buffers()->size()) - buffer_index_; + const auto max_buffer_count = static_cast(buffers->size()) - buffer_index_; if (count > max_buffer_count) { return Status::IOError("variadic buffer count exceeds available number of buffers"); } @@ -285,21 +297,38 @@ class ArrayLoader { return Status::OK(); } - Status LoadCommon(Type::type type_id) { + Status LoadCommon(Type::type type_id, bool allow_validity_bitmap = true) { DCHECK_NE(out_, nullptr); // This only contains the length and null count, which we need to figure // out what to do with the buffers. For example, if null_count == 0, then // we can skip that buffer without reading from shared memory RETURN_NOT_OK(GetFieldMetadata(field_index_++, out_)); + if (::arrow::internal::has_variadic_buffers(type_id)) { + ARROW_ASSIGN_OR_RAISE(auto data_buffer_count, + GetVariadicCount(variadic_count_index_++)); + const int64_t start = static_cast(out_->buffers.size()); + // NOTE: this must be done before any other call to `GetBuffer` because + // BatchDataReadRequest will keep pointers to `std::shared_ptr` + // objects. + out_->buffers.resize(start + data_buffer_count); + } + if (internal::HasValidityBitmap(type_id, metadata_version_)) { - // Extract null_bitmap which is common to all arrays except for unions + // Extract null bitmap which is common to all arrays except for unions // and nulls. 
if (out_->null_count != 0) { - RETURN_NOT_OK(GetBuffer(buffer_index_, &out_->buffers[0])); + if (allow_validity_bitmap) { + RETURN_NOT_OK(GetBuffer(buffer_index_, &out_->buffers[0])); + } else { + // Caller did not allow this + return Status::Invalid("Cannot read ", ::arrow::internal::ToTypeName(type_id), + " array with top-level validity bitmap"); + } } buffer_index_++; } + return Status::OK(); } @@ -398,14 +427,9 @@ class ArrayLoader { Status Visit(const BinaryViewType& type) { out_->buffers.resize(2); - RETURN_NOT_OK(LoadCommon(type.id())); - RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1])); - - ARROW_ASSIGN_OR_RAISE(auto data_buffer_count, - GetVariadicCount(variadic_count_index_++)); - out_->buffers.resize(data_buffer_count + 2); - for (int64_t i = 0; i < data_buffer_count; ++i) { - RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[i + 2])); + RETURN_NOT_OK(LoadCommon(type.id())); // also initializes variadic buffers + for (int64_t i = 1; i < static_cast(out_->buffers.size()); ++i) { + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[i])); } return Status::OK(); } @@ -454,9 +478,10 @@ class ArrayLoader { int n_buffers = type.mode() == UnionMode::SPARSE ? 2 : 3; out_->buffers.resize(n_buffers); - RETURN_NOT_OK(LoadCommon(type.id())); - - // With metadata V4, we can get a validity bitmap. + // With metadata V4, we can get a validity bitmap. The bitmap may be there + // if we're loading eagerly, or it might be scheduled for loading if we're + // using a BatchDataReadRequest. + // // Trying to fix up union data to do without the top-level validity bitmap // is hairy: // - type ids must be rewritten to all have valid values (even for former @@ -465,12 +490,9 @@ class ArrayLoader { // by ANDing the top-level validity bitmap // - dense union children must be rewritten (at least one of them) // to insert the required null slots that were formerly omitted - // So instead we bail out. 
- if (out_->null_count != 0 && out_->buffers[0] != nullptr) { - return Status::Invalid( - "Cannot read pre-1.0.0 Union array with top-level validity bitmap"); - } - out_->buffers[0] = nullptr; + // + // So instead we disallow validity bitmaps. + RETURN_NOT_OK(LoadCommon(type.id(), /*allow_validity_bitmap=*/false)); out_->null_count = 0; if (out_->length > 0) { @@ -503,6 +525,7 @@ class ArrayLoader { const MetadataVersion metadata_version_; io::RandomAccessFile* file_; int64_t file_offset_; + std::optional file_length_; int max_recursion_depth_; int buffer_index_ = 0; int field_index_ = 0; @@ -1173,8 +1196,19 @@ namespace { // Common functions used in both the random-access file reader and the // asynchronous generator -inline FileBlock FileBlockFromFlatbuffer(const flatbuf::Block* block) { - return FileBlock{block->offset(), block->metaDataLength(), block->bodyLength()}; +Result FileBlockFromFlatbuffer(const flatbuf::Block* fb_block, + int64_t max_offset) { + auto block = + FileBlock{fb_block->offset(), fb_block->metaDataLength(), fb_block->bodyLength()}; + if (block.metadata_length < 0 || block.body_length < 0 || block.offset < 0) { + return Status::IOError("Invalid Block in IPC file footer"); + } + auto block_end = + AddWithOverflow({block.offset, block.metadata_length, block.body_length}); + if (!block_end.has_value() || block_end > max_offset) { + return Status::IOError("Invalid Block in IPC file footer"); + } + return block; } Status CheckAligned(const FileBlock& block) { @@ -1203,9 +1237,15 @@ Result> ReadMessageFromBlock( const FileBlock& block, io::RandomAccessFile* file, const FieldsLoaderFunction& fields_loader) { RETURN_NOT_OK(CheckAligned(block)); - ARROW_ASSIGN_OR_RAISE(auto message, ReadMessage(block.offset, block.metadata_length, - file, fields_loader)); - return CheckBodyLength(std::move(message), block); + if (fields_loader) { + ARROW_ASSIGN_OR_RAISE(auto message, ReadMessage(block.offset, block.metadata_length, + file, fields_loader)); + return 
CheckBodyLength(std::move(message), block); + } else { + ARROW_ASSIGN_OR_RAISE(auto message, ReadMessage(block.offset, block.metadata_length, + block.body_length, file)); + return CheckBodyLength(std::move(message), block); + } } Future> ReadMessageFromBlockAsync( @@ -1362,8 +1402,8 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { read_options, file, schema, &inclusion_mask); }; } - ARROW_ASSIGN_OR_RAISE(auto message, - ReadMessageFromBlock(GetRecordBatchBlock(i), fields_loader)); + ARROW_ASSIGN_OR_RAISE(auto block, GetRecordBatchBlock(i)); + ARROW_ASSIGN_OR_RAISE(auto message, ReadMessageFromBlock(block, fields_loader)); CHECK_HAS_BODY(*message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); @@ -1379,8 +1419,8 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { Result CountRows() override { int64_t total = 0; for (int i = 0; i < num_record_batches(); i++) { - ARROW_ASSIGN_OR_RAISE(auto outer_message, - ReadMessageFromBlock(GetRecordBatchBlock(i))); + ARROW_ASSIGN_OR_RAISE(auto block, GetRecordBatchBlock(i)); + ARROW_ASSIGN_OR_RAISE(auto outer_message, ReadMessageFromBlock(block)); auto metadata = outer_message->metadata(); const flatbuf::Message* message = nullptr; RETURN_NOT_OK( @@ -1494,13 +1534,13 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { Status DoPreBufferMetadata(const std::vector& indices) { RETURN_NOT_OK(CacheMetadata(indices)); - EnsureDictionaryReadStarted(); + RETURN_NOT_OK(EnsureDictionaryReadStarted()); Future<> all_metadata_ready = WaitForMetadatas(indices); for (int index : indices) { Future> metadata_loaded = all_metadata_ready.Then([this, index]() -> Result> { stats_.num_messages.fetch_add(1, std::memory_order_relaxed); - FileBlock block = GetRecordBatchBlock(index); + ARROW_ASSIGN_OR_RAISE(FileBlock block, GetRecordBatchBlock(index)); ARROW_ASSIGN_OR_RAISE( std::shared_ptr metadata, metadata_cache_->Read({block.offset, block.metadata_length})); @@ -1549,12 
+1589,12 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { } }; - FileBlock GetRecordBatchBlock(int i) const { - return FileBlockFromFlatbuffer(footer_->recordBatches()->Get(i)); + Result GetRecordBatchBlock(int i) const { + return FileBlockFromFlatbuffer(footer_->recordBatches()->Get(i), footer_offset_); } - FileBlock GetDictionaryBlock(int i) const { - return FileBlockFromFlatbuffer(footer_->dictionaries()->Get(i)); + Result GetDictionaryBlock(int i) const { + return FileBlockFromFlatbuffer(footer_->dictionaries()->Get(i), footer_offset_); } Result> ReadMessageFromBlock( @@ -1567,16 +1607,26 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { Status ReadDictionaries() { // Read all the dictionaries + std::vector> messages(num_dictionaries()); + for (int i = 0; i < num_dictionaries(); ++i) { + ARROW_ASSIGN_OR_RAISE(FileBlock block, GetDictionaryBlock(i)); + ARROW_ASSIGN_OR_RAISE(messages[i], ReadMessageFromBlock(block)); + } + return ReadDictionaries(messages); + } + + Status ReadDictionaries( + const std::vector>& dictionary_messages) { + DCHECK_EQ(dictionary_messages.size(), static_cast(num_dictionaries())); IpcReadContext context(&dictionary_memo_, options_, swap_endian_); for (int i = 0; i < num_dictionaries(); ++i) { - ARROW_ASSIGN_OR_RAISE(auto message, ReadMessageFromBlock(GetDictionaryBlock(i))); - RETURN_NOT_OK(ReadOneDictionary(message.get(), context)); - stats_.num_dictionary_batches.fetch_add(1, std::memory_order_relaxed); + RETURN_NOT_OK(ReadOneDictionary(i, dictionary_messages[i].get(), context)); } return Status::OK(); } - Status ReadOneDictionary(Message* message, const IpcReadContext& context) { + Status ReadOneDictionary(int dict_index, Message* message, + const IpcReadContext& context) { CHECK_HAS_BODY(*message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); DictionaryKind kind; @@ -1586,44 +1636,48 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { } else if (kind == 
DictionaryKind::Delta) { stats_.num_dictionary_deltas.fetch_add(1, std::memory_order_relaxed); } + stats_.num_dictionary_batches.fetch_add(1, std::memory_order_relaxed); return Status::OK(); } - void AddDictionaryRanges(std::vector* ranges) const { + Status AddDictionaryRanges(std::vector* ranges) const { // Adds all dictionaries to the range cache for (int i = 0; i < num_dictionaries(); ++i) { - FileBlock block = GetDictionaryBlock(i); + ARROW_ASSIGN_OR_RAISE(FileBlock block, GetDictionaryBlock(i)); ranges->push_back({block.offset, block.metadata_length + block.body_length}); } + return Status::OK(); } - void AddMetadataRanges(const std::vector& indices, - std::vector* ranges) { + Status AddMetadataRanges(const std::vector& indices, + std::vector* ranges) { for (int index : indices) { - FileBlock block = GetRecordBatchBlock(static_cast(index)); + ARROW_ASSIGN_OR_RAISE(FileBlock block, GetRecordBatchBlock(index)); ranges->push_back({block.offset, block.metadata_length}); } + return Status::OK(); } Status CacheMetadata(const std::vector& indices) { std::vector ranges; if (!read_dictionaries_) { - AddDictionaryRanges(&ranges); + RETURN_NOT_OK(AddDictionaryRanges(&ranges)); } - AddMetadataRanges(indices, &ranges); + RETURN_NOT_OK(AddMetadataRanges(indices, &ranges)); return metadata_cache_->Cache(std::move(ranges)); } - void EnsureDictionaryReadStarted() { + Status EnsureDictionaryReadStarted() { if (!dictionary_load_finished_.is_valid()) { read_dictionaries_ = true; std::vector ranges; - AddDictionaryRanges(&ranges); + RETURN_NOT_OK(AddDictionaryRanges(&ranges)); dictionary_load_finished_ = metadata_cache_->WaitFor(std::move(ranges)).Then([this] { return ReadDictionaries(); }); } + return Status::OK(); } Status WaitForDictionaryReadFinished() { @@ -1641,7 +1695,7 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { Future<> WaitForMetadatas(const std::vector& indices) { std::vector ranges; - AddMetadataRanges(indices, &ranges); + 
RETURN_NOT_OK(AddMetadataRanges(indices, &ranges)); return metadata_cache_->WaitFor(std::move(ranges)); } @@ -1685,12 +1739,13 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { const flatbuf::RecordBatch* batch, IpcReadContext context, io::RandomAccessFile* file, std::shared_ptr owned_file, - int64_t block_data_offset) + int64_t block_data_offset, int64_t block_data_length) : schema(std::move(sch)), context(std::move(context)), file(file), owned_file(std::move(owned_file)), - loader(batch, context.metadata_version, context.options, block_data_offset), + loader(batch, context.metadata_version, context.options, block_data_offset, + block_data_length), columns(schema->num_fields()), cache(file, file->io_context(), io::CacheOptions::LazyDefaults()), length(batch->length()) {} @@ -1789,14 +1844,15 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { return dictionary_load_finished_.Then([message_fut] { return message_fut; }) .Then([this, index](const std::shared_ptr& message_obj) -> Future> { - FileBlock block = GetRecordBatchBlock(index); + ARROW_ASSIGN_OR_RAISE(auto block, GetRecordBatchBlock(index)); ARROW_ASSIGN_OR_RAISE(auto message, GetFlatbufMessage(message_obj)); ARROW_ASSIGN_OR_RAISE(auto batch, GetBatchFromMessage(message)); ARROW_ASSIGN_OR_RAISE(auto context, GetIpcReadContext(message, batch)); auto read_context = std::make_shared( schema_, batch, std::move(context), file_, owned_file_, - block.offset + static_cast(block.metadata_length)); + block.offset + static_cast(block.metadata_length), + block.body_length); RETURN_NOT_OK(read_context->CalculateLoadRequest()); return read_context->ReadAsync().Then( [read_context] { return read_context->CreateRecordBatch(); }); @@ -1809,26 +1865,28 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { } Future<> ReadFooterAsync(arrow::internal::Executor* executor) { - const int32_t magic_size = static_cast(strlen(kArrowMagicBytes)); + constexpr int32_t kMagicSize = 
static_cast(kArrowMagicBytes.size()); - if (footer_offset_ <= magic_size * 2 + 4) { + if (footer_offset_ <= kMagicSize * 2 + 4) { return Status::Invalid("File is too small: ", footer_offset_); } - int file_end_size = static_cast(magic_size + sizeof(int32_t)); + int file_end_size = static_cast(kMagicSize + sizeof(int32_t)); auto self = std::dynamic_pointer_cast(shared_from_this()); auto read_magic = file_->ReadAsync(footer_offset_ - file_end_size, file_end_size); if (executor) read_magic = executor->Transfer(std::move(read_magic)); return read_magic .Then([=](const std::shared_ptr& buffer) -> Future> { - const int64_t expected_footer_size = magic_size + sizeof(int32_t); + const int64_t expected_footer_size = kMagicSize + sizeof(int32_t); if (buffer->size() < expected_footer_size) { return Status::Invalid("Unable to read ", expected_footer_size, "from end of file"); } - if (memcmp(buffer->data() + sizeof(int32_t), kArrowMagicBytes, magic_size)) { + const auto magic_start = buffer->data() + sizeof(int32_t); + if (std::string_view(reinterpret_cast(magic_start), kMagicSize) != + kArrowMagicBytes) { return Status::Invalid("Not an Arrow file"); } @@ -1836,7 +1894,7 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { *reinterpret_cast(buffer->data())); if (footer_length <= 0 || - footer_length > self->footer_offset_ - magic_size * 2 - 4) { + footer_length > self->footer_offset_ - kMagicSize * 2 - 4) { return Status::Invalid("File is smaller than indicated metadata size"); } @@ -1915,25 +1973,31 @@ Future WholeIpcFileRecordBatchGenerator::operator()() { auto state = state_; if (!read_dictionaries_.is_valid()) { - std::vector>> messages(state->num_dictionaries()); - for (int i = 0; i < state->num_dictionaries(); i++) { - auto block = FileBlockFromFlatbuffer(state->footer_->dictionaries()->Get(i)); - messages[i] = ReadBlock(block); - } - auto read_messages = All(std::move(messages)); - if (executor_) read_messages = executor_->Transfer(read_messages); - 
read_dictionaries_ = read_messages.Then( - [=](const std::vector>>& maybe_messages) - -> Status { - ARROW_ASSIGN_OR_RAISE(auto messages, - arrow::internal::UnwrapOrRaise(maybe_messages)); - return ReadDictionaries(state.get(), std::move(messages)); - }); + if (state->dictionary_load_finished_.is_valid()) { + // PreBufferMetadata has started reading dictionaries in the background + read_dictionaries_ = state->dictionary_load_finished_; + } else { + // Start reading dictionaries + std::vector>> messages(state->num_dictionaries()); + for (int i = 0; i < state->num_dictionaries(); i++) { + ARROW_ASSIGN_OR_RAISE(auto block, state->GetDictionaryBlock(i)); + messages[i] = ReadBlock(block); + } + auto read_messages = All(std::move(messages)); + if (executor_) read_messages = executor_->Transfer(read_messages); + read_dictionaries_ = read_messages.Then( + [=](const std::vector>>& maybe_messages) + -> Status { + ARROW_ASSIGN_OR_RAISE(auto messages, + arrow::internal::UnwrapOrRaise(maybe_messages)); + return state->ReadDictionaries(messages); + }); + } } if (index_ >= state_->num_record_batches()) { return Future::MakeFinished(IterationTraits::End()); } - auto block = FileBlockFromFlatbuffer(state->footer_->recordBatches()->Get(index_++)); + ARROW_ASSIGN_OR_RAISE(auto block, state->GetRecordBatchBlock(index_++)); auto read_message = ReadBlock(block); auto read_messages = read_dictionaries_.Then([read_message]() { return read_message; }); // Force transfer. 
This may be wasteful in some cases, but ensures we get off the @@ -1969,16 +2033,6 @@ Future> WholeIpcFileRecordBatchGenerator::ReadBlock( } } -Status WholeIpcFileRecordBatchGenerator::ReadDictionaries( - RecordBatchFileReaderImpl* state, - std::vector> dictionary_messages) { - IpcReadContext context(&state->dictionary_memo_, state->options_, state->swap_endian_); - for (const auto& message : dictionary_messages) { - RETURN_NOT_OK(state->ReadOneDictionary(message.get(), context)); - } - return Status::OK(); -} - Result> WholeIpcFileRecordBatchGenerator::ReadRecordBatch( RecordBatchFileReaderImpl* state, Message* message) { CHECK_HAS_BODY(*message); @@ -2630,6 +2684,36 @@ Status ValidateFuzzBatch(const RecordBatch& batch) { return st; } +Status ValidateFuzzBatch(const RecordBatchWithMetadata& batch) { + if (batch.batch) { + RETURN_NOT_OK(ValidateFuzzBatch(*batch.batch)); + } + // XXX do something with custom metadata? + return Status::OK(); +} + +Status CompareFuzzBatches(const RecordBatchWithMetadata& left, + const RecordBatchWithMetadata& right) { + bool ok = true; + if ((left.batch != nullptr) != (right.batch != nullptr)) { + ok = false; + } else if (left.batch) { + ok &= left.batch->Equals(*right.batch, EqualOptions{}.nans_equal(true)); + } + return ok ? 
Status::OK() : Status::Invalid("Batches unequal"); +} + +Status CompareFuzzBatches(const std::vector& left, + const std::vector& right) { + if (left.size() != right.size()) { + return Status::Invalid("Not the same number of batches"); + } + for (size_t i = 0; i < left.size(); ++i) { + RETURN_NOT_OK(CompareFuzzBatches(left[i], right[i])); + } + return Status::OK(); +} + IpcReadOptions FuzzingOptions() { IpcReadOptions options; options.memory_pool = ::arrow::internal::fuzzing_memory_pool(); @@ -2648,12 +2732,12 @@ Status FuzzIpcStream(const uint8_t* data, int64_t size) { Status st; while (true) { - std::shared_ptr batch; - RETURN_NOT_OK(batch_reader->ReadNext(&batch)); - if (batch == nullptr) { + ARROW_ASSIGN_OR_RAISE(auto batch, batch_reader->ReadNext()); + if (!batch.batch && !batch.custom_metadata) { + // EOS break; } - st &= ValidateFuzzBatch(*batch); + st &= ValidateFuzzBatch(batch); } return st; @@ -2661,20 +2745,81 @@ Status FuzzIpcStream(const uint8_t* data, int64_t size) { Status FuzzIpcFile(const uint8_t* data, int64_t size) { auto buffer = std::make_shared(data, size); - io::BufferReader buffer_reader(buffer); - std::shared_ptr batch_reader; - ARROW_ASSIGN_OR_RAISE(batch_reader, - RecordBatchFileReader::Open(&buffer_reader, FuzzingOptions())); - Status st; + Status final_status; + + // Try to read the IPC file as a stream to compare the results (differential fuzzing) + auto do_stream_read = [&]() -> Result> { + io::BufferReader buffer_reader(buffer); + // Skip magic bytes at the beginning + RETURN_NOT_OK( + buffer_reader.Advance(bit_util::RoundUpToMultipleOf8(kArrowMagicBytes.length()))); + ARROW_ASSIGN_OR_RAISE(auto batch_reader, RecordBatchStreamReader::Open( + &buffer_reader, FuzzingOptions())); - const int n_batches = batch_reader->num_record_batches(); - for (int i = 0; i < n_batches; ++i) { - ARROW_ASSIGN_OR_RAISE(auto batch, batch_reader->ReadRecordBatch(i)); - st &= ValidateFuzzBatch(*batch); + std::vector batches; + while (true) { + 
ARROW_ASSIGN_OR_RAISE(auto batch, batch_reader->ReadNext()); + if (!batch.batch && !batch.custom_metadata) { + // EOS + break; + } + batches.push_back(batch); + } + return batches; + }; + + auto do_file_read = + [&](bool pre_buffer) -> Result> { + io::BufferReader buffer_reader(buffer); + ARROW_ASSIGN_OR_RAISE(auto batch_reader, + RecordBatchFileReader::Open(&buffer_reader, FuzzingOptions())); + if (pre_buffer) { + // Pre-buffer all record batches + RETURN_NOT_OK(batch_reader->PreBufferMetadata(/*indices=*/{})); + } + + const int n_batches = batch_reader->num_record_batches(); + std::vector batches; + // Delay error return until the end, as we want to access all record batches + Status st; + for (int i = 0; i < n_batches; ++i) { + RecordBatchWithMetadata batch; + st &= batch_reader->ReadRecordBatchWithCustomMetadata(i).Value(&batch); + st &= ValidateFuzzBatch(batch); + batches.push_back(batch); + } + RETURN_NOT_OK(st); + return batches; + }; + + // Lazily-initialized if the IPC reader succeeds + std::optional>> maybe_stream_batches; + + for (const bool pre_buffer : {false, true}) { + auto maybe_file_batches = do_file_read(pre_buffer); + final_status &= maybe_file_batches.status(); + if (maybe_file_batches.ok()) { + // IPC file read successful: differential fuzzing with IPC stream reader, + // if possible. + // NOTE: some valid IPC files may not be readable as IPC streams, + // for example because of excess spacing between IPC messages. + // A regular IPC file writer would not produce them, but fuzzing might. + if (!maybe_stream_batches.has_value()) { + maybe_stream_batches = do_stream_read(); + final_status &= maybe_stream_batches->status(); + } + if (maybe_stream_batches->ok()) { + // XXX: in some rare cases, an IPC file might read unequal to the enclosed + // IPC stream, for example if the footer skips some batches or orders the + // batches differently. We should revisit this if the fuzzer generates such + // files. 
+ ARROW_CHECK_OK(CompareFuzzBatches(*maybe_file_batches, **maybe_stream_batches)); + } + } } - return st; + return final_status; } Status FuzzIpcTensorStream(const uint8_t* data, int64_t size) { diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 02e6b816c0b1..ceca6d9e4340 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -16,6 +16,7 @@ // under the License. #include +#include #include #include #include @@ -368,19 +369,27 @@ Status MakeRandomStringArray(int64_t length, bool include_nulls, MemoryPool* poo return builder.Finish(out); } -template -static Status MakeBinaryArrayWithUniqueValues(int64_t length, bool include_nulls, - MemoryPool* pool, - std::shared_ptr* out) { - BuilderType builder(pool); +template BuilderType> +static Result> MakeBinaryArrayWithUniqueValues( + BuilderType builder, int64_t length, bool include_nulls) { + if constexpr (std::is_base_of_v) { + // Try to emit several variadic buffers by choosing a small block size. 
+ builder.SetBlockSize(512); + } for (int64_t i = 0; i < length; ++i) { if (include_nulls && (i % 7 == 0)) { RETURN_NOT_OK(builder.AppendNull()); } else { - RETURN_NOT_OK(builder.Append(std::to_string(i))); + // Make sure that some strings are long enough to have non-inline binary views + const auto base = std::to_string(i); + std::string value; + for (int64_t j = 0; j < 3 * (i % 10); ++j) { + value += base; + } + RETURN_NOT_OK(builder.Append(value)); } } - return builder.Finish(out); + return builder.Finish(); } Status MakeStringTypesRecordBatch(std::shared_ptr* out, bool with_nulls, @@ -390,22 +399,22 @@ Status MakeStringTypesRecordBatch(std::shared_ptr* out, bool with_n ArrayVector arrays; FieldVector fields; - auto AppendColumn = [&](auto& MakeArray) { - arrays.emplace_back(); - RETURN_NOT_OK(MakeArray(length, with_nulls, default_memory_pool(), &arrays.back())); - - const auto& type = arrays.back()->type(); - fields.push_back(field(type->ToString(), type)); + auto AppendColumn = [&](auto builder) { + ARROW_ASSIGN_OR_RAISE(auto array, MakeBinaryArrayWithUniqueValues( + std::move(builder), length, with_nulls)); + arrays.push_back(array); + fields.push_back(field(array->type()->ToString(), array->type())); return Status::OK(); }; - RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); - RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); - RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); - RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + auto pool = default_memory_pool(); + RETURN_NOT_OK(AppendColumn(StringBuilder(pool))); + RETURN_NOT_OK(AppendColumn(BinaryBuilder(pool))); + RETURN_NOT_OK(AppendColumn(LargeStringBuilder(pool))); + RETURN_NOT_OK(AppendColumn(LargeBinaryBuilder(pool))); if (with_view_types) { - RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); - RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(StringViewBuilder(pool))); + 
RETURN_NOT_OK(AppendColumn(BinaryViewBuilder(pool))); } *out = RecordBatch::Make(schema(std::move(fields)), length, std::move(arrays)); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index cba484af1584..09a9aef89752 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -1493,7 +1493,7 @@ class PayloadFileWriter : public internal::IpcPayloadWriter, protected StreamBoo RETURN_NOT_OK(UpdatePosition()); // It is only necessary to align to 8-byte boundary at the start of the file - RETURN_NOT_OK(Write(kArrowMagicBytes, strlen(kArrowMagicBytes))); + RETURN_NOT_OK(Write(kArrowMagicBytes.data(), kArrowMagicBytes.size())); RETURN_NOT_OK(Align()); return Status::OK(); @@ -1521,7 +1521,7 @@ class PayloadFileWriter : public internal::IpcPayloadWriter, protected StreamBoo RETURN_NOT_OK(Write(&footer_length, sizeof(int32_t))); // Write magic bytes to end file - return Write(kArrowMagicBytes, strlen(kArrowMagicBytes)); + return Write(kArrowMagicBytes.data(), kArrowMagicBytes.size()); } protected: diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index 423a0123c058..ab2ce9cdc749 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -33,6 +33,7 @@ #include "arrow/json/parser.h" #include "arrow/json/rapidjson_defs.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" #include "arrow/visit_type_inline.h" @@ -110,20 +111,19 @@ struct GenerateImpl { return OK(writer.Double(val)); } - Status GenerateAscii(const DataType&) { - auto size = std::poisson_distribution<>{4}(e); - std::uniform_int_distribution gen_char(32, 126); // FIXME generate UTF8 - std::string s(size, '\0'); - for (char& ch : s) ch = static_cast(gen_char(e)); - return OK(writer.String(s.c_str())); + Status GenerateUtf8(const DataType&) { + auto num_codepoints = std::poisson_distribution<>{4}(e); + auto seed = 
std::uniform_int_distribution{}(e); + std::string s = RandomUtf8String(seed, num_codepoints); + return OK(writer.String(s)); } template enable_if_base_binary Visit(const T& t) { - return GenerateAscii(t); + return GenerateUtf8(t); } - Status Visit(const BinaryViewType& t) { return GenerateAscii(t); } + Status Visit(const BinaryViewType& t) { return GenerateUtf8(t); } template enable_if_list_like Visit(const T& t) { diff --git a/cpp/src/arrow/memory_pool_test.cc b/cpp/src/arrow/memory_pool_test.cc index 20006ebeb49a..0af1ed2d9eca 100644 --- a/cpp/src/arrow/memory_pool_test.cc +++ b/cpp/src/arrow/memory_pool_test.cc @@ -242,10 +242,10 @@ TEST(Jemalloc, GetAllocationStats) { // Check allocated stats change due to allocation ASSERT_NEAR(allocated - allocated0, 70000, 50000); - ASSERT_NEAR(active - active0, 100000, 90000); - ASSERT_NEAR(metadata - metadata0, 500, 460); - ASSERT_NEAR(resident - resident0, 120000, 110000); - ASSERT_NEAR(mapped - mapped0, 100000, 90000); + ASSERT_GE(active - active0, allocated - allocated0); + ASSERT_GT(metadata, metadata0); + ASSERT_GE(resident - resident0, allocated - allocated0); + ASSERT_GE(mapped - mapped0, allocated - allocated0); ASSERT_NEAR(retained - retained0, 0, 40000); ASSERT_NEAR(thread_peak_read - thread_peak_read0, 1024, 700); diff --git a/cpp/src/arrow/meson.build b/cpp/src/arrow/meson.build index 48d01db729d7..cd113311c865 100644 --- a/cpp/src/arrow/meson.build +++ b/cpp/src/arrow/meson.build @@ -141,6 +141,7 @@ arrow_components = { 'extension_type.cc', 'extension/bool8.cc', 'extension/json.cc', + 'extension/parquet_variant.cc', 'extension/uuid.cc', 'pretty_print.cc', 'record_batch.cc', diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 1162b4c3bb0d..12e0f553b740 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -266,10 +266,13 @@ Result> RecordBatch::FromStructArray( namespace { Status ValidateColumnLength(const RecordBatch& batch, int i) { - const auto& array 
= *batch.column(i); - if (ARROW_PREDICT_FALSE(array.length() != batch.num_rows())) { + // This function is part of the validation code path and should + // be robust against invalid data, but `column()` would call MakeArray() + // that can abort on invalid data. + const auto& array = *batch.column_data(i); + if (ARROW_PREDICT_FALSE(array.length != batch.num_rows())) { return Status::Invalid("Number of rows in column ", i, - " did not match batch: ", array.length(), " vs ", + " did not match batch: ", array.length, " vs ", batch.num_rows()); } return Status::OK(); @@ -455,11 +458,12 @@ namespace { Status ValidateBatch(const RecordBatch& batch, bool full_validation) { for (int i = 0; i < batch.num_columns(); ++i) { RETURN_NOT_OK(ValidateColumnLength(batch, i)); - const auto& array = *batch.column(i); + // See ValidateColumnLength about avoiding a ArrayData -> Array conversion + const auto& array = *batch.column_data(i); const auto& schema_type = batch.schema()->field(i)->type(); - if (!array.type()->Equals(schema_type)) { + if (!array.type->Equals(schema_type)) { return Status::Invalid("Column ", i, - " type not match schema: ", array.type()->ToString(), " vs ", + " type not match schema: ", array.type->ToString(), " vs ", schema_type->ToString()); } const auto st = full_validation ? 
internal::ValidateArrayFull(array) diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index 4516b808a84f..a037d7261efb 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -318,7 +318,6 @@ TEST_F(TestRecordBatch, Validate) { auto a3 = gen.ArrayOf(int16(), 5); auto b1 = RecordBatch::Make(schema, length, {a0, a1, a2}); - ASSERT_OK(b1->ValidateFull()); // Length mismatch @@ -328,6 +327,21 @@ TEST_F(TestRecordBatch, Validate) { // Type mismatch auto b3 = RecordBatch::Make(schema, length, {a0, a1, a0}); ASSERT_RAISES(Invalid, b3->ValidateFull()); + + // Invalid column data (nulls in map key array) that would abort on MakeArray + auto map_field = field("f", map(utf8(), int32())); + schema = ::arrow::schema({map_field}); + auto map_key_data = ArrayFromJSON(utf8(), "[null]")->data(); + auto map_item_data = ArrayFromJSON(int32(), "[null]")->data(); + auto map_data = ArrayData::Make(map_field->type(), /*length=*/1, /*buffers=*/{nullptr}, + /*child_data=*/{map_key_data, map_item_data}); + + auto b4 = RecordBatch::Make(schema, /*num_rows=*/map_data->length, {map_data}); + ASSERT_RAISES(Invalid, b4->ValidateFull()); + + // Length mismatch with a column data that would also fail on MakeArray + auto b5 = RecordBatch::Make(schema, /*num_rows=*/1 + map_data->length, {map_data}); + ASSERT_RAISES(Invalid, b5->Validate()); } TEST_F(TestRecordBatch, Slice) { diff --git a/cpp/src/arrow/result.h b/cpp/src/arrow/result.h index 2b25de694864..a5e4f55db0f7 100644 --- a/cpp/src/arrow/result.h +++ b/cpp/src/arrow/result.h @@ -518,4 +518,9 @@ struct EnsureResult> { using type = Result; }; +template <> +struct EnsureResult { + using type = Status; +}; + } // namespace arrow diff --git a/cpp/src/arrow/result_test.cc b/cpp/src/arrow/result_test.cc index 794ef9b5dc9b..ad92841a6e70 100644 --- a/cpp/src/arrow/result_test.cc +++ b/cpp/src/arrow/result_test.cc @@ -636,6 +636,28 @@ TEST(ResultTest, MapFunctionToRrefError) { 
EXPECT_EQ(move_error.status(), error); // error is *not* replaced by other_error } +TEST(ResultTest, MapFunctionToStatus) { + static auto error = Status::Invalid("some error message"); + + const Result const_result(MoveOnlyDataType{kIntElement}); + auto const_mapped = + const_result.Map([](const MoveOnlyDataType& m) -> Status { return Status::OK(); }); + static_assert(std::is_same_v); + EXPECT_TRUE(const_mapped.ok()); + + auto move_mapped = Result(MoveOnlyDataType{kIntElement}) + .Map([](MoveOnlyDataType m) -> Status { return Status::OK(); }); + static_assert(std::is_same_v); + EXPECT_TRUE(move_mapped.ok()); + + const Result error_result(error); + auto error_mapped = + error_result.Map([](const MoveOnlyDataType& m) -> Status { return Status::OK(); }); + static_assert(std::is_same_v); + EXPECT_FALSE(error_mapped.ok()); + EXPECT_EQ(error_mapped, error); +} + // Verify that a Result is assignable to a Result, where T // is a type which has an implicit constructor taking a const U &. TEST(ResultTest, TemplateCopyAssign) { diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc index b84070b3d288..0852a0cdb898 100644 --- a/cpp/src/arrow/sparse_tensor.cc +++ b/cpp/src/arrow/sparse_tensor.cc @@ -405,13 +405,20 @@ SparseCSFIndex::SparseCSFIndex(const std::vector>& indpt std::string SparseCSFIndex::ToString() const { return std::string("SparseCSFIndex"); } bool SparseCSFIndex::Equals(const SparseCSFIndex& other) const { - for (int64_t i = 0; i < static_cast(indices().size()); ++i) { - if (!indices()[i]->Equals(*other.indices()[i])) return false; - } - for (int64_t i = 0; i < static_cast(indptr().size()); ++i) { - if (!indptr()[i]->Equals(*other.indptr()[i])) return false; - } - return axis_order() == other.axis_order(); + auto eq = [](const auto& a, const auto& b) { return a->Equals(*b); }; +// TODO: remove the use of std::equal when we no longer have partial C++20 support with +// CRAN. 
+#if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 201911L + return axis_order() == other.axis_order() && + std::ranges::equal(indices(), other.indices(), eq) && + std::ranges::equal(indptr(), other.indptr(), eq); +#else + return axis_order() == other.axis_order() && + std::equal(indices().begin(), indices().end(), other.indices().begin(), + other.indices().end(), eq) && + std::equal(indptr().begin(), indptr().end(), other.indptr().begin(), + other.indptr().end(), eq); +#endif } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index c9c28a11b1b3..434f4a1723c7 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -1641,10 +1641,32 @@ TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestNonAscendingShape) { ASSERT_TRUE(st->Equals(*sparse_tensor)); } +TYPED_TEST_P(TestSparseCSFTensorForIndexValueType, TestEqualityMismatchedDimensions) { + using IndexValueType = TypeParam; + using c_index_value_type = typename IndexValueType::c_type; + + // 2D vs 3D - comparing indices with different dimensionality + // 2D CSF: ndim=2, so indptr.size()=1, indices.size()=2 + std::vector axis_order_2D = {0, 1}; + std::vector> indptr_2D = {{0, 1}}; + std::vector> indices_2D = {{0}, {0}}; + auto si_2D = this->MakeSparseCSFIndex(axis_order_2D, indptr_2D, indices_2D); + + // 3D CSF: ndim=3, so indptr.size()=2, indices.size()=3 + std::vector axis_order_3D = {0, 1, 2}; + std::vector> indptr_3D = {{0, 1}, {0, 1}}; + std::vector> indices_3D = {{0}, {0}, {0}}; + auto si_3D = this->MakeSparseCSFIndex(axis_order_3D, indptr_3D, indices_3D); + + ASSERT_FALSE(si_2D->Equals(*si_3D)); + ASSERT_FALSE(si_3D->Equals(*si_2D)); + ASSERT_TRUE(si_2D->Equals(*si_2D)); +} + REGISTER_TYPED_TEST_SUITE_P(TestSparseCSFTensorForIndexValueType, TestCreateSparseTensor, TestTensorToSparseTensor, TestSparseTensorToTensor, TestAlternativeAxisOrder, 
TestNonAscendingShape, - TestRoundTrip); + TestRoundTrip, TestEqualityMismatchedDimensions); INSTANTIATE_TYPED_TEST_SUITE_P(TestInt8, TestSparseCSFTensorForIndexValueType, Int8Type); INSTANTIATE_TYPED_TEST_SUITE_P(TestUInt8, TestSparseCSFTensorForIndexValueType, diff --git a/cpp/src/arrow/status.cc b/cpp/src/arrow/status.cc index 55ce3fb78d25..4730bca8c6cf 100644 --- a/cpp/src/arrow/status.cc +++ b/cpp/src/arrow/status.cc @@ -13,6 +13,7 @@ #include "arrow/status.h" #include +#include #include #include #ifdef ARROW_EXTRA_ERROR_CONTEXT @@ -131,8 +132,25 @@ std::string Status::ToStringWithoutContextLines() const { if (last_new_line_position == std::string::npos) { break; } - // TODO: We may want to check /:\d+ / - if (message.find(":", last_new_line_position) == std::string::npos) { + // Check for the pattern ":\d+ " (colon followed by one or more digits and a space) + // to identify context lines in the format "filename:line expr" + auto colon_position = message.find(":", last_new_line_position); + if (colon_position == std::string::npos) { + break; + } + // Verify that the colon is followed by one or more digits and then a space + size_t pos = colon_position + 1; + if (pos >= message.size() || + !std::isdigit(static_cast(message[pos]))) { + break; + } + // Skip all digits + while (pos < message.size() && + std::isdigit(static_cast(message[pos]))) { + pos++; + } + // Check if followed by a space + if (pos >= message.size() || message[pos] != ' ') { break; } message = message.substr(0, last_new_line_position); diff --git a/cpp/src/arrow/status_test.cc b/cpp/src/arrow/status_test.cc index 39a52bd2bad1..72998cba78f9 100644 --- a/cpp/src/arrow/status_test.cc +++ b/cpp/src/arrow/status_test.cc @@ -342,4 +342,21 @@ TEST(StatusTest, ReturnIfNotOk) { ASSERT_EQ(StripContext(st.message()), "StatusLike: 43"); } +#ifdef ARROW_EXTRA_ERROR_CONTEXT +TEST(StatusTest, ToStringWithoutContextLines) { + Status status = Status::IOError("base error"); + status.AddContextLine("file1.cc", 
42, "expr"); + status.AddContextLine("file2.cc", 100, "expr"); + + ASSERT_EQ(status.ToStringWithoutContextLines(), "IOError: base error"); + + Status status2(StatusCode::Invalid, + "Error message\nThis line has: a colon but no digits"); + status2.AddContextLine("file.cc", 20, "expr"); + + ASSERT_EQ(status2.ToStringWithoutContextLines(), + "Invalid: Error message\nThis line has: a colon but no digits"); +} +#endif + } // namespace arrow diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 1acc47a99d4d..0e2cbdb644ac 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -660,21 +660,24 @@ LocaleGuard::LocaleGuard(const char* new_locale) : impl_(new Impl(new_locale)) { LocaleGuard::~LocaleGuard() {} -EnvVarGuard::EnvVarGuard(const std::string& name, const std::string& value) - : name_(name) { - auto maybe_value = arrow::internal::GetEnvVar(name); +EnvVarGuard::EnvVarGuard(std::string name, std::optional value) + : name_(std::move(name)) { + auto maybe_value = arrow::internal::GetEnvVar(name_); if (maybe_value.ok()) { - was_set_ = true; old_value_ = *std::move(maybe_value); } else { - was_set_ = false; + old_value_ = std::nullopt; + } + if (value.has_value()) { + ARROW_CHECK_OK(arrow::internal::SetEnvVar(name_, *value)); + } else { + ARROW_CHECK_OK(arrow::internal::DelEnvVar(name_)); } - ARROW_CHECK_OK(arrow::internal::SetEnvVar(name, value)); } EnvVarGuard::~EnvVarGuard() { - if (was_set_) { - ARROW_CHECK_OK(arrow::internal::SetEnvVar(name_, old_value_)); + if (old_value_.has_value()) { + ARROW_CHECK_OK(arrow::internal::SetEnvVar(name_, *old_value_)); } else { ARROW_CHECK_OK(arrow::internal::DelEnvVar(name_)); } diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 62bf907a2d89..b84d253a89e8 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -418,9 +418,11 @@ ARROW_TESTING_EXPORT void AssertChildExit(int 
child_pid, int expected_exit_status = 0); #endif -// A RAII-style object that switches to a new locale, and switches back -// to the old locale when going out of scope. Doesn't do anything if the -// new locale doesn't exist on the local machine. +// A RAII-style object that temporarily switches to a new locale +// +// The guard switches back to the old locale when going out of scope. +// It doesn't do anything if the new locale doesn't exist on the local machine. +// // ATTENTION: may crash with an assertion failure on Windows debug builds. // See ARROW-6108, also https://gerrit.libreoffice.org/#/c/54110/ class ARROW_TESTING_EXPORT LocaleGuard { @@ -433,15 +435,20 @@ class ARROW_TESTING_EXPORT LocaleGuard { std::unique_ptr impl_; }; +// A RAII-style object that temporarily sets an environment variable +// +// The guard restores the variable's previous value when going out of scope, +// or deletes the variable if it was not initially set. +// The environment variable can also be temporarily deleted if std::nullopt +// is passed instead of a string value. 
class ARROW_TESTING_EXPORT EnvVarGuard { public: - EnvVarGuard(const std::string& name, const std::string& value); + EnvVarGuard(std::string name, std::optional value); ~EnvVarGuard(); protected: - const std::string name_; - std::string old_value_; - bool was_set_; + std::string name_; + std::optional old_value_; }; namespace internal { diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index c50387e49094..f73dbd5bbf71 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -1475,4 +1475,81 @@ void rand_month_day_nanos(int64_t N, }); } +std::string RandomUtf8String(random::SeedType seed, int num_chars) { + arrow::random::pcg32 gen(seed); + std::string s; + s.reserve(num_chars * 3); // Reserve for average 3 bytes per codepoint + + std::uniform_int_distribution plane_dist(0, 3); + std::bernoulli_distribution bmp_range_dist(0.5); + std::uniform_int_distribution bmp_lower_dist(0x0020, 0xD7FF); + std::uniform_int_distribution bmp_upper_dist(0xE000, 0xFFFD); + std::uniform_int_distribution smp_dist(0x10000, 0x1FFFF); + std::uniform_int_distribution sip_dist(0x20000, 0x2FFFF); + std::uniform_int_distribution high_plane_dist(0x30000, 0x10FFFF); + + for (int i = 0; i < num_chars; ++i) { + uint32_t codepoint; + uint32_t plane = plane_dist(gen); + + if (plane == 0) { + // Basic Multilingual Plane (BMP): U+0000 to U+FFFF + // Exclude surrogate code points (U+D800 to U+DFFF) + // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.8, D71) + // Exclude control chars below U+0020 for readability + // Generate from two ranges with equal probability (overrepresents the smaller + // upper range): + // - Lower: U+0020 to U+D7FF (55,776 values, 50% selection probability) + // - Upper: U+E000 to U+FFFD (8,190 values, 50% selection probability) + if (bmp_range_dist(gen)) { + // Lower range: U+0020 to U+D7FF (before surrogate range) + codepoint = bmp_lower_dist(gen); + } else { + // Upper range: U+E000 to U+FFFD 
(after surrogate range) + // Note: Stops at U+FFFD to exclude noncharacters U+FFFE and U+FFFF + // Other noncharacters (U+FDD0-U+FDEF, plane-ending pairs) are included + // as they are valid Unicode scalar values per the Unicode Standard + codepoint = bmp_upper_dist(gen); + } + } else if (plane == 1) { + // Supplementary Multilingual Plane (SMP): U+10000 to U+1FFFF + // https://www.unicode.org/roadmaps/smp/ + codepoint = smp_dist(gen); + } else if (plane == 2) { + // Supplementary Ideographic Plane (SIP): U+20000 to U+2FFFF + // https://www.unicode.org/roadmaps/sip/ + codepoint = sip_dist(gen); + } else { + // Planes 3–16: U+30000–U+10FFFF + // Includes TIP, SSP, PUA-A, PUA-B, and unassigned planes: U+30000 to U+10FFFF + // Max valid Unicode codepoint is U+10FFFF per the Standard + // https://www.unicode.org/versions/Unicode15.1.0/ch03.pdf (Section 3.4, D9) + codepoint = high_plane_dist(gen); + } + + // Encode as UTF-8 per RFC 3629 (Section 3: UTF-8 definition) + // https://www.rfc-editor.org/rfc/rfc3629.html#section-3 + if (codepoint <= 0x7F) { + // 1-byte sequence: 0xxxxxxx + s.push_back(static_cast(codepoint)); + } else if (codepoint <= 0x7FF) { + // 2-byte sequence: 110xxxxx 10xxxxxx + s.push_back(static_cast(0xC0 | (codepoint >> 6))); + s.push_back(static_cast(0x80 | (codepoint & 0x3F))); + } else if (codepoint <= 0xFFFF) { + // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx + s.push_back(static_cast(0xE0 | (codepoint >> 12))); + s.push_back(static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + s.push_back(static_cast(0x80 | (codepoint & 0x3F))); + } else { + // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + s.push_back(static_cast(0xF0 | (codepoint >> 18))); + s.push_back(static_cast(0x80 | ((codepoint >> 12) & 0x3F))); + s.push_back(static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + s.push_back(static_cast(0x80 | (codepoint & 0x3F))); + } + } + return s; +} + } // namespace arrow diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h 
index d9122915a092..f820e643986d 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -729,6 +729,21 @@ ARROW_TESTING_EXPORT void rand_month_day_nanos(int64_t N, std::vector* out); +/// \brief Generate a random UTF-8 encoded string +/// +/// Generates a string with valid UTF-8 encoding from random Unicode scalar values. +/// The generated string contains num_chars code points sampled uniformly +/// across the Basic Multilingual Plane (BMP), Supplementary Multilingual Plane (SMP), +/// Supplementary Ideographic Plane (SIP), and higher planes (up to U+10FFFF). +/// Surrogate code points (U+D800-U+DFFF) are excluded as they are not valid +/// Unicode scalar values. +/// +/// \param[in] seed Random seed for reproducibility +/// \param[in] num_chars Number of Unicode code points to generate +/// \return a generated UTF-8 encoded string +ARROW_TESTING_EXPORT +std::string RandomUtf8String(random::SeedType seed, int num_chars); + template void randint(int64_t N, T lower, T upper, std::vector* out) { const int random_seed = 0; diff --git a/cpp/src/arrow/testing/uniform_real.h b/cpp/src/arrow/testing/uniform_real.h index 8aa04a83288d..4ad106188f27 100644 --- a/cpp/src/arrow/testing/uniform_real.h +++ b/cpp/src/arrow/testing/uniform_real.h @@ -25,6 +25,7 @@ #pragma once +#include #include #include @@ -39,8 +40,8 @@ namespace detail { template RealType generate_canonical(Rng& rng) { const size_t b = std::numeric_limits::digits; - const size_t log2R = 63 - ::arrow::bit_util::CountLeadingZeros( - static_cast(Rng::max() - Rng::min()) + 1); + const size_t log2R = + 63 - std::countl_zero(static_cast(Rng::max() - Rng::min()) + 1); const size_t k = b / log2R + (b % log2R != 0) + (b == 0); const RealType r = static_cast(Rng::max() - Rng::min()) + 1; RealType base = r; diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index cba4a0ecd3a3..b9fe6746f936 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -38,6 +38,7 @@ #include 
"arrow/result.h" #include "arrow/status.h" #include "arrow/table.h" +#include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" #include "arrow/util/hash_util.h" @@ -3552,4 +3553,16 @@ const std::vector& DecimalTypeIds() { return type_ids; } +Result> type_singleton(Type::type id) { + auto visit = [](auto type) -> Result> { + using T = std::decay_t; + if constexpr (TypeTraits::is_parameter_free) { + return TypeTraits::type_singleton(); + } + return Status::TypeError("Type ", internal::ToString(T::type_id), + " is not a parameter-free type"); + }; + return VisitTypeId(id, visit); +} + } // namespace arrow diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index f68d2dcb619d..5d41a45b6fe7 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -2575,6 +2575,16 @@ constexpr bool may_have_validity_bitmap(Type::type id) { } } +constexpr bool has_variadic_buffers(Type::type id) { + switch (id) { + case Type::BINARY_VIEW: + case Type::STRING_VIEW: + return true; + default: + return false; + } +} + ARROW_DEPRECATED("Deprecated in 17.0.0. Use may_have_validity_bitmap() instead.") constexpr bool HasValidityBitmap(Type::type id) { return may_have_validity_bitmap(id); } @@ -2635,4 +2645,17 @@ const std::vector>& PrimitiveTypes(); ARROW_EXPORT const std::vector& DecimalTypeIds(); +/// \brief Create a data type instance from a type ID for parameter-free types +/// +/// This function creates a data type instance for types that don't require +/// additional parameters (where TypeTraits::is_parameter_free is true). +/// For types that require parameters (like TimestampType or ListType), +/// this function will return an error. 
+/// +/// \param[in] id The type ID to create a type instance for +/// \return The type instance for the given type ID, +/// or a TypeError if the type requires parameters +ARROW_EXPORT +Result> type_singleton(Type::type id); + } // namespace arrow diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index e9b1d30e6e73..6197ad58eb40 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -33,6 +33,7 @@ #include "arrow/memory_pool.h" #include "arrow/table.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" #include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/type.h" @@ -50,6 +51,29 @@ TEST(TestTypeId, AllTypeIds) { ASSERT_EQ(static_cast(all_ids.size()), Type::MAX_ID); } +TEST(TestTypeSingleton, ParameterFreeTypes) { + // Test successful cases - parameter-free types (sample a few) + std::vector>> cases = { + {Type::NA, null()}, {Type::BOOL, boolean()}, {Type::INT32, int32()}, + {Type::STRING, utf8()}, {Type::DATE32, date32()}, + }; + + for (const auto& test_case : cases) { + ARROW_SCOPED_TRACE("Testing type: ", internal::ToString(test_case.first)); + auto result = type_singleton(test_case.first); + ASSERT_OK_AND_ASSIGN(auto type, result); + AssertTypeEqual(*type, *test_case.second); + } +} + +TEST(TestTypeSingleton, ParameterizedTypes) { + // Test error cases - parameterized types (test one representative) + auto result = type_singleton(Type::TIMESTAMP); + ASSERT_RAISES(TypeError, result); + EXPECT_THAT(result.status().message(), + testing::HasSubstr("is not a parameter-free type")); +} + template void CheckTypeIdReprs(ReprFunc&& repr_func, bool expect_uppercase) { std::unordered_set unique_reprs; diff --git a/cpp/src/arrow/util/align_util.h b/cpp/src/arrow/util/align_util.h index 71920e49f4aa..64eb1f7ba642 100644 --- a/cpp/src/arrow/util/align_util.h +++ b/cpp/src/arrow/util/align_util.h @@ -18,6 +18,7 @@ #pragma once #include +#include #include "arrow/memory_pool.h" #include 
"arrow/type_fwd.h" @@ -43,7 +44,7 @@ struct BitmapWordAlignParams { template inline BitmapWordAlignParams BitmapWordAlign(const uint8_t* data, int64_t bit_offset, int64_t length) { - static_assert(bit_util::IsPowerOf2(ALIGN_IN_BYTES), + static_assert(std::has_single_bit(ALIGN_IN_BYTES), "ALIGN_IN_BYTES should be a positive power of two"); constexpr uint64_t ALIGN_IN_BITS = ALIGN_IN_BYTES * 8; diff --git a/cpp/src/arrow/util/atfork_internal.cc b/cpp/src/arrow/util/atfork_internal.cc index 7772f1c62bea..fa3a09d0a2bd 100644 --- a/cpp/src/arrow/util/atfork_internal.cc +++ b/cpp/src/arrow/util/atfork_internal.cc @@ -34,6 +34,22 @@ namespace internal { namespace { +bool IsAtForkEnabled() { + static bool is_enabled = [] { + auto maybe_value = + GetEnvVarInteger("ARROW_REGISTER_ATFORK", /*min_value=*/0, /*max_value=*/1); + if (maybe_value.ok()) { + return *maybe_value != 0; + } + if (!maybe_value.status().IsKeyError()) { + maybe_value.status().Warn(); + } + // Enabled by default + return true; + }(); + return is_enabled; +} + // Singleton state for at-fork management. // We do not use global variables because of initialization order issues (ARROW-18383). // Instead, a function-local static ensures the state is initialized @@ -147,7 +163,11 @@ AtForkState* GetAtForkState() { }; // namespace void RegisterAtFork(std::weak_ptr weak_handler) { - GetAtForkState()->RegisterAtFork(std::move(weak_handler)); + // Only fetch the atfork state (and thus lazily call pthread_atfork) if enabled at all, + // to minimize potential nastiness with fork and threads. 
+ if (IsAtForkEnabled()) { + GetAtForkState()->RegisterAtFork(std::move(weak_handler)); + } } } // namespace internal diff --git a/cpp/src/arrow/util/atfork_test.cc b/cpp/src/arrow/util/atfork_test.cc index 97910f9539c0..ea9bdca53602 100644 --- a/cpp/src/arrow/util/atfork_test.cc +++ b/cpp/src/arrow/util/atfork_test.cc @@ -190,6 +190,9 @@ TEST_F(TestAtFork, SingleThread) { ASSERT_THAT(child_after_, ElementsAre()); } +// XXX we would like to test the ARROW_REGISTER_ATFORK environment variable, +// but that would require spawning a test subprocess + # if !(defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER) || \ defined(THREAD_SANITIZER)) diff --git a/cpp/src/arrow/util/basic_decimal.cc b/cpp/src/arrow/util/basic_decimal.cc index fc69bcf6f8ec..eddb1aae7b2d 100644 --- a/cpp/src/arrow/util/basic_decimal.cc +++ b/cpp/src/arrow/util/basic_decimal.cc @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -388,7 +389,7 @@ BasicDecimal64 operator%(const BasicDecimal64& left, const BasicDecimal64& right template int32_t SmallBasicDecimal::CountLeadingBinaryZeros() const { - return bit_util::CountLeadingZeros(static_cast>(value_)); + return std::countl_zero(static_cast>(value_)); } // same as kDecimal128PowersOfTen[38] - 1 @@ -892,7 +893,7 @@ static inline DecimalStatus DecimalDivide(const DecimalClass& dividend, // Normalize by shifting both by a multiple of 2 so that // the digit guessing is better. The requirement is that // divisor_array[0] is greater than 2**31. 
- int64_t normalize_bits = bit_util::CountLeadingZeros(divisor_array[0]); + int64_t normalize_bits = std::countl_zero(divisor_array[0]); ShiftArrayLeft(divisor_array, divisor_length, normalize_bits); ShiftArrayLeft(dividend_array, dividend_length, normalize_bits); @@ -1155,9 +1156,9 @@ int32_t BasicDecimal128::CountLeadingBinaryZeros() const { DCHECK_GE(*this, BasicDecimal128(0)); if (high_bits() == 0) { - return bit_util::CountLeadingZeros(low_bits()) + 64; + return std::countl_zero(low_bits()) + 64; } else { - return bit_util::CountLeadingZeros(static_cast(high_bits())); + return std::countl_zero(static_cast(high_bits())); } } diff --git a/cpp/src/arrow/util/bit_block_counter.h b/cpp/src/arrow/util/bit_block_counter.h index 73a1ee8600fb..82651a9d3877 100644 --- a/cpp/src/arrow/util/bit_block_counter.h +++ b/cpp/src/arrow/util/bit_block_counter.h @@ -18,6 +18,7 @@ #pragma once #include +#include #include #include #include @@ -130,10 +131,10 @@ class ARROW_EXPORT BitBlockCounter { if (bits_remaining_ < kFourWordsBits) { return GetBlockSlow(kFourWordsBits); } - total_popcount += bit_util::PopCount(LoadWord(bitmap_)); - total_popcount += bit_util::PopCount(LoadWord(bitmap_ + 8)); - total_popcount += bit_util::PopCount(LoadWord(bitmap_ + 16)); - total_popcount += bit_util::PopCount(LoadWord(bitmap_ + 24)); + total_popcount += std::popcount(LoadWord(bitmap_)); + total_popcount += std::popcount(LoadWord(bitmap_ + 8)); + total_popcount += std::popcount(LoadWord(bitmap_ + 16)); + total_popcount += std::popcount(LoadWord(bitmap_ + 24)); } else { // When the offset is > 0, we need there to be a word beyond the last // aligned word in the bitmap for the bit shifting logic. 
@@ -142,16 +143,16 @@ class ARROW_EXPORT BitBlockCounter { } auto current = LoadWord(bitmap_); auto next = LoadWord(bitmap_ + 8); - total_popcount += bit_util::PopCount(ShiftWord(current, next, offset_)); + total_popcount += std::popcount(ShiftWord(current, next, offset_)); current = next; next = LoadWord(bitmap_ + 16); - total_popcount += bit_util::PopCount(ShiftWord(current, next, offset_)); + total_popcount += std::popcount(ShiftWord(current, next, offset_)); current = next; next = LoadWord(bitmap_ + 24); - total_popcount += bit_util::PopCount(ShiftWord(current, next, offset_)); + total_popcount += std::popcount(ShiftWord(current, next, offset_)); current = next; next = LoadWord(bitmap_ + 32); - total_popcount += bit_util::PopCount(ShiftWord(current, next, offset_)); + total_popcount += std::popcount(ShiftWord(current, next, offset_)); } bitmap_ += bit_util::BytesForBits(kFourWordsBits); bits_remaining_ -= kFourWordsBits; @@ -175,15 +176,15 @@ class ARROW_EXPORT BitBlockCounter { if (bits_remaining_ < kWordBits) { return GetBlockSlow(kWordBits); } - popcount = bit_util::PopCount(LoadWord(bitmap_)); + popcount = std::popcount(LoadWord(bitmap_)); } else { // When the offset is > 0, we need there to be a word beyond the last // aligned word in the bitmap for the bit shifting logic. 
if (bits_remaining_ < 2 * kWordBits - offset_) { return GetBlockSlow(kWordBits); } - popcount = bit_util::PopCount( - ShiftWord(LoadWord(bitmap_), LoadWord(bitmap_ + 8), offset_)); + popcount = + std::popcount(ShiftWord(LoadWord(bitmap_), LoadWord(bitmap_ + 8), offset_)); } bitmap_ += kWordBits / 8; bits_remaining_ -= kWordBits; @@ -318,14 +319,13 @@ class ARROW_EXPORT BinaryBitBlockCounter { int64_t popcount = 0; if (left_offset_ == 0 && right_offset_ == 0) { - popcount = - bit_util::PopCount(Op::Call(LoadWord(left_bitmap_), LoadWord(right_bitmap_))); + popcount = std::popcount(Op::Call(LoadWord(left_bitmap_), LoadWord(right_bitmap_))); } else { auto left_word = ShiftWord(LoadWord(left_bitmap_), LoadWord(left_bitmap_ + 8), left_offset_); auto right_word = ShiftWord(LoadWord(right_bitmap_), LoadWord(right_bitmap_ + 8), right_offset_); - popcount = bit_util::PopCount(Op::Call(left_word, right_word)); + popcount = std::popcount(Op::Call(left_word, right_word)); } left_bitmap_ += kWordBits / 8; right_bitmap_ += kWordBits / 8; diff --git a/cpp/src/arrow/util/bit_run_reader.h b/cpp/src/arrow/util/bit_run_reader.h index 7bb00140279a..1a9638880c50 100644 --- a/cpp/src/arrow/util/bit_run_reader.h +++ b/cpp/src/arrow/util/bit_run_reader.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include #include @@ -93,7 +94,7 @@ class ARROW_EXPORT BitRunReader { return {/*length=*/0, false}; } // This implementation relies on a efficient implementations of - // CountTrailingZeros and assumes that runs are more often then + // std::countr_zero and assumes that runs are more often then // not. The logic is to incrementally find the next bit change // from the current position. 
This is done by zeroing all // bits in word_ up to position_ and using the TrailingZeroCount @@ -104,12 +105,12 @@ class ARROW_EXPORT BitRunReader { int64_t start_position = position_; int64_t start_bit_offset = start_position & 63; - // Invert the word for proper use of CountTrailingZeros and - // clear bits so CountTrailingZeros can do it magic. + // Invert the word for proper use of std::countr_zero and + // clear bits so std::countr_zero can do it magic. word_ = ~word_ & ~bit_util::LeastSignificantBitMask(start_bit_offset); // Go forward until the next change from unset to set. - int64_t new_bits = bit_util::CountTrailingZeros(word_) - start_bit_offset; + int64_t new_bits = std::countr_zero(word_) - start_bit_offset; position_ += new_bits; if (ARROW_PREDICT_FALSE(bit_util::IsMultipleOf64(position_)) && @@ -129,7 +130,7 @@ class ARROW_EXPORT BitRunReader { // Advance the position of the bitmap for loading. bitmap_ += sizeof(uint64_t); LoadNextWord(); - new_bits = bit_util::CountTrailingZeros(word_); + new_bits = std::countr_zero(word_); // Continue calculating run length. position_ += new_bits; } while (ARROW_PREDICT_FALSE(bit_util::IsMultipleOf64(position_)) && @@ -155,9 +156,9 @@ class ARROW_EXPORT BitRunReader { } // Two cases: - // 1. For unset, CountTrailingZeros works naturally so we don't + // 1. For unset, std::countr_zero works naturally so we don't // invert the word. - // 2. Otherwise invert so we can use CountTrailingZeros. + // 2. Otherwise invert so we can use std::countr_zero. 
if (current_run_bit_set_) { word_ = ~word_; } @@ -438,12 +439,12 @@ class BaseSetBitRunReader { template <> inline int BaseSetBitRunReader::CountFirstZeros(uint64_t word) { - return bit_util::CountTrailingZeros(word); + return std::countr_zero(word); } template <> inline int BaseSetBitRunReader::CountFirstZeros(uint64_t word) { - return bit_util::CountLeadingZeros(word); + return std::countl_zero(word); } template <> diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index c7849db871ac..0d2b2655ea31 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -17,20 +17,7 @@ #pragma once -#if defined(_MSC_VER) -# if defined(_M_AMD64) || defined(_M_X64) -# include // IWYU pragma: keep -# endif - -# pragma intrinsic(_BitScanReverse) -# pragma intrinsic(_BitScanForward) -# define ARROW_POPCOUNT64 __popcnt64 -# define ARROW_POPCOUNT32 __popcnt -#else -# define ARROW_POPCOUNT64 __builtin_popcountll -# define ARROW_POPCOUNT32 __builtin_popcount -#endif - +#include #include #include @@ -49,26 +36,6 @@ typename std::make_unsigned::type as_unsigned(Integer x) { namespace bit_util { -// The number of set bits in a given unsigned byte value, pre-computed -// -// Generated with the following Python code -// output = 'static constexpr uint8_t kBytePopcount[] = {{{0}}};' -// popcounts = [str(bin(i).count('1')) for i in range(0, 256)] -// print(output.format(', '.join(popcounts))) -static constexpr uint8_t kBytePopcount[] = { - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, - 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, - 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, - 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, - 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, - 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 
4, 4, 5, 4, 5, - 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, - 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; - -static inline uint64_t PopCount(uint64_t bitmap) { return ARROW_POPCOUNT64(bitmap); } -static inline uint32_t PopCount(uint32_t bitmap) { return ARROW_POPCOUNT32(bitmap); } - // // Bit-related computations on integer values // @@ -84,14 +51,6 @@ constexpr int64_t BytesForBits(int64_t bits) { return (bits >> 3) + ((bits & 7) != 0); } -constexpr bool IsPowerOf2(int64_t value) { - return value > 0 && (value & (value - 1)) == 0; -} - -constexpr bool IsPowerOf2(uint64_t value) { - return value > 0 && (value & (value - 1)) == 0; -} - // Returns the smallest power of two that contains v. If v is already a // power of two, it is returned as is. static inline int64_t NextPower2(int64_t n) { @@ -140,13 +99,10 @@ constexpr int64_t RoundDown(int64_t value, int64_t factor) { // The result is undefined on overflow, i.e. if `value > 2**64 - factor`, // since we cannot return the correct result which would be 2**64. constexpr int64_t RoundUpToPowerOf2(int64_t value, int64_t factor) { - // DCHECK(value >= 0); - // DCHECK(IsPowerOf2(factor)); return (value + (factor - 1)) & ~(factor - 1); } constexpr uint64_t RoundUpToPowerOf2(uint64_t value, uint64_t factor) { - // DCHECK(IsPowerOf2(factor)); return (value + (factor - 1)) & ~(factor - 1); } @@ -179,106 +135,10 @@ static inline uint64_t TrailingBits(uint64_t v, int num_bits) { return (v << n) >> n; } -/// \brief Count the number of leading zeros in an unsigned integer. 
-static inline int CountLeadingZeros(uint32_t value) { -#if defined(__clang__) || defined(__GNUC__) - if (value == 0) return 32; - return static_cast(__builtin_clz(value)); -#elif defined(_MSC_VER) - unsigned long index; // NOLINT - if (_BitScanReverse(&index, static_cast(value))) { // NOLINT - return 31 - static_cast(index); - } else { - return 32; - } -#else - int bitpos = 0; - while (value != 0) { - value >>= 1; - ++bitpos; - } - return 32 - bitpos; -#endif -} - -static inline int CountLeadingZeros(uint64_t value) { -#if defined(__clang__) || defined(__GNUC__) - if (value == 0) return 64; - return static_cast(__builtin_clzll(value)); -#elif defined(_MSC_VER) - unsigned long index; // NOLINT - if (_BitScanReverse64(&index, value)) { // NOLINT - return 63 - static_cast(index); - } else { - return 64; - } -#else - int bitpos = 0; - while (value != 0) { - value >>= 1; - ++bitpos; - } - return 64 - bitpos; -#endif -} - -static inline int CountTrailingZeros(uint32_t value) { -#if defined(__clang__) || defined(__GNUC__) - if (value == 0) return 32; - return static_cast(__builtin_ctzl(value)); -#elif defined(_MSC_VER) - unsigned long index; // NOLINT - if (_BitScanForward(&index, value)) { - return static_cast(index); - } else { - return 32; - } -#else - int bitpos = 0; - if (value) { - while (value & 1 == 0) { - value >>= 1; - ++bitpos; - } - } else { - bitpos = 32; - } - return bitpos; -#endif -} - -static inline int CountTrailingZeros(uint64_t value) { -#if defined(__clang__) || defined(__GNUC__) - if (value == 0) return 64; - return static_cast(__builtin_ctzll(value)); -#elif defined(_MSC_VER) - unsigned long index; // NOLINT - if (_BitScanForward64(&index, value)) { - return static_cast(index); - } else { - return 64; - } -#else - int bitpos = 0; - if (value) { - while (value & 1 == 0) { - value >>= 1; - ++bitpos; - } - } else { - bitpos = 64; - } - return bitpos; -#endif -} - -// Returns the minimum number of bits needed to represent an unsigned value -static 
inline int NumRequiredBits(uint64_t x) { return 64 - CountLeadingZeros(x); } - // Returns ceil(log2(x)). static inline int Log2(uint64_t x) { // DCHECK_GT(x, 0); - return NumRequiredBits(x - 1); + return std::bit_width(x - 1); } // diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index 13aa319d7068..1e7714540eeb 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -739,79 +739,10 @@ TEST(BitUtil, Log2) { EXPECT_EQ(bit_util::Log2(ULLONG_MAX), 64); } -TEST(BitUtil, NumRequiredBits) { - EXPECT_EQ(bit_util::NumRequiredBits(0), 0); - EXPECT_EQ(bit_util::NumRequiredBits(1), 1); - EXPECT_EQ(bit_util::NumRequiredBits(2), 2); - EXPECT_EQ(bit_util::NumRequiredBits(3), 2); - EXPECT_EQ(bit_util::NumRequiredBits(4), 3); - EXPECT_EQ(bit_util::NumRequiredBits(5), 3); - EXPECT_EQ(bit_util::NumRequiredBits(7), 3); - EXPECT_EQ(bit_util::NumRequiredBits(8), 4); - EXPECT_EQ(bit_util::NumRequiredBits(9), 4); - EXPECT_EQ(bit_util::NumRequiredBits(UINT_MAX - 1), 32); - EXPECT_EQ(bit_util::NumRequiredBits(UINT_MAX), 32); - EXPECT_EQ(bit_util::NumRequiredBits(static_cast(UINT_MAX) + 1), 33); - EXPECT_EQ(bit_util::NumRequiredBits(ULLONG_MAX / 2), 63); - EXPECT_EQ(bit_util::NumRequiredBits(ULLONG_MAX / 2 + 1), 64); - EXPECT_EQ(bit_util::NumRequiredBits(ULLONG_MAX - 1), 64); - EXPECT_EQ(bit_util::NumRequiredBits(ULLONG_MAX), 64); -} - #define U32(x) static_cast(x) #define U64(x) static_cast(x) #define S64(x) static_cast(x) -TEST(BitUtil, CountLeadingZeros) { - EXPECT_EQ(bit_util::CountLeadingZeros(U32(0)), 32); - EXPECT_EQ(bit_util::CountLeadingZeros(U32(1)), 31); - EXPECT_EQ(bit_util::CountLeadingZeros(U32(2)), 30); - EXPECT_EQ(bit_util::CountLeadingZeros(U32(3)), 30); - EXPECT_EQ(bit_util::CountLeadingZeros(U32(4)), 29); - EXPECT_EQ(bit_util::CountLeadingZeros(U32(7)), 29); - EXPECT_EQ(bit_util::CountLeadingZeros(U32(8)), 28); - EXPECT_EQ(bit_util::CountLeadingZeros(U32(UINT_MAX / 2)), 1); - 
EXPECT_EQ(bit_util::CountLeadingZeros(U32(UINT_MAX / 2 + 1)), 0); - EXPECT_EQ(bit_util::CountLeadingZeros(U32(UINT_MAX)), 0); - - EXPECT_EQ(bit_util::CountLeadingZeros(U64(0)), 64); - EXPECT_EQ(bit_util::CountLeadingZeros(U64(1)), 63); - EXPECT_EQ(bit_util::CountLeadingZeros(U64(2)), 62); - EXPECT_EQ(bit_util::CountLeadingZeros(U64(3)), 62); - EXPECT_EQ(bit_util::CountLeadingZeros(U64(4)), 61); - EXPECT_EQ(bit_util::CountLeadingZeros(U64(7)), 61); - EXPECT_EQ(bit_util::CountLeadingZeros(U64(8)), 60); - EXPECT_EQ(bit_util::CountLeadingZeros(U64(UINT_MAX)), 32); - EXPECT_EQ(bit_util::CountLeadingZeros(U64(UINT_MAX) + 1), 31); - EXPECT_EQ(bit_util::CountLeadingZeros(U64(ULLONG_MAX / 2)), 1); - EXPECT_EQ(bit_util::CountLeadingZeros(U64(ULLONG_MAX / 2 + 1)), 0); - EXPECT_EQ(bit_util::CountLeadingZeros(U64(ULLONG_MAX)), 0); -} - -TEST(BitUtil, CountTrailingZeros) { - EXPECT_EQ(bit_util::CountTrailingZeros(U32(0)), 32); - EXPECT_EQ(bit_util::CountTrailingZeros(U32(1) << 31), 31); - EXPECT_EQ(bit_util::CountTrailingZeros(U32(1) << 30), 30); - EXPECT_EQ(bit_util::CountTrailingZeros(U32(1) << 29), 29); - EXPECT_EQ(bit_util::CountTrailingZeros(U32(1) << 28), 28); - EXPECT_EQ(bit_util::CountTrailingZeros(U32(8)), 3); - EXPECT_EQ(bit_util::CountTrailingZeros(U32(4)), 2); - EXPECT_EQ(bit_util::CountTrailingZeros(U32(2)), 1); - EXPECT_EQ(bit_util::CountTrailingZeros(U32(1)), 0); - EXPECT_EQ(bit_util::CountTrailingZeros(U32(ULONG_MAX)), 0); - - EXPECT_EQ(bit_util::CountTrailingZeros(U64(0)), 64); - EXPECT_EQ(bit_util::CountTrailingZeros(U64(1) << 63), 63); - EXPECT_EQ(bit_util::CountTrailingZeros(U64(1) << 62), 62); - EXPECT_EQ(bit_util::CountTrailingZeros(U64(1) << 61), 61); - EXPECT_EQ(bit_util::CountTrailingZeros(U64(1) << 60), 60); - EXPECT_EQ(bit_util::CountTrailingZeros(U64(8)), 3); - EXPECT_EQ(bit_util::CountTrailingZeros(U64(4)), 2); - EXPECT_EQ(bit_util::CountTrailingZeros(U64(2)), 1); - EXPECT_EQ(bit_util::CountTrailingZeros(U64(1)), 0); - 
EXPECT_EQ(bit_util::CountTrailingZeros(U64(ULLONG_MAX)), 0); -} - TEST(BitUtil, RoundUpToPowerOf2) { EXPECT_EQ(bit_util::RoundUpToPowerOf2(S64(7), 8), 8); EXPECT_EQ(bit_util::RoundUpToPowerOf2(S64(8), 8), 8); diff --git a/cpp/src/arrow/util/bitmap_ops.cc b/cpp/src/arrow/util/bitmap_ops.cc index ce2224f2f669..cc24146ae94e 100644 --- a/cpp/src/arrow/util/bitmap_ops.cc +++ b/cpp/src/arrow/util/bitmap_ops.cc @@ -61,10 +61,10 @@ int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length) { // Unroll the loop for better performance for (int64_t i = 0; i < words_rounded; i += kCountUnrollFactor) { // (hand-unrolled as some gcc versions would unnest a nested `for` loop) - count_unroll[0] += bit_util::PopCount(u64_data[0]); - count_unroll[1] += bit_util::PopCount(u64_data[1]); - count_unroll[2] += bit_util::PopCount(u64_data[2]); - count_unroll[3] += bit_util::PopCount(u64_data[3]); + count_unroll[0] += std::popcount(u64_data[0]); + count_unroll[1] += std::popcount(u64_data[1]); + count_unroll[2] += std::popcount(u64_data[2]); + count_unroll[3] += std::popcount(u64_data[3]); u64_data += kCountUnrollFactor; } for (int64_t k = 0; k < kCountUnrollFactor; k++) { @@ -73,7 +73,7 @@ int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length) { // The trailing part for (; u64_data < end; ++u64_data) { - count += bit_util::PopCount(*u64_data); + count += std::popcount(*u64_data); } } diff --git a/cpp/src/arrow/util/bitmap_reader_benchmark.cc b/cpp/src/arrow/util/bitmap_reader_benchmark.cc index b3c199ec3bd5..3563ba75ad66 100644 --- a/cpp/src/arrow/util/bitmap_reader_benchmark.cc +++ b/cpp/src/arrow/util/bitmap_reader_benchmark.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+#include #include #include #include @@ -88,9 +89,9 @@ static void BitmapWordReaderBench(benchmark::State& state) { // if (word == UINT64_MAX) { // set_bits += sizeof(uint64_t) * 8; // } else if (word) { - // set_bits += PopCount(word); + // set_bits += std::popcount(word); // } - set_bits += PopCount(word); + set_bits += std::popcount(word); benchmark::DoNotOptimize(set_bits); } @@ -98,7 +99,7 @@ static void BitmapWordReaderBench(benchmark::State& state) { while (cnt--) { int valid_bits; const auto& byte = static_cast(counter.NextTrailingByte(valid_bits)); - set_bits += PopCount(kPrecedingBitmask[valid_bits] & byte); + set_bits += std::popcount(kPrecedingBitmask[valid_bits] & byte); benchmark::DoNotOptimize(set_bits); } benchmark::ClobberMemory(); diff --git a/cpp/src/arrow/util/bitmap_writer.h b/cpp/src/arrow/util/bitmap_writer.h index c9ce8012f3eb..8c47793fdebb 100644 --- a/cpp/src/arrow/util/bitmap_writer.h +++ b/cpp/src/arrow/util/bitmap_writer.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include @@ -112,7 +113,7 @@ class FirstTimeBitmapWriter { // Update state variables except for current_byte_ here. 
position_ += number_of_bits; - int64_t bit_offset = bit_util::CountTrailingZeros(static_cast(bit_mask_)); + int64_t bit_offset = std::countr_zero(static_cast(bit_mask_)); bit_mask_ = bit_util::kBitmask[(bit_offset + number_of_bits) % 8]; byte_offset_ += (bit_offset + number_of_bits) / 8; diff --git a/cpp/src/arrow/util/cpu_info.cc b/cpp/src/arrow/util/cpu_info.cc index fdd0728c8e7a..b858da4a3617 100644 --- a/cpp/src/arrow/util/cpu_info.cc +++ b/cpp/src/arrow/util/cpu_info.cc @@ -184,17 +184,19 @@ void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, } bool zmm_enabled = false; + bool ymm_enabled = false; if (features_ECX[27]) { // OSXSAVE - // Query if the OS supports saving ZMM registers when switching contexts + // Query if the OS supports saving YMM and ZMM registers when switching contexts int64_t xcr0 = _xgetbv(0); zmm_enabled = (xcr0 & 0xE0) == 0xE0; + ymm_enabled = (xcr0 & 0b110) == 0b110; } if (features_ECX[9]) *hardware_flags |= CpuInfo::SSSE3; if (features_ECX[19]) *hardware_flags |= CpuInfo::SSE4_1; if (features_ECX[20]) *hardware_flags |= CpuInfo::SSE4_2; if (features_ECX[23]) *hardware_flags |= CpuInfo::POPCNT; - if (features_ECX[28]) *hardware_flags |= CpuInfo::AVX; + if (ymm_enabled && features_ECX[28]) *hardware_flags |= CpuInfo::AVX; // cpuid with EAX=7, ECX=0: Extended Features register_EAX_id = 7; @@ -203,10 +205,11 @@ void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, std::bitset<32> features_EBX = cpu_info[1]; if (features_EBX[3]) *hardware_flags |= CpuInfo::BMI1; - if (features_EBX[5]) *hardware_flags |= CpuInfo::AVX2; if (features_EBX[8]) *hardware_flags |= CpuInfo::BMI2; + // Only use AVX/AVX2 if enabled by the OS + if (ymm_enabled && features_EBX[5]) *hardware_flags |= CpuInfo::AVX2; // ARROW-11427: only use AVX512 if enabled by the OS - if (zmm_enabled) { + if (ymm_enabled && zmm_enabled) { if (features_EBX[16]) *hardware_flags |= CpuInfo::AVX512F; if (features_EBX[17]) *hardware_flags |= 
CpuInfo::AVX512DQ; if (features_EBX[28]) *hardware_flags |= CpuInfo::AVX512CD; diff --git a/cpp/src/arrow/util/fuzz_internal.cc b/cpp/src/arrow/util/fuzz_internal.cc index 935089b2bc96..28d210333dda 100644 --- a/cpp/src/arrow/util/fuzz_internal.cc +++ b/cpp/src/arrow/util/fuzz_internal.cc @@ -36,17 +36,16 @@ MemoryPool* fuzzing_memory_pool() { void LogFuzzStatus(const Status& st, const uint8_t* data, int64_t size) { static const int kVerbosity = []() { - auto maybe_env_value = GetEnvVar("ARROW_FUZZING_VERBOSITY"); - if (maybe_env_value.status().IsKeyError()) { - return 0; + auto maybe_env_value = + GetEnvVarInteger("ARROW_FUZZING_VERBOSITY", /*min_value=*/0, /*max_value=*/1); + if (maybe_env_value.ok()) { + return static_cast(*maybe_env_value); } - auto env_value = std::move(maybe_env_value).ValueOrDie(); - int32_t value; - if (!ParseValue(env_value.data(), env_value.length(), &value)) { - Status::Invalid("Invalid value for ARROW_FUZZING_VERBOSITY: '", env_value, "'") - .Abort(); + if (!maybe_env_value.status().IsKeyError()) { + maybe_env_value.status().Abort(); } - return value; + // Quiet by default + return 0; }(); if (!st.ok() && kVerbosity >= 1) { diff --git a/cpp/src/arrow/util/int_util_overflow.h b/cpp/src/arrow/util/int_util_overflow.h index 93066fecafa0..69714a935a48 100644 --- a/cpp/src/arrow/util/int_util_overflow.h +++ b/cpp/src/arrow/util/int_util_overflow.h @@ -18,7 +18,9 @@ #pragma once #include +#include #include +#include #include #include "arrow/status.h" @@ -162,6 +164,37 @@ NON_GENERIC_OPS_WITH_OVERFLOW(DivideWithOverflow) #undef NON_GENERIC_OPS_WITH_OVERFLOW #undef NON_GENERIC_OP_WITH_OVERFLOW +// Convenience functions over an arbitrary number of arguments +template +std::optional AddWithOverflow(std::initializer_list vs) { + if (vs.size() == 0) { + return {}; + } + auto it = vs.begin(); + Int v = *it++; + while (it != vs.end()) { + if (ARROW_PREDICT_FALSE(AddWithOverflowGeneric(v, *it++, &v))) { + return {}; + } + } + return v; +} + +template 
+std::optional MultiplyWithOverflow(std::initializer_list vs) { + if (vs.size() == 0) { + return {}; + } + auto it = vs.begin(); + Int v = *it++; + while (it != vs.end()) { + if (ARROW_PREDICT_FALSE(MultiplyWithOverflowGeneric(v, *it++, &v))) { + return {}; + } + } + return v; +} + // Define function NegateWithOverflow with the signature `bool(T u, T* out)` // where T is a signed integer type. On overflow, these functions return true. // Otherwise, false is returned and `out` is updated with the result of the diff --git a/cpp/src/arrow/util/int_util_test.cc b/cpp/src/arrow/util/int_util_test.cc index 7217c1097e48..cffa4e9d15eb 100644 --- a/cpp/src/arrow/util/int_util_test.cc +++ b/cpp/src/arrow/util/int_util_test.cc @@ -649,5 +649,23 @@ TYPED_TEST(TestAddWithOverflow, Basics) { this->CheckOk(almost_min, almost_max + T{2}, T{1}); } +TEST(AddWithOverflow, Variadic) { + ASSERT_EQ(AddWithOverflow({}), std::nullopt); + ASSERT_EQ(AddWithOverflow({1, 2, 3}), 6); + ASSERT_EQ(AddWithOverflow({1, 2, 125}), std::nullopt); + ASSERT_EQ(AddWithOverflow({125, 2, 1}), std::nullopt); + ASSERT_EQ(AddWithOverflow({1, 2, 125}), 128); + ASSERT_EQ(AddWithOverflow({125, 2, 1}), 128); +} + +TEST(MultiplyWithOverflow, Variadic) { + ASSERT_EQ(MultiplyWithOverflow({}), std::nullopt); + ASSERT_EQ(MultiplyWithOverflow({1, 2, 3, 4}), 24); + ASSERT_EQ(MultiplyWithOverflow({2, 2, 32}), std::nullopt); + ASSERT_EQ(MultiplyWithOverflow({32, 4, 1}), std::nullopt); + ASSERT_EQ(MultiplyWithOverflow({2, 2, 32}), 128); + ASSERT_EQ(MultiplyWithOverflow({32, 4, 1}), 128); +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/io_util.cc b/cpp/src/arrow/util/io_util.cc index b3ef48d29651..03acd8297d41 100644 --- a/cpp/src/arrow/util/io_util.cc +++ b/cpp/src/arrow/util/io_util.cc @@ -99,6 +99,7 @@ #include "arrow/util/io_util.h" #include "arrow/util/logging_internal.h" #include "arrow/util/mutex.h" +#include "arrow/util/value_parsing.h" // For filename conversion #if 
defined(_WIN32) @@ -1762,19 +1763,28 @@ Result GetEnvVar(std::string_view name) { #ifdef _WIN32 // On Windows, getenv() reads an early copy of the process' environment // which doesn't get updated when SetEnvironmentVariable() is called. - constexpr int32_t bufsize = 2000; - char c_str[bufsize]; - auto res = GetEnvironmentVariableA(name.data(), c_str, bufsize); - if (res >= bufsize) { - return Status::CapacityError("environment variable value too long"); - } else if (res == 0) { - return Status::KeyError("environment variable '", name, "'undefined"); - } - return std::string(c_str); + std::string value(100, '\0'); + + uint32_t res = GetEnvironmentVariableA(name.data(), value.data(), + static_cast(value.size())); + if (res >= value.size()) { + // Value buffer too small, need to upsize + // (`res` includes the null-terminating character in this case) + value.resize(res); + res = GetEnvironmentVariableA(name.data(), value.data(), + static_cast(value.size())); + } + if (res == 0) { + return Status::KeyError("environment variable '", name, "' undefined"); + } + // On success, `res` does not include the null-terminating character + DCHECK_EQ(value.data()[res], 0); + value.resize(res); + return value; #else char* c_str = getenv(name.data()); if (c_str == nullptr) { - return Status::KeyError("environment variable '", name, "'undefined"); + return Status::KeyError("environment variable '", name, "' undefined"); } return std::string(c_str); #endif @@ -1782,18 +1792,25 @@ Result GetEnvVar(std::string_view name) { #ifdef _WIN32 Result GetEnvVarNative(std::string_view name) { - NativePathString w_name; - constexpr int32_t bufsize = 2000; - wchar_t w_str[bufsize]; + ARROW_ASSIGN_OR_RAISE(std::wstring w_name, StringToNative(name)); + std::wstring value(100, '\0'); - ARROW_ASSIGN_OR_RAISE(w_name, StringToNative(name)); - auto res = GetEnvironmentVariableW(w_name.c_str(), w_str, bufsize); - if (res >= bufsize) { - return Status::CapacityError("environment variable value too long"); 
- } else if (res == 0) { - return Status::KeyError("environment variable '", name, "'undefined"); + uint32_t res = GetEnvironmentVariableW(w_name.data(), value.data(), + static_cast(value.size())); + if (res >= value.size()) { + // Value buffer too small, need to upsize + // (`res` includes the null-terminating character in this case) + value.resize(res); + res = GetEnvironmentVariableW(w_name.data(), value.data(), + static_cast(value.size())); + } + if (res == 0) { + return Status::KeyError("environment variable '", name, "' undefined"); } - return NativePathString(w_str); + // On success, `res` does not include the null-terminating character + DCHECK_EQ(value.data()[res], 0); + value.resize(res); + return value; } #else @@ -1804,6 +1821,18 @@ Result GetEnvVarNative(std::string_view name) { #endif +Result GetEnvVarInteger(std::string_view name, std::optional min_value, + std::optional max_value) { + ARROW_ASSIGN_OR_RAISE(auto env_string, GetEnvVar(name)); + int64_t value; + if (!ParseValue(env_string.data(), env_string.length(), &value) || + (min_value.has_value() && value < *min_value) || + (max_value.has_value() && value > *max_value)) { + return Status::Invalid("Invalid value for ", name, ": '", env_string, "'"); + } + return value; +} + Status SetEnvVar(std::string_view name, std::string_view value) { #ifdef _WIN32 if (SetEnvironmentVariableA(name.data(), value.data())) { diff --git a/cpp/src/arrow/util/io_util.h b/cpp/src/arrow/util/io_util.h index 56bd4eff3d66..fa53c0dc67a6 100644 --- a/cpp/src/arrow/util/io_util.h +++ b/cpp/src/arrow/util/io_util.h @@ -244,6 +244,12 @@ ARROW_EXPORT Result GetEnvVar(std::string_view name); ARROW_EXPORT Result GetEnvVarNative(std::string_view name); +// Returns KeyError if the environment variable doesn't exist, +// Invalid if it's not a valid integer in the given range. 
+ARROW_EXPORT +Result GetEnvVarInteger(std::string_view name, + std::optional min_value = {}, + std::optional max_value = {}); ARROW_EXPORT Status SetEnvVar(std::string_view name, std::string_view value); diff --git a/cpp/src/arrow/util/io_util_test.cc b/cpp/src/arrow/util/io_util_test.cc index de8458dc1171..44188b3f2ee9 100644 --- a/cpp/src/arrow/util/io_util_test.cc +++ b/cpp/src/arrow/util/io_util_test.cc @@ -1134,5 +1134,44 @@ TEST(CpuAffinity, NumberOfCores) { #endif } +TEST(Environment, GetEnvVar) { + // An environment variable that should exist on roughly all platforms + ASSERT_OK_AND_ASSIGN(auto v, GetEnvVar("PATH")); + ASSERT_FALSE(v.empty()); + ASSERT_OK_AND_ASSIGN(auto w, GetEnvVarNative("PATH")); + ASSERT_FALSE(w.empty()); + // An environment variable that most probably does not exist + ASSERT_RAISES(KeyError, GetEnvVar("BZZT_NONEXISTENT_VAR")); + ASSERT_RAISES(KeyError, GetEnvVarNative("BZZT_NONEXISTENT_VAR")); + // (we try not to rely on EnvVarGuard here as that would be circular) +} + +TEST(Environment, GetEnvVarInteger) { + { + EnvVarGuard guard("FOOBAR", "5"); + ASSERT_OK_AND_EQ(5, GetEnvVarInteger("FOOBAR")); + ASSERT_OK_AND_EQ(5, GetEnvVarInteger("FOOBAR", /*min_value=*/5, /*max_value=*/7)); + ASSERT_RAISES(Invalid, GetEnvVarInteger("FOOBAR", /*min_value=*/6, /*max_value=*/7)); + ASSERT_RAISES(Invalid, GetEnvVarInteger("FOOBAR", /*min_value=*/3, /*max_value=*/4)); + } + { + EnvVarGuard guard("FOOBAR", "BAZ"); + ASSERT_RAISES(Invalid, GetEnvVarInteger("FOOBAR")); + } + { + EnvVarGuard guard("FOOBAR", std::nullopt); + ASSERT_RAISES(KeyError, GetEnvVarInteger("FOOBAR")); + } +} + +TEST(Environment, SetEnvVar) { + EnvVarGuard guard("FOOBAR", "one"); + ASSERT_OK_AND_EQ("one", GetEnvVar("FOOBAR")); + ASSERT_OK(SetEnvVar("FOOBAR", "two")); + ASSERT_OK_AND_EQ("two", GetEnvVar("FOOBAR")); + ASSERT_OK(DelEnvVar("FOOBAR")); + ASSERT_RAISES(KeyError, GetEnvVar("FOOBAR")); +} + } // namespace internal } // namespace arrow diff --git 
a/cpp/src/arrow/util/mutex.cc b/cpp/src/arrow/util/mutex.cc index 353090b6dda9..01ae7215fc3a 100644 --- a/cpp/src/arrow/util/mutex.cc +++ b/cpp/src/arrow/util/mutex.cc @@ -59,31 +59,5 @@ Mutex::Guard Mutex::Lock() { Mutex::Mutex() : impl_(new Impl, [](Impl* impl) { delete impl; }) {} -#ifndef _WIN32 -namespace { - -struct AfterForkState { - // A global instance that will also register the atfork handler when - // constructed. - static AfterForkState instance; - - // The mutex may be used at shutdown, so make it eternal. - // The leak (only in child processes) is a small price to pay for robustness. - Mutex* mutex = nullptr; - - private: - AfterForkState() { - pthread_atfork(/*prepare=*/nullptr, /*parent=*/nullptr, /*child=*/&AfterFork); - } - - static void AfterFork() { instance.mutex = new Mutex; } -}; - -AfterForkState AfterForkState::instance; -} // namespace - -Mutex* GlobalForkSafeMutex() { return AfterForkState::instance.mutex; } -#endif // _WIN32 - } // namespace util } // namespace arrow diff --git a/cpp/src/arrow/util/mutex.h b/cpp/src/arrow/util/mutex.h index ac63cf70cd9a..f4fc64181fb1 100644 --- a/cpp/src/arrow/util/mutex.h +++ b/cpp/src/arrow/util/mutex.h @@ -60,26 +60,5 @@ class ARROW_EXPORT Mutex { std::unique_ptr impl_; }; -#ifndef _WIN32 -/// Return a pointer to a process-wide, process-specific Mutex that can be used -/// at any point in a child process. NULL is returned when called in the parent. -/// -/// The rule is to first check that getpid() corresponds to the parent process pid -/// and, if not, call this function to lock any after-fork reinitialization code. -/// Like this: -/// -/// std::atomic pid{getpid()}; -/// ... -/// if (pid.load() != getpid()) { -/// // In child process -/// auto lock = GlobalForkSafeMutex()->Lock(); -/// if (pid.load() != getpid()) { -/// // Reinitialize internal structures after fork -/// ... 
-/// pid.store(getpid()); -ARROW_EXPORT -Mutex* GlobalForkSafeMutex(); -#endif - } // namespace util } // namespace arrow diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index b2d4f7df6f1b..f77f91f6cda0 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -17,6 +17,7 @@ // From Apache Impala (incubating) as of 2016-01-29 +#include #include #include #include @@ -912,7 +913,7 @@ TEST(BitRle, Random) { } parity = !parity; } - if (!CheckRoundTrip(values, bit_util::NumRequiredBits(values.size()))) { + if (!CheckRoundTrip(values, std::bit_width(values.size()))) { FAIL() << "failing seed: " << seed; } } diff --git a/cpp/src/arrow/util/value_parsing.cc b/cpp/src/arrow/util/value_parsing.cc index 1a8e8066d703..0cc71f276df4 100644 --- a/cpp/src/arrow/util/value_parsing.cc +++ b/cpp/src/arrow/util/value_parsing.cc @@ -35,7 +35,10 @@ bool StringToFloat(const char* s, size_t length, char decimal_point, float* out) ::arrow_vendored::fast_float::chars_format::general, decimal_point}; const auto res = ::arrow_vendored::fast_float::from_chars_advanced(s, s + length, *out, options); - return res.ec == std::errc() && res.ptr == s + length; + const bool is_valid_number = + res.ec == std::errc() || res.ec == std::errc::result_out_of_range; + const bool consumed_entire_string = res.ptr == s + length; + return is_valid_number && consumed_entire_string; } bool StringToFloat(const char* s, size_t length, char decimal_point, double* out) { @@ -43,7 +46,10 @@ bool StringToFloat(const char* s, size_t length, char decimal_point, double* out ::arrow_vendored::fast_float::chars_format::general, decimal_point}; const auto res = ::arrow_vendored::fast_float::from_chars_advanced(s, s + length, *out, options); - return res.ec == std::errc() && res.ptr == s + length; + const bool is_valid_number = + res.ec == std::errc() || res.ec == std::errc::result_out_of_range; + const bool consumed_entire_string = 
res.ptr == s + length; + return is_valid_number && consumed_entire_string; } // Half float @@ -53,7 +59,10 @@ bool StringToFloat(const char* s, size_t length, char decimal_point, Float16* ou float temp_out; const auto res = ::arrow_vendored::fast_float::from_chars_advanced(s, s + length, temp_out, options); - const bool ok = res.ec == std::errc() && res.ptr == s + length; + const bool is_valid_number = + res.ec == std::errc() || res.ec == std::errc::result_out_of_range; + const bool consumed_entire_string = res.ptr == s + length; + const bool ok = is_valid_number && consumed_entire_string; if (ok) { *out = Float16::FromFloat(temp_out); } diff --git a/cpp/src/arrow/util/value_parsing_test.cc b/cpp/src/arrow/util/value_parsing_test.cc index b9e3b18444fa..b61f777685b7 100644 --- a/cpp/src/arrow/util/value_parsing_test.cc +++ b/cpp/src/arrow/util/value_parsing_test.cc @@ -141,6 +141,10 @@ TEST(StringConversion, ToFloat) { AssertConversion("0", 0.0f); AssertConversion("-0.0", -0.0f); AssertConversion("-1e20", -1e20f); + AssertConversion("4e38", std::numeric_limits::infinity()); + AssertConversion("-4e38", -std::numeric_limits::infinity()); + AssertConversion("1e-46", 0.0f); + AssertConversion("-1e-46", -0.0f); AssertConversion("+Infinity", std::numeric_limits::infinity()); AssertConversion("-Infinity", -std::numeric_limits::infinity()); AssertConversion("Infinity", std::numeric_limits::infinity()); @@ -166,6 +170,10 @@ TEST(StringConversion, ToDouble) { AssertConversion("0", 0); AssertConversion("-0.0", -0.0); AssertConversion("-1e100", -1e100); + AssertConversion("2e308", std::numeric_limits::infinity()); + AssertConversion("-2e308", -std::numeric_limits::infinity()); + AssertConversion("1e-325", 0.0); + AssertConversion("-1e-325", -0.0); AssertConversion("+Infinity", std::numeric_limits::infinity()); AssertConversion("-Infinity", -std::numeric_limits::infinity()); AssertConversion("Infinity", std::numeric_limits::infinity()); @@ -185,6 +193,10 @@ 
TEST(StringConversion, ToHalfFloat) { AssertConversion("0", Float16(0.0f)); AssertConversion("-0.0", Float16(-0.0f)); AssertConversion("-1e15", Float16(-1e15)); + AssertConversion("7e4", Float16::FromBits(0x7c00)); + AssertConversion("-7e4", Float16::FromBits(0xfc00)); + AssertConversion("1e-9", Float16(0.0f)); + AssertConversion("-1e-9", Float16(-0.0f)); AssertConversion("+Infinity", Float16::FromBits(0x7c00)); AssertConversion("-Infinity", Float16::FromBits(0xfc00)); AssertConversion("Infinity", Float16::FromBits(0x7c00)); diff --git a/cpp/src/arrow/vendored/whereami/whereami.cc b/cpp/src/arrow/vendored/whereami/whereami.cc index 945226193f99..94437361ec0c 100644 --- a/cpp/src/arrow/vendored/whereami/whereami.cc +++ b/cpp/src/arrow/vendored/whereami/whereami.cc @@ -159,7 +159,7 @@ WAI_NOINLINE WAI_FUNCSPEC int WAI_PREFIX(getModulePath)(char* out, int capacity, return length; } -#elif defined(__linux__) || defined(__CYGWIN__) || defined(__sun) || \ +#elif defined(__APPLE__) || defined(__linux__) || defined(__CYGWIN__) || defined(__sun) || \ defined(WAI_USE_PROC_SELF_EXE) # include diff --git a/cpp/src/arrow/visit_type_inline.h b/cpp/src/arrow/visit_type_inline.h index 30f5bb541621..84d162d15c7b 100644 --- a/cpp/src/arrow/visit_type_inline.h +++ b/cpp/src/arrow/visit_type_inline.h @@ -71,10 +71,8 @@ inline Status VisitTypeInline(const DataType& type, VISITOR* visitor, ARGS&&... /// \tparam ARGS Additional arguments, if any, will be passed to the Visit function after /// the `type` argument /// -/// Unlike VisitTypeInline which calls `visitor.Visit`, here `visitor` +/// Unlike VisitTypeInline which calls `visitor->Visit`, here `visitor` /// itself is called. -/// `visitor` must support a `const DataType&` argument as a fallback, -/// in addition to concrete type classes. /// /// The intent is for this to be called on a generic lambda /// that may internally use `if constexpr` or similar constructs. 
@@ -114,4 +112,32 @@ inline Status VisitTypeIdInline(Type::type id, VISITOR* visitor, ARGS&&... args) #undef TYPE_ID_VISIT_INLINE +#define TYPE_ID_VISIT_INLINE(TYPE_CLASS) \ + case TYPE_CLASS##Type::type_id: { \ + const TYPE_CLASS##Type* concrete_ptr = NULLPTR; \ + return std::forward(visitor)(concrete_ptr, std::forward(args)...); \ + } + +/// \brief Calls `visitor` with a nullptr of the corresponding concrete type class +/// \tparam ARGS Additional arguments, if any, will be passed to the Visit function after +/// the `type` argument +/// +/// Unlike VisitTypeIdInline which calls `visitor->Visit`, here `visitor` +/// itself is called. +/// +/// The intent is for this to be called on a generic lambda +/// that may internally use `if constexpr` or similar constructs. +template +inline auto VisitTypeId(Type::type id, VISITOR&& visitor, ARGS&&... args) + -> decltype(std::forward(visitor)(std::declval(), args...)) { + switch (id) { + ARROW_GENERATE_FOR_ALL_TYPES(TYPE_ID_VISIT_INLINE); + default: + break; + } + return Status::NotImplemented("Type not implemented"); +} + +#undef TYPE_ID_VISIT_INLINE + } // namespace arrow diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index a718a8006058..cc10eb352dbd 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -219,7 +219,10 @@ Status UseJITLinkIfEnabled(llvm::orc::LLJITBuilder& jit_builder) { Result> BuildJIT( llvm::orc::JITTargetMachineBuilder jtmb, - std::optional>& object_cache) { + std::shared_ptr target_machine, + std::optional> object_cache) { + auto data_layout = target_machine->createDataLayout(); + llvm::orc::LLJITBuilder jit_builder; #ifdef JIT_LINK_SUPPORTED @@ -227,20 +230,24 @@ Result> BuildJIT( #endif jit_builder.setJITTargetMachineBuilder(std::move(jtmb)); +#if LLVM_VERSION_MAJOR >= 16 + jit_builder.setDataLayout(std::make_optional(data_layout)); +#else + jit_builder.setDataLayout(llvm::Optional(data_layout)); +#endif + if (object_cache.has_value()) { 
jit_builder.setCompileFunctionCreator( - [&object_cache](llvm::orc::JITTargetMachineBuilder JTMB) + [tm = std::move(target_machine), + &object_cache](llvm::orc::JITTargetMachineBuilder JTMB) -> llvm::Expected> { - auto target_machine = JTMB.createTargetMachine(); - if (!target_machine) { - return target_machine.takeError(); - } // after compilation, the object code will be stored into the given object // cache - return std::make_unique( - std::move(*target_machine), &object_cache.value().get()); + return std::make_unique(*tm, + &object_cache.value().get()); }); } + auto maybe_jit = jit_builder.create(); ARROW_ASSIGN_OR_RAISE(auto jit, AsArrowResult(maybe_jit, "Could not create LLJIT instance: ")); @@ -317,7 +324,7 @@ void Engine::InitOnce() { Engine::Engine(const std::shared_ptr& conf, std::unique_ptr lljit, - std::unique_ptr target_machine, bool cached) + std::shared_ptr target_machine, bool cached) : context_(std::make_unique()), lljit_(std::move(lljit)), ir_builder_(std::make_unique>(*context_)), @@ -367,14 +374,21 @@ Result> Engine::Make( std::optional> object_cache) { std::call_once(llvm_init_once_flag, InitOnce); + // Create the target machine ARROW_ASSIGN_OR_RAISE(auto jtmb, MakeTargetMachineBuilder(*conf)); - ARROW_ASSIGN_OR_RAISE(auto jit, BuildJIT(jtmb, object_cache)); auto maybe_tm = jtmb.createTargetMachine(); ARROW_ASSIGN_OR_RAISE(auto target_machine, AsArrowResult(maybe_tm, "Could not create target machine: ")); + auto shared_target_machine = + std::shared_ptr(std::move(target_machine)); + + // Build the LLJIT instance + ARROW_ASSIGN_OR_RAISE(auto jit, + BuildJIT(std::move(jtmb), shared_target_machine, object_cache)); + std::unique_ptr engine{ - new Engine(conf, std::move(jit), std::move(target_machine), cached)}; + new Engine(conf, std::move(jit), std::move(shared_target_machine), cached)}; ARROW_RETURN_NOT_OK(engine->Init()); return engine; diff --git a/cpp/src/gandiva/engine.h b/cpp/src/gandiva/engine.h index 3c8914a7b4d9..20165787cb66 100644 --- 
a/cpp/src/gandiva/engine.h +++ b/cpp/src/gandiva/engine.h @@ -96,7 +96,7 @@ class GANDIVA_EXPORT Engine { private: Engine(const std::shared_ptr& conf, std::unique_ptr lljit, - std::unique_ptr target_machine, bool cached); + std::shared_ptr target_machine, bool cached); // Post construction init. This _must_ be called after the constructor. Status Init(); @@ -130,7 +130,9 @@ class GANDIVA_EXPORT Engine { bool functions_loaded_ = false; std::shared_ptr function_registry_; std::string module_ir_; - std::unique_ptr target_machine_; + // The lifetime of the TargetMachine is shared with LLJIT. This prevents unnecessary + // duplication of this expensive object. + std::shared_ptr target_machine_; const std::shared_ptr conf_; }; diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index 2bc6936d77b3..be57ce4f4768 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -432,7 +432,8 @@ std::vector GetStringFunctionRegistry() { NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), NativeFunction("binary_string", {}, DataTypeVector{utf8()}, binary(), - kResultNullIfNull, "binary_string", NativeFunction::kNeedsContext), + kResultNullIfNull, "binary_string", + NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), NativeFunction("left", {}, DataTypeVector{utf8(), int32()}, utf8(), kResultNullIfNull, "left_utf8_int32", NativeFunction::kNeedsContext), diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 0b31c769c99f..0b787f461c21 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -841,7 +841,12 @@ const char* repeat_utf8_int32(gdv_int64 context, const char* in, gdv_int32 in_le *out_len = 0; return ""; } - *out_len = repeat_number * in_len; + if (ARROW_PREDICT_FALSE( + arrow::internal::MultiplyWithOverflow(repeat_number, in_len, out_len))) { + 
gdv_fn_context_set_error_msg(context, "Would overflow maximum output size"); + *out_len = 0; + return ""; + } char* ret = reinterpret_cast(gdv_fn_context_arena_malloc(context, *out_len)); if (ret == nullptr) { gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); @@ -2252,6 +2257,11 @@ const char* right_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text FORCE_INLINE const char* binary_string(gdv_int64 context, const char* text, gdv_int32 text_len, gdv_int32* out_len) { + if (text_len == 0) { + *out_len = 0; + return ""; + } + gdv_binary ret = reinterpret_cast(gdv_fn_context_arena_malloc(context, text_len)); @@ -2261,11 +2271,6 @@ const char* binary_string(gdv_int64 context, const char* text, gdv_int32 text_le return ""; } - if (text_len == 0) { - *out_len = 0; - return ""; - } - // converting hex encoded string to normal string int j = 0; for (int i = 0; i < text_len; i++, j++) { diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index 9d0a4d71afef..e0248667e3df 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -387,6 +387,13 @@ TEST(TestStringOps, TestRepeat) { EXPECT_EQ(std::string(out_str, out_len), ""); EXPECT_THAT(ctx.get_error(), ::testing::HasSubstr("Repeat number can't be negative")); ctx.Reset(); + + out_str = repeat_utf8_int32(ctx_ptr, "aa", 2, + std::numeric_limits::max() / 2 + 1, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + EXPECT_THAT(ctx.get_error(), + ::testing::HasSubstr("Would overflow maximum output size")); + ctx.Reset(); } TEST(TestStringOps, TestCastBoolToVarchar) { @@ -1883,10 +1890,6 @@ TEST(TestStringOps, TestBinaryString) { std::string output = std::string(out_str, out_len); EXPECT_EQ(output, "TestString"); - out_str = binary_string(ctx_ptr, "", 0, &out_len); - output = std::string(out_str, out_len); - EXPECT_EQ(output, ""); - out_str = binary_string(ctx_ptr, 
"T", 1, &out_len); output = std::string(out_str, out_len); EXPECT_EQ(output, "T"); @@ -1912,6 +1915,22 @@ TEST(TestStringOps, TestBinaryString) { EXPECT_EQ(output, "OM"); } +TEST(TestStringOps, TestBinaryStringNull) { + // This test is only valid if it is the first to trigger a memory allocation in the + // context. + gandiva::ExecutionContext ctx; + uint64_t ctx_ptr = reinterpret_cast(&ctx); + gdv_int32 out_len = 0; + const char* out_str; + + std::string output; + + out_str = binary_string(ctx_ptr, "", 0, &out_len); + ASSERT_FALSE(ctx.has_error()); + output = std::string(out_str, out_len); + EXPECT_EQ(output, ""); +} + TEST(TestStringOps, TestSplitPart) { gandiva::ExecutionContext ctx; uint64_t ctx_ptr = reinterpret_cast(&ctx); diff --git a/cpp/src/gandiva/precompiled/time.cc b/cpp/src/gandiva/precompiled/time.cc index 8bbd0930991c..e1e9ac44567c 100644 --- a/cpp/src/gandiva/precompiled/time.cc +++ b/cpp/src/gandiva/precompiled/time.cc @@ -566,6 +566,27 @@ bool is_valid_time(const int hours, const int minutes, const int seconds) { seconds < 60; } +// Normalize sub-seconds value to milliseconds precision (3 digits). +// Truncates if more than 3 digits are provided, pads with zeros if fewer than 3 digits +static inline int32_t normalize_subseconds_to_millis(int32_t subseconds, + int32_t num_digits) { + if (num_digits <= 0 || num_digits == 3) { + // No need to adjust + return subseconds; + } + // Calculate the power of 10 adjustment needed + int32_t digit_diff = num_digits - 3; + while (digit_diff > 0) { + subseconds /= 10; + digit_diff--; + } + while (digit_diff < 0) { + subseconds *= 10; + digit_diff++; + } + return subseconds; +} + // MONTHS_BETWEEN returns number of months between dates date1 and date2. // If date1 is later than date2, then the result is positive. // If date1 is earlier than date2, then the result is negative. 
@@ -746,17 +767,8 @@ gdv_timestamp castTIMESTAMP_utf8(int64_t context, const char* input, gdv_int32 l } // adjust the milliseconds - if (sub_seconds_len > 0) { - if (sub_seconds_len > 3) { - const char* msg = "Invalid millis for timestamp value "; - set_error_for_date(length, input, msg, context); - return 0; - } - while (sub_seconds_len < 3) { - ts_fields[TimeFields::kSubSeconds] *= 10; - sub_seconds_len++; - } - } + ts_fields[TimeFields::kSubSeconds] = + normalize_subseconds_to_millis(ts_fields[TimeFields::kSubSeconds], sub_seconds_len); // handle timezone if (encountered_zone) { int err = 0; @@ -866,18 +878,9 @@ gdv_time32 castTIME_utf8(int64_t context, const char* input, int32_t length) { } // adjust the milliseconds - if (sub_seconds_len > 0) { - if (sub_seconds_len > 3) { - const char* msg = "Invalid millis for time value "; - set_error_for_date(length, input, msg, context); - return 0; - } - - while (sub_seconds_len < 3) { - time_fields[TimeFields::kSubSeconds - TimeFields::kHours] *= 10; - sub_seconds_len++; - } - } + time_fields[TimeFields::kSubSeconds - TimeFields::kHours] = + normalize_subseconds_to_millis( + time_fields[TimeFields::kSubSeconds - TimeFields::kHours], sub_seconds_len); int32_t input_hours = time_fields[TimeFields::kHours - TimeFields::kHours]; int32_t input_minutes = time_fields[TimeFields::kMinutes - TimeFields::kHours]; diff --git a/cpp/src/gandiva/precompiled/time_test.cc b/cpp/src/gandiva/precompiled/time_test.cc index 0d3b348754ae..82b38d1b5777 100644 --- a/cpp/src/gandiva/precompiled/time_test.cc +++ b/cpp/src/gandiva/precompiled/time_test.cc @@ -122,15 +122,26 @@ TEST(TestTime, TestCastTimestamp) { "Not a valid time for timestamp value 2000-01-01 00:00:100"); context.Reset(); - EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-01-01 00:00:00.0001", 24), 0); - EXPECT_EQ(context.get_error(), - "Invalid millis for timestamp value 2000-01-01 00:00:00.0001"); - context.Reset(); - - EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-01-01 
00:00:00.1000", 24), 0); - EXPECT_EQ(context.get_error(), - "Invalid millis for timestamp value 2000-01-01 00:00:00.1000"); - context.Reset(); + // Test truncation of subseconds to 3 digits (milliseconds) + // "2000-01-01 00:00:00.0001" should truncate to "2000-01-01 00:00:00.000" + EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-01-01 00:00:00.0001", 24), + castTIMESTAMP_utf8(context_ptr, "2000-01-01 00:00:00.000", 23)); + + // "2000-01-01 00:00:00.1000" should truncate to "2000-01-01 00:00:00.100" + EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-01-01 00:00:00.1000", 24), + castTIMESTAMP_utf8(context_ptr, "2000-01-01 00:00:00.100", 23)); + + // "2000-01-01 00:00:00.123456789" should truncate to "2000-01-01 00:00:00.123" + EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-01-01 00:00:00.123456789", 29), + castTIMESTAMP_utf8(context_ptr, "2000-01-01 00:00:00.123", 23)); + + // "2000-01-01 00:00:00.1999" should truncate to "2000-01-01 00:00:00.199" + EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-01-01 00:00:00.1999", 24), + castTIMESTAMP_utf8(context_ptr, "2000-01-01 00:00:00.199", 23)); + + // "2000-01-01 00:00:00.1994" should truncate to "2000-01-01 00:00:00.199" + EXPECT_EQ(castTIMESTAMP_utf8(context_ptr, "2000-01-01 00:00:00.1994", 24), + castTIMESTAMP_utf8(context_ptr, "2000-01-01 00:00:00.199", 23)); } TEST(TestTime, TestCastTimeUtf8) { @@ -166,13 +177,26 @@ TEST(TestTime, TestCastTimeUtf8) { EXPECT_EQ(context.get_error(), "Not a valid time value 00:00:100"); context.Reset(); - EXPECT_EQ(castTIME_utf8(context_ptr, "00:00:00.0001", 13), 0); - EXPECT_EQ(context.get_error(), "Invalid millis for time value 00:00:00.0001"); - context.Reset(); + // Test truncation of subseconds to 3 digits (milliseconds) + // "00:00:00.0001" should truncate to "00:00:00.000" + EXPECT_EQ(castTIME_utf8(context_ptr, "00:00:00.0001", 13), + castTIME_utf8(context_ptr, "00:00:00.000", 12)); - EXPECT_EQ(castTIME_utf8(context_ptr, "00:00:00.1000", 13), 0); - EXPECT_EQ(context.get_error(), 
"Invalid millis for time value 00:00:00.1000"); - context.Reset(); + // "00:00:00.1000" should truncate to "00:00:00.100" + EXPECT_EQ(castTIME_utf8(context_ptr, "00:00:00.1000", 13), + castTIME_utf8(context_ptr, "00:00:00.100", 12)); + + // "9:45:30.123456789" should truncate to "9:45:30.123" + EXPECT_EQ(castTIME_utf8(context_ptr, "9:45:30.123456789", 17), + castTIME_utf8(context_ptr, "9:45:30.123", 11)); + + // "00:00:00.1999" should truncate to "00:00:00.199" + EXPECT_EQ(castTIME_utf8(context_ptr, "00:00:00.1999", 13), + castTIME_utf8(context_ptr, "00:00:00.199", 12)); + + // "00:00:00.1994" should truncate to "00:00:00.199" + EXPECT_EQ(castTIME_utf8(context_ptr, "00:00:00.1994", 13), + castTIME_utf8(context_ptr, "00:00:00.199", 12)); } #ifndef _WIN32 diff --git a/cpp/src/gandiva/selection_vector.cc b/cpp/src/gandiva/selection_vector.cc index 8d5f9f4210af..0d8ecb66b7ad 100644 --- a/cpp/src/gandiva/selection_vector.cc +++ b/cpp/src/gandiva/selection_vector.cc @@ -17,6 +17,7 @@ #include "gandiva/selection_vector.h" +#include #include #include #include @@ -64,7 +65,7 @@ Status SelectionVector::PopulateFromBitMap(const uint8_t* bitmap, int64_t bitmap # pragma warning(pop) #endif - int pos_in_word = arrow::bit_util::CountTrailingZeros(highest_only); + int pos_in_word = std::countr_zero(highest_only); int64_t pos_in_bitmap = bitmap_idx * 64 + pos_in_word; if (pos_in_bitmap > max_bitmap_index) { diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 0bc5dc06472e..6c1550dcc2f7 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -144,7 +144,7 @@ set_source_files_properties("${PARQUET_THRIFT_SOURCE_DIR}/parquet_types.cpp" if(NOT MSVC) set_source_files_properties(src/parquet/parquet_types.cpp - PROPERTIES COMPILE_FLAGS -Wno-unused-variable) + PROPERTIES COMPILE_OPTIONS -Wno-unused-variable) endif() # @@ -157,7 +157,6 @@ set(PARQUET_SRCS arrow/reader_internal.cc arrow/schema.cc arrow/schema_internal.cc - 
arrow/variant_internal.cc arrow/writer.cc bloom_filter.cc bloom_filter_reader.cc @@ -200,14 +199,12 @@ if(ARROW_HAVE_RUNTIME_AVX2) # violation with -DCMAKE_BUILD_TYPE=MinSizeRel. CMAKE_CXX_FLAGS_RELEASE # will force inlining as much as possible. # See also: ARROW-15664 and ARROW-15678 - # - # TODO: Use COMPILE_OPTIONS instead of COMPILE_FLAGS when we require - # CMake 3.11 or later. - set(AVX2_FLAGS "${ARROW_AVX2_FLAG}") + set(AVX2_FLAGS ${ARROW_AVX2_FLAGS}) if(NOT MSVC) - string(APPEND AVX2_FLAGS " ${CMAKE_CXX_FLAGS_RELEASE}") + separate_arguments(RELEASE_FLAGS NATIVE_COMMAND "${CMAKE_CXX_FLAGS_RELEASE}") + list(APPEND AVX2_FLAGS ${RELEASE_FLAGS}) endif() - set_source_files_properties(level_comparison_avx2.cc PROPERTIES COMPILE_FLAGS + set_source_files_properties(level_comparison_avx2.cc PROPERTIES COMPILE_OPTIONS "${AVX2_FLAGS}") # WARNING: DO NOT BLINDLY COPY THIS CODE FOR OTHER BMI2 USE CASES. # This code is always guarded by runtime dispatch which verifies @@ -218,14 +215,11 @@ if(ARROW_HAVE_RUNTIME_AVX2) # violation with -DCMAKE_BUILD_TYPE=MinSizeRel. CMAKE_CXX_FLAGS_RELEASE # will force inlining as much as possible. # See also: ARROW-15664 and ARROW-15678 - # - # TODO: Use COMPILE_OPTIONS instead of COMPILE_FLAGS when we require - # CMake 3.11 or later. if(ARROW_HAVE_RUNTIME_BMI2) # Need to pass ARROW_HAVE_BMI2 for level_conversion_inc.h to compile # the BMI2 path. 
- set(BMI2_FLAGS "${AVX2_FLAGS} ${ARROW_BMI2_FLAG} -DARROW_HAVE_BMI2") - set_source_files_properties(level_conversion_bmi2.cc PROPERTIES COMPILE_FLAGS + set(BMI2_FLAGS ${AVX2_FLAGS} ${ARROW_BMI2_FLAG} -DARROW_HAVE_BMI2) + set_source_files_properties(level_conversion_bmi2.cc PROPERTIES COMPILE_OPTIONS "${BMI2_FLAGS}") endif() endif() diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index edb59d9de305..e2384972cf55 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -2451,12 +2451,12 @@ TEST(TestArrowReadWrite, ReadSingleRowGroup) { ASSERT_EQ(2, reader->num_row_groups()); - std::shared_ptr
r1, r2, r3, r4; + std::shared_ptr
r2; // Read everything - ASSERT_OK_NO_THROW(reader->ReadRowGroup(0, &r1)); + ASSERT_OK_AND_ASSIGN(auto r1, reader->ReadRowGroup(0)); ASSERT_OK_NO_THROW(reader->RowGroup(1)->ReadTable(&r2)); - ASSERT_OK_NO_THROW(reader->ReadRowGroups({0, 1}, &r3)); - ASSERT_OK_NO_THROW(reader->ReadRowGroups({1}, &r4)); + ASSERT_OK_AND_ASSIGN(auto r3, reader->ReadRowGroups({0, 1})); + ASSERT_OK_AND_ASSIGN(auto r4, reader->ReadRowGroups({1})); std::shared_ptr
concatenated; @@ -4085,7 +4085,7 @@ TEST_F(TestNestedSchemaRead, ReadTablePartial) { ASSERT_NO_FATAL_FAILURE(ValidateTableArrayTypes(*table)); // columns: {group1.leaf1, leaf3} - ASSERT_OK_NO_THROW(reader_->ReadRowGroup(0, {0, 2}, &table)); + ASSERT_OK_AND_ASSIGN(table, reader_->ReadRowGroup(0, {0, 2})); ASSERT_EQ(table->num_rows(), NUM_SIMPLE_TEST_ROWS); ASSERT_EQ(table->num_columns(), 2); ASSERT_EQ(table->schema()->field(0)->name(), "group1"); @@ -5889,5 +5889,34 @@ TEST(TestArrowReadWrite, OperationsOnClosedWriter) { ASSERT_RAISES(Invalid, writer->WriteTable(*table, 1)); } +TEST(TestArrowReadWrite, AllNulls) { + auto schema = ::arrow::schema({::arrow::field("all_nulls", ::arrow::int8())}); + + constexpr int64_t length = 3; + ASSERT_OK_AND_ASSIGN(auto null_bitmap, ::arrow::AllocateEmptyBitmap(length)); + auto array_data = ::arrow::ArrayData::Make( + ::arrow::int8(), length, {null_bitmap, /*values=*/nullptr}, /*null_count=*/length); + auto array = ::arrow::MakeArray(array_data); + auto record_batch = ::arrow::RecordBatch::Make(schema, length, {array}); + + auto sink = CreateOutputStream(); + ASSERT_OK_AND_ASSIGN(auto writer, parquet::arrow::FileWriter::Open( + *schema, ::arrow::default_memory_pool(), sink, + parquet::default_writer_properties(), + parquet::default_arrow_writer_properties())); + ASSERT_OK(writer->WriteRecordBatch(*record_batch)); + ASSERT_OK(writer->Close()); + ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); + + std::shared_ptr<::arrow::Table> read_table; + ASSERT_OK_AND_ASSIGN(auto reader, + parquet::arrow::OpenFile(std::make_shared(buffer), + ::arrow::default_memory_pool())); + ASSERT_OK(reader->ReadTable(&read_table)); + auto expected_table = ::arrow::Table::Make( + schema, {::arrow::ArrayFromJSON(::arrow::int8(), R"([null, null, null])")}); + ASSERT_TRUE(expected_table->Equals(*read_table)); +} + } // namespace arrow } // namespace parquet diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc 
index 73ce8ea69e3c..721244fdbe26 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -25,7 +25,6 @@ #include "parquet/arrow/reader.h" #include "parquet/arrow/reader_internal.h" #include "parquet/arrow/schema.h" -#include "parquet/arrow/variant_internal.h" #include "parquet/file_reader.h" #include "parquet/schema.h" #include "parquet/schema_internal.h" @@ -34,12 +33,14 @@ #include "arrow/array.h" #include "arrow/extension/json.h" +#include "arrow/extension/parquet_variant.h" #include "arrow/extension/uuid.h" #include "arrow/ipc/writer.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" #include "arrow/util/base64.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/key_value_metadata.h" using arrow::Field; @@ -950,7 +951,7 @@ TEST_F(TestConvertParquetSchema, ParquetVariant) { auto arrow_metadata = ::arrow::field("metadata", ::arrow::binary(), /*nullable=*/false); auto arrow_value = ::arrow::field("value", ::arrow::binary(), /*nullable=*/false); auto arrow_variant = ::arrow::struct_({arrow_metadata, arrow_value}); - auto variant_extension = std::make_shared(arrow_variant); + auto variant_extension = ::arrow::extension::variant(arrow_variant); { // Parquet file does not contain Arrow schema. 
@@ -2018,6 +2019,53 @@ TEST_F(TestConvertRoundTrip, FieldIdPreserveAllColumnTypes) { ASSERT_EQ(thrift_field_ids, expected_field_ids); } +TEST_F(TestConvertRoundTrip, MapNestedFieldMetadataPreserved) { + auto key_meta = ::arrow::key_value_metadata({"k"}, {"v"}); + auto inner_meta = ::arrow::key_value_metadata({"inner_k"}, {"inner_v"}); + + auto map_key = ::arrow::field("key", UTF8, /*nullable=*/false, key_meta); + auto map_value = ::arrow::field( + "value", + ::arrow::struct_({::arrow::field("inner", INT64, /*nullable=*/true, inner_meta)}), + /*nullable=*/true, inner_meta); + auto sorted_map = + std::make_shared<::arrow::MapType>(map_key, map_value, /*keys_sorted=*/true); + auto arrow_schema = ::arrow::schema( + {::arrow::field("m", sorted_map, /*nullable=*/true, FieldIdMetadata(99))}); + + std::shared_ptr parquet_schema; + ASSERT_OK(ToParquetSchema(arrow_schema.get(), *::parquet::default_writer_properties(), + &parquet_schema)); + + std::shared_ptr kv_metadata; + ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, kv_metadata)); + + std::shared_ptr<::arrow::Schema> restored_schema; + ASSERT_OK(FromParquetSchema(parquet_schema.get(), ArrowReaderProperties(), kv_metadata, + &restored_schema)); + ASSERT_EQ(restored_schema->num_fields(), 1); + + auto restored_map = ::arrow::internal::checked_pointer_cast<::arrow::MapType>( + restored_schema->field(0)->type()); + ASSERT_EQ(GetFieldId(*restored_schema->field(0)), 99); + + // It's a pity that we cannot directly use AssertTypeEqual on restored_map and + // sorted_map because ::arrow::MapType uses "entries" as the inner field name + // but Parquet uses "key_value" (see MapToNode in parquet/arrow/schema.cc). 
+ ASSERT_TRUE(restored_map->keys_sorted()); + ASSERT_NE(restored_map->key_field()->metadata(), nullptr); + ASSERT_EQ(restored_map->key_field()->metadata()->Get("k").ValueOrDie(), "v"); + + ASSERT_NE(restored_map->item_field()->metadata(), nullptr); + ASSERT_EQ(restored_map->item_field()->metadata()->Get("inner_k").ValueOrDie(), + "inner_v"); + + auto restored_struct = restored_map->item_type(); + ASSERT_NE(restored_struct->field(0)->metadata(), nullptr); + ASSERT_EQ(restored_struct->field(0)->metadata()->Get("inner_k").ValueOrDie(), + "inner_v"); +} + TEST(InvalidSchema, ParquetNegativeDecimalScale) { const auto& type = ::arrow::decimal128(23, -2); const auto& field = ::arrow::field("f0", type); diff --git a/cpp/src/parquet/arrow/fuzz_internal.cc b/cpp/src/parquet/arrow/fuzz_internal.cc index 7c4539bf518b..8618a85fcca1 100644 --- a/cpp/src/parquet/arrow/fuzz_internal.cc +++ b/cpp/src/parquet/arrow/fuzz_internal.cc @@ -98,16 +98,16 @@ namespace { Status FuzzReadData(std::unique_ptr reader) { auto final_status = Status::OK(); for (int i = 0; i < reader->num_row_groups(); ++i) { - std::shared_ptr
table; - auto row_group_status = reader->ReadRowGroup(i, &table); - if (row_group_status.ok()) { + auto table_result = reader->ReadRowGroup(i); + if (table_result.ok()) { // When reading returns successfully, the Arrow data should be structurally // valid so that it can be read normally. If that is not the case, abort // so that the error can be published by OSS-Fuzz. + auto table = *table_result; ARROW_CHECK_OK(table->Validate()); - row_group_status &= table->ValidateFull(); + final_status &= table->ValidateFull(); } - final_status &= row_group_status; + final_status &= table_result.status(); } return final_status; } diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 434430a875e4..a77323d29fad 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -204,10 +204,7 @@ class FileReaderImpl : public FileReader { Result> ReadTable( const std::vector& column_indices) override { - std::shared_ptr
table; - RETURN_NOT_OK(ReadRowGroups(Iota(reader_->metadata()->num_row_groups()), - column_indices, &table)); - return table; + return ReadRowGroups(Iota(reader_->metadata()->num_row_groups()), column_indices); } Status GetFieldReader(int i, @@ -312,9 +309,8 @@ class FileReaderImpl : public FileReader { return ReadTable(Iota(reader_->metadata()->num_columns())); } - Status ReadRowGroups(const std::vector& row_groups, - const std::vector& indices, - std::shared_ptr
* table) override; + Result> ReadRowGroups(const std::vector& row_groups, + const std::vector& indices) override; // Helper method used by ReadRowGroups - read the given row groups/columns, skipping // bounds checks and pre-buffering. Takes a shared_ptr to self to keep the reader @@ -323,18 +319,18 @@ class FileReaderImpl : public FileReader { std::shared_ptr self, const std::vector& row_groups, const std::vector& column_indices, ::arrow::internal::Executor* cpu_executor); - Status ReadRowGroups(const std::vector& row_groups, - std::shared_ptr
* table) override { - return ReadRowGroups(row_groups, Iota(reader_->metadata()->num_columns()), table); + Result> ReadRowGroups( + const std::vector& row_groups) override { + return ReadRowGroups(row_groups, Iota(reader_->metadata()->num_columns())); } - Status ReadRowGroup(int row_group_index, const std::vector& column_indices, - std::shared_ptr
* out) override { - return ReadRowGroups({row_group_index}, column_indices, out); + Result> ReadRowGroup( + int row_group_index, const std::vector& column_indices) override { + return ReadRowGroups({row_group_index}, column_indices); } - Status ReadRowGroup(int i, std::shared_ptr
* table) override { - return ReadRowGroup(i, Iota(reader_->metadata()->num_columns()), table); + Result> ReadRowGroup(int i) override { + return ReadRowGroup(i, Iota(reader_->metadata()->num_columns())); } Result> GetRecordBatchReader( @@ -437,11 +433,13 @@ class RowGroupReaderImpl : public RowGroupReader { Status ReadTable(const std::vector& column_indices, std::shared_ptr<::arrow::Table>* out) override { - return impl_->ReadRowGroup(row_group_index_, column_indices, out); + ARROW_ASSIGN_OR_RAISE(*out, impl_->ReadRowGroup(row_group_index_, column_indices)); + return Status::OK(); } Status ReadTable(std::shared_ptr<::arrow::Table>* out) override { - return impl_->ReadRowGroup(row_group_index_, out); + ARROW_ASSIGN_OR_RAISE(*out, impl_->ReadRowGroup(row_group_index_)); + return Status::OK(); } private: @@ -1254,9 +1252,8 @@ Status FileReaderImpl::GetColumn(int i, FileColumnIteratorFactory iterator_facto return Status::OK(); } -Status FileReaderImpl::ReadRowGroups(const std::vector& row_groups, - const std::vector& column_indices, - std::shared_ptr
* out) { +Result> FileReaderImpl::ReadRowGroups( + const std::vector& row_groups, const std::vector& column_indices) { RETURN_NOT_OK(BoundsCheck(row_groups, column_indices)); // PARQUET-1698/PARQUET-1820: pre-buffer row groups/column chunks if enabled @@ -1270,8 +1267,7 @@ Status FileReaderImpl::ReadRowGroups(const std::vector& row_groups, auto fut = DecodeRowGroups(/*self=*/nullptr, row_groups, column_indices, /*cpu_executor=*/nullptr); - ARROW_ASSIGN_OR_RAISE(*out, fut.MoveResult()); - return Status::OK(); + return fut.MoveResult(); } Future> FileReaderImpl::DecodeRowGroups( @@ -1353,6 +1349,30 @@ Status FileReader::ReadTable(const std::vector& column_indices, return Status::OK(); } +Status FileReader::ReadRowGroup(int i, const std::vector& column_indices, + std::shared_ptr
* out) { + ARROW_ASSIGN_OR_RAISE(*out, ReadRowGroup(i, column_indices)); + return Status::OK(); +} + +Status FileReader::ReadRowGroup(int i, std::shared_ptr
* out) { + ARROW_ASSIGN_OR_RAISE(*out, ReadRowGroup(i)); + return Status::OK(); +} + +Status FileReader::ReadRowGroups(const std::vector& row_groups, + const std::vector& column_indices, + std::shared_ptr
* out) { + ARROW_ASSIGN_OR_RAISE(*out, ReadRowGroups(row_groups, column_indices)); + return Status::OK(); +} + +Status FileReader::ReadRowGroups(const std::vector& row_groups, + std::shared_ptr
* out) { + ARROW_ASSIGN_OR_RAISE(*out, ReadRowGroups(row_groups)); + return Status::OK(); +} + Status FileReader::Make(::arrow::MemoryPool* pool, std::unique_ptr reader, const ArrowReaderProperties& properties, diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index d0665ea3106d..642546335f16 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -266,17 +266,40 @@ class PARQUET_EXPORT FileReader { ::arrow::Status ReadTable(const std::vector& column_indices, std::shared_ptr<::arrow::Table>* out); - virtual ::arrow::Status ReadRowGroup(int i, const std::vector& column_indices, - std::shared_ptr<::arrow::Table>* out) = 0; + /// \brief Read the given row group columns into a Table + virtual ::arrow::Result> ReadRowGroup( + int i, const std::vector& column_indices) = 0; - virtual ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out) = 0; + /// \brief Read the given row group into a Table + virtual ::arrow::Result> ReadRowGroup(int i) = 0; - virtual ::arrow::Status ReadRowGroups(const std::vector& row_groups, - const std::vector& column_indices, - std::shared_ptr<::arrow::Table>* out) = 0; + /// \brief Read the given row groups columns into a Table + virtual ::arrow::Result> ReadRowGroups( + const std::vector& row_groups, const std::vector& column_indices) = 0; - virtual ::arrow::Status ReadRowGroups(const std::vector& row_groups, - std::shared_ptr<::arrow::Table>* out) = 0; + /// \brief Read the given row groups into a Table + virtual ::arrow::Result> ReadRowGroups( + const std::vector& row_groups) = 0; + + /// \deprecated Deprecated in 24.0.0. Use arrow::Result version instead. + ARROW_DEPRECATED("Deprecated in 24.0.0. Use arrow::Result version instead.") + ::arrow::Status ReadRowGroup(int i, const std::vector& column_indices, + std::shared_ptr<::arrow::Table>* out); + + /// \deprecated Deprecated in 24.0.0. Use arrow::Result version instead. + ARROW_DEPRECATED("Deprecated in 24.0.0. 
Use arrow::Result version instead.") + ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out); + + /// \deprecated Deprecated in 24.0.0. Use arrow::Result version instead. + ARROW_DEPRECATED("Deprecated in 24.0.0. Use arrow::Result version instead.") + ::arrow::Status ReadRowGroups(const std::vector& row_groups, + const std::vector& column_indices, + std::shared_ptr<::arrow::Table>* out); + + /// \deprecated Deprecated in 24.0.0. Use arrow::Result version instead. + ARROW_DEPRECATED("Deprecated in 24.0.0. Use arrow::Result version instead.") + ::arrow::Status ReadRowGroups(const std::vector& row_groups, + std::shared_ptr<::arrow::Table>* out); /// \brief Scan file contents with one thread, return number of rows virtual ::arrow::Status ScanContents(std::vector columns, diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc index 7523a781d891..2f288fd2eb0f 100644 --- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc +++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc @@ -777,8 +777,7 @@ static void BM_ReadMultipleRowGroups(::benchmark::State& state) { EXIT_NOT_OK(arrow_reader_result.status()); auto arrow_reader = std::move(*arrow_reader_result); - std::shared_ptr
table; - EXIT_NOT_OK(arrow_reader->ReadRowGroups(rgs, &table)); + PARQUET_ASSIGN_OR_THROW(auto table, arrow_reader->ReadRowGroups(rgs)); } SetBytesProcessed(state); } diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 266215a8104e..ed30661f9b4e 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -22,6 +22,7 @@ #include #include "arrow/extension/json.h" +#include "arrow/extension/parquet_variant.h" #include "arrow/extension/uuid.h" #include "arrow/extension_type.h" #include "arrow/io/memory.h" @@ -36,7 +37,6 @@ #include "arrow/util/value_parsing.h" #include "parquet/arrow/schema_internal.h" -#include "parquet/arrow/variant_internal.h" #include "parquet/exception.h" #include "parquet/geospatial/util_json_internal.h" #include "parquet/metadata.h" @@ -129,10 +129,11 @@ Status MapToNode(const std::shared_ptr<::arrow::MapType>& type, const std::strin return Status::OK(); } -Status VariantToNode(const std::shared_ptr& type, - const std::string& name, bool nullable, int field_id, - const WriterProperties& properties, - const ArrowWriterProperties& arrow_properties, NodePtr* out) { +Status VariantToNode( + const std::shared_ptr<::arrow::extension::VariantExtensionType>& type, + const std::string& name, bool nullable, int field_id, + const WriterProperties& properties, const ArrowWriterProperties& arrow_properties, + NodePtr* out) { NodePtr metadata_node; RETURN_NOT_OK(FieldToNode("metadata", type->metadata(), properties, arrow_properties, &metadata_node)); @@ -485,8 +486,10 @@ Status FieldToNode(const std::string& name, const std::shared_ptr& field, ARROW_ASSIGN_OR_RAISE(logical_type, LogicalTypeFromGeoArrowMetadata(ext_type->Serialize())); break; - } else if (ext_type->extension_name() == std::string("parquet.variant")) { - auto variant_type = std::static_pointer_cast(field->type()); + } else if (ext_type->extension_name() == std::string("arrow.parquet.variant")) { + auto variant_type = + 
std::static_pointer_cast<::arrow::extension::VariantExtensionType>( + field->type()); return VariantToNode(variant_type, name, field->nullable(), field_id, properties, arrow_properties, out); @@ -597,7 +600,7 @@ Status GroupToStruct(const GroupNode& node, LevelInfo current_levels, auto struct_type = ::arrow::struct_(arrow_fields); if (ctx->properties.get_arrow_extensions_enabled() && node.logical_type()->is_variant()) { - auto extension_type = ::arrow::GetExtensionType("parquet.variant"); + auto extension_type = ::arrow::GetExtensionType("arrow.parquet.variant"); if (extension_type) { ARROW_ASSIGN_OR_RAISE( struct_type, @@ -988,6 +991,16 @@ std::function(FieldVector)> GetNestedFactory( }; } break; + case ::arrow::Type::MAP: + if (origin_type.id() == ::arrow::Type::MAP) { + const bool keys_sorted = + checked_cast(origin_type).keys_sorted(); + return [keys_sorted](FieldVector fields) { + DCHECK_EQ(fields.size(), 1); + return std::make_shared<::arrow::MapType>(std::move(fields[0]), keys_sorted); + }; + } + break; default: break; } @@ -1147,10 +1160,10 @@ Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer extension_supports_inferred_storage = arrow_extension_inferred || ::arrow::extension::UuidType::IsSupportedStorageType(inferred_type); - } else if (origin_extension_name == "parquet.variant") { + } else if (origin_extension_name == "arrow.parquet.variant") { extension_supports_inferred_storage = arrow_extension_inferred || - VariantExtensionType::IsSupportedStorageType(inferred_type); + ::arrow::extension::VariantExtensionType::IsSupportedStorageType(inferred_type); } else { extension_supports_inferred_storage = origin_extension_type.storage_type()->Equals(*inferred_type); diff --git a/cpp/src/parquet/arrow/variant_test.cc b/cpp/src/parquet/arrow/variant_test.cc index caf63d8e3d72..04f46d2e444d 100644 --- a/cpp/src/parquet/arrow/variant_test.cc +++ b/cpp/src/parquet/arrow/variant_test.cc @@ -15,9 +15,8 @@ // specific language governing 
permissions and limitations // under the License. -#include "parquet/arrow/variant_internal.h" - #include "arrow/array/validate.h" +#include "arrow/extension/parquet_variant.h" #include "arrow/ipc/test_common.h" #include "arrow/record_batch.h" #include "arrow/testing/gtest_util.h" @@ -29,16 +28,20 @@ using ::arrow::binary; using ::arrow::struct_; TEST(TestVariantExtensionType, StorageTypeValidation) { - auto variant1 = variant(struct_({field("metadata", binary(), /*nullable=*/false), - field("value", binary(), /*nullable=*/false)})); - auto variant2 = variant(struct_({field("metadata", binary(), /*nullable=*/false), - field("value", binary(), /*nullable=*/false)})); + auto variant1 = ::arrow::extension::variant( + struct_({field("metadata", binary(), /*nullable=*/false), + field("value", binary(), /*nullable=*/false)})); + auto variant2 = ::arrow::extension::variant( + struct_({field("metadata", binary(), /*nullable=*/false), + field("value", binary(), /*nullable=*/false)})); ASSERT_TRUE(variant1->Equals(variant2)); // Metadata and value fields can be provided in either order - auto variantFieldsFlipped = std::dynamic_pointer_cast( - variant(struct_({field("value", binary(), /*nullable=*/false), + auto variantFieldsFlipped = + std::dynamic_pointer_cast<::arrow::extension::VariantExtensionType>( + ::arrow::extension::variant( + struct_({field("value", binary(), /*nullable=*/false), field("metadata", binary(), /*nullable=*/false)}))); ASSERT_EQ("metadata", variantFieldsFlipped->metadata()->name()); @@ -62,7 +65,7 @@ TEST(TestVariantExtensionType, StorageTypeValidation) { Invalid, "Invalid: Invalid storage type for VariantExtensionType: " + storage_type->ToString(), - VariantExtensionType::Make(storage_type)); + ::arrow::extension::VariantExtensionType::Make(storage_type)); } } diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index cc0a386f4c11..5cd31f8334c8 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ 
b/cpp/src/parquet/chunker_internal.cc @@ -17,6 +17,7 @@ #include "parquet/chunker_internal.h" +#include #include #include #include @@ -85,7 +86,8 @@ uint64_t CalculateMask(int64_t min_chunk_size, int64_t max_chunk_size, int norm_ // assuming that the gear hash has a uniform distribution, we can calculate the mask // by taking the floor(log2(target_size)) - int mask_bits = std::max(0, ::arrow::bit_util::NumRequiredBits(target_size) - 1); + auto target_bits = std::bit_width(static_cast(target_size)); + int mask_bits = target_bits == 0 ? 0 : static_cast(target_bits - 1); // a user defined `norm_level` can be used to adjust the mask size, hence the matching // probability, by increasing the norm_level we increase the probability of matching diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 20b8cc98cac2..797d435e73e8 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2099,7 +2099,12 @@ Status TypedColumnWriterImpl::WriteArrowSerialize( PARQUET_THROW_NOT_OK(ctx->GetScratchData(array.length(), &buffer)); SerializeFunctor functor; - RETURN_NOT_OK(functor.Serialize(checked_cast(array), ctx, buffer)); + // The value buffer could be empty if all values are nulls. + // The output buffer will then remain uninitialized, but that's ok since + // null value slots are not written in Parquet. 
+ if (array.null_count() != array.length()) { + RETURN_NOT_OK(functor.Serialize(checked_cast(array), ctx, buffer)); + } bool no_nulls = this->descr()->schema_node()->is_required() || (array.null_count() == 0); if (!maybe_parent_nulls && no_nulls) { diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index 3ce2323d29a1..5d32d39e5f46 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -1000,8 +1000,9 @@ class DictDecoderImpl : public TypedDecoderImpl, public DictDecoder inline void DecodeDict(TypedDecoder* dictionary) { dictionary_length_ = static_cast(dictionary->values_left()); - PARQUET_THROW_NOT_OK(dictionary_->Resize(dictionary_length_ * sizeof(T), - /*shrink_to_fit=*/false)); + PARQUET_THROW_NOT_OK( + dictionary_->Resize(static_cast(dictionary_length_) * sizeof(T), + /*shrink_to_fit=*/false)); dictionary->Decode(dictionary_->mutable_data_as(), dictionary_length_); } @@ -1044,15 +1045,15 @@ void DictDecoderImpl::SetDict(TypedDecoder* dictio auto* dict_values = dictionary_->mutable_data_as(); - int total_size = 0; + int64_t total_size = 0; for (int i = 0; i < dictionary_length_; ++i) { total_size += dict_values[i].len; } PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, /*shrink_to_fit=*/false)); - PARQUET_THROW_NOT_OK( - byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t), - /*shrink_to_fit=*/false)); + PARQUET_THROW_NOT_OK(byte_array_offsets_->Resize( + (static_cast(dictionary_length_) + 1) * sizeof(int32_t), + /*shrink_to_fit=*/false)); int32_t offset = 0; uint8_t* bytes_data = byte_array_data_->mutable_data(); @@ -1073,7 +1074,7 @@ inline void DictDecoderImpl::SetDict(TypedDecoder* dictionar auto* dict_values = dictionary_->mutable_data_as(); int fixed_len = this->type_length_; - int total_size = dictionary_length_ * fixed_len; + int64_t total_size = static_cast(dictionary_length_) * fixed_len; PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, /*shrink_to_fit=*/false)); diff --git 
a/cpp/src/parquet/encoder.cc b/cpp/src/parquet/encoder.cc index 0e8c0ba32b63..97a5d77d4184 100644 --- a/cpp/src/parquet/encoder.cc +++ b/cpp/src/parquet/encoder.cc @@ -18,6 +18,7 @@ #include "parquet/encoding.h" #include +#include #include #include #include @@ -1164,8 +1165,8 @@ void DeltaBitPackEncoder::FlushBlock() { // The minimum number of bits required to write any of values in deltas_ vector. // See overflow comment above. - const auto bit_width = bit_width_data[i] = bit_util::NumRequiredBits( - static_cast(max_delta) - static_cast(min_delta)); + const auto bit_width = bit_width_data[i] = + std::bit_width(static_cast(max_delta) - static_cast(min_delta)); for (uint32_t j = start; j < start + values_current_mini_block; j++) { // Convert delta to frame of reference. See overflow comment above. diff --git a/cpp/src/parquet/level_conversion_inc.h b/cpp/src/parquet/level_conversion_inc.h index 335f5b92154b..33b4fae08494 100644 --- a/cpp/src/parquet/level_conversion_inc.h +++ b/cpp/src/parquet/level_conversion_inc.h @@ -19,6 +19,7 @@ #include "parquet/level_conversion.h" #include +#include #include #include @@ -248,7 +249,7 @@ inline uint64_t ExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) { int bit_len = 0; constexpr uint8_t kLookupMask = (1U << kLookupBits) - 1; while (select_bitmap != 0) { - const auto mask_len = ARROW_POPCOUNT32(select_bitmap & kLookupMask); + const auto mask_len = std::popcount(select_bitmap & kLookupMask); const uint64_t value = kPextTable[select_bitmap & kLookupMask][bitmap & kLookupMask]; bit_value |= (value << bit_len); bit_len += mask_len; @@ -309,12 +310,12 @@ int64_t DefLevelsBatchToBitmap(const int16_t* def_levels, const int64_t batch_si ::arrow::bit_util::FromLittleEndian(internal::GreaterThanBitmap( def_levels, batch_size, level_info.repeated_ancestor_def_level - 1))); auto selected_bits = ExtractBits(defined_bitmap, present_bitmap); - int64_t selected_count = ::arrow::bit_util::PopCount(present_bitmap); + int64_t 
selected_count = std::popcount(present_bitmap); if (ARROW_PREDICT_FALSE(selected_count > upper_bound_remaining)) { throw ParquetException("Values read exceeded upper bound"); } writer->AppendWord(selected_bits, selected_count); - return ::arrow::bit_util::PopCount(selected_bits); + return std::popcount(selected_bits); } else { if (ARROW_PREDICT_FALSE(batch_size > upper_bound_remaining)) { std::stringstream ss; @@ -323,7 +324,7 @@ int64_t DefLevelsBatchToBitmap(const int16_t* def_levels, const int64_t batch_si } writer->AppendWord(defined_bitmap, batch_size); - return ::arrow::bit_util::PopCount(defined_bitmap); + return std::popcount(defined_bitmap); } } diff --git a/cpp/src/parquet/meson.build b/cpp/src/parquet/meson.build index b334bf916e1a..9069ccb5fd1a 100644 --- a/cpp/src/parquet/meson.build +++ b/cpp/src/parquet/meson.build @@ -23,7 +23,6 @@ parquet_srcs = files( 'arrow/reader_internal.cc', 'arrow/schema.cc', 'arrow/schema_internal.cc', - 'arrow/variant_internal.cc', 'arrow/writer.cc', 'bloom_filter.cc', 'bloom_filter_reader.cc', diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 442cde2c9c0b..d94bf652ee86 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -114,6 +114,9 @@ ARROW_THRIFT_BUILD_VERSION=0.22.0 ARROW_THRIFT_BUILD_SHA256_CHECKSUM=794a0e455787960d9f27ab92c38e34da27e8deeda7a5db0e59dc64a00df8a1e5 ARROW_UTF8PROC_BUILD_VERSION=v2.10.0 ARROW_UTF8PROC_BUILD_SHA256_CHECKSUM=6f4f1b639daa6dca9f80bc5db1233e9cbaa31a67790887106160b33ef743f136 +# WIL (Windows Implementation Libraries) is required by Azure SDK on Windows for WinHTTP transport +ARROW_WIL_BUILD_VERSION=v1.0.250325.1 +ARROW_WIL_BUILD_SHA256_CHECKSUM=c9e667d5f86ded43d17b5669d243e95ca7b437e3a167c170805ffd4aa8a9a786 ARROW_XSIMD_BUILD_VERSION=14.0.0 ARROW_XSIMD_BUILD_SHA256_CHECKSUM=17de0236954955c10c09d6938d4c5f3a3b92d31be5dadd1d5d09fc1b15490dce ARROW_ZLIB_BUILD_VERSION=1.3.1 @@ -142,6 +145,7 @@ DEPENDENCIES=( "ARROW_AWS_CRT_CPP_URL 
aws-crt-cpp-${ARROW_AWS_CRT_CPP_BUILD_VERSION}.tar.gz https://github.com/awslabs/aws-crt-cpp/archive/${ARROW_AWS_CRT_CPP_BUILD_VERSION}.tar.gz" "ARROW_AWS_LC_URL aws-lc-${ARROW_AWS_LC_BUILD_VERSION}.tar.gz https://github.com/awslabs/aws-lc/archive/${ARROW_AWS_LC_BUILD_VERSION}.tar.gz" "ARROW_AWSSDK_URL aws-sdk-cpp-${ARROW_AWSSDK_BUILD_VERSION}.tar.gz https://github.com/aws/aws-sdk-cpp/archive/${ARROW_AWSSDK_BUILD_VERSION}.tar.gz" + "ARROW_AZURE_SDK_URL azure-sdk-for-cpp-${ARROW_AZURE_SDK_BUILD_VERSION}.tar.gz https://github.com/Azure/azure-sdk-for-cpp/archive/${ARROW_AZURE_SDK_BUILD_VERSION}.tar.gz" "ARROW_BOOST_URL boost-${ARROW_BOOST_BUILD_VERSION}-cmake.tar.gz https://github.com/boostorg/boost/releases/download/boost-${ARROW_BOOST_BUILD_VERSION}/boost-${ARROW_BOOST_BUILD_VERSION}-cmake.tar.gz" "ARROW_BROTLI_URL brotli-${ARROW_BROTLI_BUILD_VERSION}.tar.gz https://github.com/google/brotli/archive/${ARROW_BROTLI_BUILD_VERSION}.tar.gz" "ARROW_BZIP2_URL bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz https://sourceware.org/pub/bzip2/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" @@ -168,6 +172,7 @@ DEPENDENCIES=( "ARROW_SUBSTRAIT_URL substrait-${ARROW_SUBSTRAIT_BUILD_VERSION}.tar.gz https://github.com/substrait-io/substrait/archive/${ARROW_SUBSTRAIT_BUILD_VERSION}.tar.gz" "ARROW_THRIFT_URL thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz https://www.apache.org/dyn/closer.lua/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz?action=download" "ARROW_UTF8PROC_URL utf8proc-${ARROW_UTF8PROC_BUILD_VERSION}.tar.gz https://github.com/JuliaStrings/utf8proc/archive/${ARROW_UTF8PROC_BUILD_VERSION}.tar.gz" + "ARROW_WIL_URL wil-${ARROW_WIL_BUILD_VERSION}.tar.gz https://github.com/microsoft/wil/archive/refs/tags/${ARROW_WIL_BUILD_VERSION}.tar.gz" "ARROW_XSIMD_URL xsimd-${ARROW_XSIMD_BUILD_VERSION}.tar.gz 
https://github.com/xtensor-stack/xsimd/archive/${ARROW_XSIMD_BUILD_VERSION}.tar.gz" "ARROW_ZLIB_URL zlib-${ARROW_ZLIB_BUILD_VERSION}.tar.gz https://zlib.net/fossils/zlib-${ARROW_ZLIB_BUILD_VERSION}.tar.gz" "ARROW_ZSTD_URL zstd-${ARROW_ZSTD_BUILD_VERSION}.tar.gz https://github.com/facebook/zstd/releases/download/v${ARROW_ZSTD_BUILD_VERSION}/zstd-${ARROW_ZSTD_BUILD_VERSION}.tar.gz" diff --git a/cpp/vcpkg.json b/cpp/vcpkg.json index d854fc339d62..ba3c8e1851b0 100644 --- a/cpp/vcpkg.json +++ b/cpp/vcpkg.json @@ -41,12 +41,7 @@ ] }, "grpc", - { - "name": "gtest", - "features": [ - "cxx17" - ] - }, + "gtest", "lz4", "openssl", "orc", @@ -62,5 +57,5 @@ "zstd" ], "$comment": "We can update builtin-baseline by 'vcpkg x-update-baseline'", - "builtin-baseline": "09f6a4ef2f08252f7f4d924fd9c2d42165fb21c9" + "builtin-baseline": "40c89449f0ccce12d21f8a906639f6c2c649b9e7" } diff --git a/dev/conbench_envs/hooks.sh b/dev/conbench_envs/hooks.sh index 60a482adfcf5..5cf75a5c7342 100755 --- a/dev/conbench_envs/hooks.sh +++ b/dev/conbench_envs/hooks.sh @@ -59,6 +59,25 @@ build_arrow_python() { build_arrow_r() { cat ci/etc/rprofile >> $(R RHOME)/etc/Rprofile.site + + # Ensure CXX20 is configured in R's Makeconf. + # conda-forge's R may have empty CXX20 entries even though the compiler supports it. + # Arrow requires C++20, so we need to add these settings if missing. 
+ MAKECONF="$(R RHOME)/etc/Makeconf" + if [ -z "$(R CMD config CXX20)" ]; then + echo "*** CXX20 not configured in R, adding it to Makeconf" + cat >> "$MAKECONF" << 'EOF' + +# Added for Arrow C++20 support +CXX20 = g++ +CXX20FLAGS = -g -O2 $(LTO) +CXX20PICFLAGS = -fpic +CXX20STD = -std=gnu++20 +SHLIB_CXX20LD = $(CXX20) $(CXX20STD) +SHLIB_CXX20LDFLAGS = -shared +EOF + fi + ci/scripts/r_deps.sh $(pwd) $(pwd) (cd r; R CMD INSTALL .;) } diff --git a/dev/release/02-source-test.rb b/dev/release/02-source-test.rb index fe2c7b775912..5bd7c717709f 100644 --- a/dev/release/02-source-test.rb +++ b/dev/release/02-source-test.rb @@ -44,8 +44,12 @@ def source(*targets) env["SOURCE_#{target}"] = "1" end sh(env, @tarball_script, @release_version, "0") + FileUtils.mkdir_p("artifacts") + sh("mv", @archive_name, "artifacts/") + File.write("artifacts/#{@archive_name}.sha512", + sh(env, "shasum", "-a", "512", "artifacts/#{@archive_name}")) output = sh(env, @script, @release_version, "0") - sh("tar", "xf", @archive_name) + sh("tar", "xf", "artifacts/#{@archive_name}") output end @@ -106,6 +110,7 @@ def test_vote verify_pr_url = (JSON.parse(response.read)[0] || {})["html_url"] end output = source("VOTE") + tarball_hash = Digest::SHA512.file("artifacts/#{@archive_name}").to_s assert_equal(<<-VOTE.strip, output[/^-+$(.+?)^-+$/m, 1].strip) To: dev@arrow.apache.org Subject: [VOTE] Release Apache Arrow #{@release_version} - RC0 @@ -124,9 +129,10 @@ def test_vote The changelog is located at [10]. Please download, verify checksums and signatures, run the unit tests, -and vote on the release. See [11] for how to validate a release candidate. +and vote on the release. See [11] for the SHA-512 checksum for this RC and [12] +for how to validate a release candidate. -See also a verification result on GitHub pull request [12]. +See also a verification result on GitHub pull request [13]. The vote will be open for at least 72 hours. 
@@ -144,8 +150,9 @@ def test_vote [8]: https://packages.apache.org/artifactory/arrow/ubuntu-rc/ [9]: https://github.com/apache/arrow/releases/tag/apache-arrow-#{@release_version}-rc0 [10]: https://github.com/apache/arrow/blob/#{@current_commit}/CHANGELOG.md -[11]: https://arrow.apache.org/docs/developers/release_verification.html -[12]: #{verify_pr_url || "null"} +[11]: #{tarball_hash} +[12]: https://arrow.apache.org/docs/developers/release_verification.html +[13]: #{verify_pr_url || "null"} VOTE end end diff --git a/dev/release/02-source.sh b/dev/release/02-source.sh index 5f813eb80bc0..a99e529065e6 100755 --- a/dev/release/02-source.sh +++ b/dev/release/02-source.sh @@ -130,6 +130,10 @@ if [ ${SOURCE_VOTE} -gt 0 ]; then curl_options+=(--data "head=apache:${rc_branch}") curl_options+=(https://api.github.com/repos/${GITHUB_REPOSITORY}/pulls) verify_pr_url=$(curl "${curl_options[@]}" | jq -r ".[0].html_url") + # Read the checksum so we can include it in the vote thread email. + sha512_path="artifacts/${tarball}.sha512" + [[ -f "${sha512_path}" ]] || { echo "Error: ${sha512_path} must exist"; exit 1; } + tarball_hash=$(cat "${sha512_path}" | awk '{print $1}') echo "The following draft email has been created to send to the" echo "dev@arrow.apache.org mailing list" @@ -153,9 +157,10 @@ The binary artifacts are hosted at [4][5][6][7][8][9]. The changelog is located at [10]. Please download, verify checksums and signatures, run the unit tests, -and vote on the release. See [11] for how to validate a release candidate. +and vote on the release. See [11] for the SHA-512 checksum for this RC and [12] +for how to validate a release candidate. -See also a verification result on GitHub pull request [12]. +See also a verification result on GitHub pull request [13]. The vote will be open for at least 72 hours. @@ -173,8 +178,9 @@ The vote will be open for at least 72 hours. 
[8]: https://packages.apache.org/artifactory/arrow/ubuntu-rc/ [9]: https://github.com/apache/arrow/releases/tag/apache-arrow-${version}-rc${rc} [10]: https://github.com/apache/arrow/blob/${release_hash}/CHANGELOG.md -[11]: https://arrow.apache.org/docs/developers/release_verification.html -[12]: ${verify_pr_url} +[11]: ${tarball_hash} +[12]: https://arrow.apache.org/docs/developers/release_verification.html +[13]: ${verify_pr_url} MAIL echo "---------------------------------------------------------" fi diff --git a/dev/release/04-binary-download.sh b/dev/release/04-binary-download.sh index 68e1664b5997..210a9406c2d0 100755 --- a/dev/release/04-binary-download.sh +++ b/dev/release/04-binary-download.sh @@ -46,7 +46,7 @@ tag="apache-arrow-${version_with_rc}" archery crossbow download-artifacts --no-fetch ${CROSSBOW_JOB_ID} "$@" -# Download Linux packages. +# Download Linux packages and ODBC MSI. gh release download "${tag}" \ --dir "packages/${CROSSBOW_JOB_ID}" \ --pattern "almalinux-*.tar.gz" \ @@ -54,5 +54,6 @@ gh release download "${tag}" \ --pattern "centos-*.tar.gz" \ --pattern "debian-*.tar.gz" \ --pattern "ubuntu-*.tar.gz" \ + --pattern "Apache-Arrow-Flight-SQL-ODBC-*-win64.msi" \ --repo "${REPOSITORY:-apache/arrow}" \ --skip-existing diff --git a/dev/release/05-binary-upload.sh b/dev/release/05-binary-upload.sh index f628cce0e0bf..45793dd6ec5e 100755 --- a/dev/release/05-binary-upload.sh +++ b/dev/release/05-binary-upload.sh @@ -67,6 +67,7 @@ cd "${SOURCE_DIR}" : "${UPLOAD_CENTOS:=${UPLOAD_DEFAULT}}" : "${UPLOAD_DEBIAN:=${UPLOAD_DEFAULT}}" : "${UPLOAD_DOCS:=${UPLOAD_DEFAULT}}" +: "${UPLOAD_ODBC:=${UPLOAD_DEFAULT}}" : "${UPLOAD_PYTHON:=${UPLOAD_DEFAULT}}" : "${UPLOAD_R:=${UPLOAD_DEFAULT}}" : "${UPLOAD_UBUNTU:=${UPLOAD_DEFAULT}}" @@ -108,6 +109,10 @@ upload_to_github_release() { if [ "${UPLOAD_DOCS}" -gt 0 ]; then upload_to_github_release docs "${ARROW_ARTIFACTS_DIR}"/*-docs/* fi +if [ "${UPLOAD_ODBC}" -gt 0 ]; then + 
upload_to_github_release odbc \ + "${ARROW_ARTIFACTS_DIR}"/Apache-Arrow-Flight-SQL-ODBC-*-win64.msi +fi if [ "${UPLOAD_PYTHON}" -gt 0 ]; then upload_to_github_release python \ "${ARROW_ARTIFACTS_DIR}"/{python-sdist,wheel-*}/* diff --git a/dev/release/setup-ubuntu.sh b/dev/release/setup-ubuntu.sh index 6951226bd765..ac274694d58b 100755 --- a/dev/release/setup-ubuntu.sh +++ b/dev/release/setup-ubuntu.sh @@ -45,7 +45,6 @@ apt-get install -y -q --no-install-recommends \ libglib2.0-dev \ libsqlite3-dev \ libssl-dev \ - libxsimd-dev \ llvm-dev \ ninja-build \ nlohmann-json3-dev \ diff --git a/dev/release/test-helper.rb b/dev/release/test-helper.rb index 45c0065ba1f6..f25d60276475 100644 --- a/dev/release/test-helper.rb +++ b/dev/release/test-helper.rb @@ -17,6 +17,7 @@ require "English" require "cgi/util" +require 'digest' require "fileutils" require "find" require 'net/http' diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog index d846826c3c1d..9340c4e4e56e 100644 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow-apt-source (23.0.1-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Tue, 10 Feb 2026 10:45:08 -0000 + apache-arrow-apt-source (23.0.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in index 0579df694f06..50f678253672 100644 --- a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in +++ b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in @@ -85,6 +85,9 @@ else fi %changelog +* Tue Feb 10 2026 Raúl Cumplido - 23.0.1-1 +- New upstream release. 
+ * Tue Jan 13 2026 Raúl Cumplido - 23.0.0-1 - New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/apt/debian-forky/Dockerfile b/dev/tasks/linux-packages/apache-arrow/apt/debian-forky/Dockerfile index bfb3728b57b8..1f0524471392 100644 --- a/dev/tasks/linux-packages/apache-arrow/apt/debian-forky/Dockerfile +++ b/dev/tasks/linux-packages/apache-arrow/apt/debian-forky/Dockerfile @@ -66,7 +66,6 @@ RUN \ libssl-dev \ libthrift-dev \ libutf8proc-dev \ - libxsimd-dev \ libxxhash-dev \ libzstd-dev \ llvm-dev \ diff --git a/dev/tasks/linux-packages/apache-arrow/debian/changelog b/dev/tasks/linux-packages/apache-arrow/debian/changelog index 1f9e65a654b9..ca57c5009616 100644 --- a/dev/tasks/linux-packages/apache-arrow/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow (23.0.1-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Tue, 10 Feb 2026 10:45:08 -0000 + apache-arrow (23.0.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/debian/control.in b/dev/tasks/linux-packages/apache-arrow/debian/control.in index 78f435a0fc9e..092987556215 100644 --- a/dev/tasks/linux-packages/apache-arrow/debian/control.in +++ b/dev/tasks/linux-packages/apache-arrow/debian/control.in @@ -381,6 +381,17 @@ Description: Apache Arrow is a data processing library for analysis . This package provides GLib based library files for CUDA support. +Package: libarrow-cuda-glib-doc +Section: doc +Architecture: @CUDA_ARCHITECTURE@ +Multi-Arch: foreign +Depends: + ${misc:Depends} +Recommends: libarrow-glib-doc +Description: Apache Arrow is a data processing library for analysis + . + This package provides documentations for CUDA support. 
+ Package: gir1.2-arrow-cuda-24.0 Section: introspection Architecture: @CUDA_ARCHITECTURE@ diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib-doc.doc-base b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib-doc.doc-base new file mode 100644 index 000000000000..f7f29f811eb3 --- /dev/null +++ b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib-doc.doc-base @@ -0,0 +1,9 @@ +Document: arrow-cuda-glib +Title: Apache Arrow CUDA GLib Reference Manual +Author: The Apache Software Foundation +Abstract: Apache Arrow CUDA GLib provides an API for CUDA integration. +Section: Programming + +Format: HTML +Index: /usr/share/doc/libarrow-cuda-glib-doc/arrow-cuda-glib/index.html +Files: /usr/share/doc/libarrow-cuda-glib-doc/arrow-cuda-glib/*.html diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib-doc.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib-doc.install new file mode 100644 index 000000000000..24a3c0db2619 --- /dev/null +++ b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib-doc.install @@ -0,0 +1 @@ +usr/share/doc/arrow-cuda-glib usr/share/doc/libarrow-cuda-glib-doc/ diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib-doc.links b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib-doc.links new file mode 100644 index 000000000000..b0e7594b7042 --- /dev/null +++ b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib-doc.links @@ -0,0 +1,5 @@ +usr/share/doc/libarrow-cuda-glib-doc/arrow-cuda-glib usr/share/devhelp/books/arrow-cuda-glib +usr/share/doc/libarrow-glib-doc/arrow-glib usr/share/doc/libarrow-cuda-glib-doc/arrow-glib +usr/share/doc/libglib2.0-doc/gio usr/share/doc/libarrow-cuda-glib-doc/gio +usr/share/doc/libglib2.0-doc/glib usr/share/doc/libarrow-cuda-glib-doc/glib +usr/share/doc/libglib2.0-doc/gobject usr/share/doc/libarrow-cuda-glib-doc/gobject diff --git 
a/dev/tasks/linux-packages/apache-arrow/debian/rules b/dev/tasks/linux-packages/apache-arrow/debian/rules index 19dba393b146..08aa1c8384f4 100755 --- a/dev/tasks/linux-packages/apache-arrow/debian/rules +++ b/dev/tasks/linux-packages/apache-arrow/debian/rules @@ -85,13 +85,13 @@ override_dh_auto_build: --buildsystem=meson+ninja override_dh_auto_install: + dh_auto_install \ + --sourcedirectory=cpp \ + --builddirectory=cpp_build dh_auto_install \ --sourcedirectory=c_glib \ --builddirectory=c_glib_build \ --buildsystem=meson+ninja - dh_auto_install \ - --sourcedirectory=cpp \ - --builddirectory=cpp_build override_dh_auto_test: # TODO: We need Boost 1.64 or later to build tests for diff --git a/dev/tasks/linux-packages/apache-arrow/yum/almalinux-10/Dockerfile b/dev/tasks/linux-packages/apache-arrow/yum/almalinux-10/Dockerfile index 78134ab81662..43550cee3546 100644 --- a/dev/tasks/linux-packages/apache-arrow/yum/almalinux-10/Dockerfile +++ b/dev/tasks/linux-packages/apache-arrow/yum/almalinux-10/Dockerfile @@ -62,6 +62,5 @@ RUN \ utf8proc-devel \ vala \ which \ - xsimd-devel \ zlib-devel && \ dnf clean ${quiet} all diff --git a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in index 7bf8bd556a91..894b56d52443 100644 --- a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in @@ -877,6 +877,9 @@ Documentation for Apache Parquet GLib. %endif %changelog +* Tue Feb 10 2026 Raúl Cumplido - 23.0.1-1 +- New upstream release. + * Tue Jan 13 2026 Raúl Cumplido - 23.0.0-1 - New upstream release. diff --git a/dev/tasks/macros.jinja b/dev/tasks/macros.jinja index 01541dcecbc7..7562939f351c 100644 --- a/dev/tasks/macros.jinja +++ b/dev/tasks/macros.jinja @@ -134,7 +134,7 @@ env: echo "No wheel files found!" 
exit 1 fi - python3 -m pip install git+https://github.com/Anaconda-Platform/anaconda-client.git@1.12.3 + python3 -m pip install git+https://github.com/Anaconda-Platform/anaconda-client.git@1.14.1 anaconda -t ${CROSSBOW_SCIENTIFIC_PYTHON_UPLOAD_TOKEN} upload --force -u scientific-python-nightly-wheels --label main {{ pattern }} env: CROSSBOW_SCIENTIFIC_PYTHON_UPLOAD_TOKEN: {{ '${{ secrets.CROSSBOW_SCIENTIFIC_PYTHON_UPLOAD_TOKEN }}' }} @@ -223,11 +223,13 @@ env: path: repo/libarrow {% endif %} {% if get_nix %} - - name: Get Linux binary + {% for arch in ["x86_64", "arm64"] %} + - name: Get Linux {{ arch }} binary uses: actions/download-artifact@v4 with: - name: r-libarrow-linux-x86_64 + name: r-libarrow-linux-{{ arch }} path: repo/libarrow + {% endfor %} {% endif %} {% if get_mac %} {% for arch in ["x86_64", "arm64"] %} diff --git a/dev/tasks/python-sdist/github.yml b/dev/tasks/python-sdist/github.yml index 0ed13f921895..ac357f1e8833 100644 --- a/dev/tasks/python-sdist/github.yml +++ b/dev/tasks/python-sdist/github.yml @@ -42,4 +42,9 @@ jobs: UBUNTU: 22.04 PYARROW_VERSION: {{ arrow.no_rc_version }} + - uses: actions/upload-artifact@v6 + with: + name: sdist + path: arrow/python/dist/*.tar.gz + {{ macros.github_upload_releases("arrow/python/dist/*.tar.gz")|indent }} diff --git a/dev/tasks/r/github.linux.arrow.version.back.compat.yml b/dev/tasks/r/github.linux.arrow.version.back.compat.yml index 8e2ccba0189e..774c3e09f4c8 100644 --- a/dev/tasks/r/github.linux.arrow.version.back.compat.yml +++ b/dev/tasks/r/github.linux.arrow.version.back.compat.yml @@ -73,7 +73,7 @@ jobs: config: # We use the R version that was released at the time of the arrow release in order # to make sure we can download binaries from RSPM. 
- - { old_arrow_version: '22.0.0', r: '4.5' } + - { old_arrow_version: '22.0.0.1', r: '4.5' } - { old_arrow_version: '21.0.0.1', r: '4.5' } - { old_arrow_version: '20.0.0.2', r: '4.5' } - { old_arrow_version: '20.0.0', r: '4.5' } diff --git a/dev/tasks/r/github.linux.versions.yml b/dev/tasks/r/github.linux.versions.yml index b7b55ca82524..e5ed151a937c 100644 --- a/dev/tasks/r/github.linux.versions.yml +++ b/dev/tasks/r/github.linux.versions.yml @@ -21,12 +21,12 @@ jobs: r-versions: - name: "rstudio/r-base:{{ MATRIX }}-jammy" + name: "posit/r-base:{{ MATRIX }}-jammy" runs-on: ubuntu-latest strategy: fail-fast: false matrix: - # See https://hub.docker.com/r/rstudio/r-base + # See https://hub.docker.com/r/posit/r-base r_version: # We test devel, release, and oldrel in regular CI. # This is for older versions diff --git a/dev/tasks/r/github.macos.cran.yml b/dev/tasks/r/github.macos.cran.yml index dda8ac7fd785..930f7c5587eb 100644 --- a/dev/tasks/r/github.macos.cran.yml +++ b/dev/tasks/r/github.macos.cran.yml @@ -21,10 +21,12 @@ jobs: macos-cran: - name: "macOS similar to CRAN" + name: "macOS {{ '${{ matrix.config }}' }}" runs-on: macOS-latest strategy: fail-fast: false + matrix: + config: ["cran-m1", "cran-release"] steps: {{ macros.github_checkout_arrow()|indent }} @@ -58,7 +60,35 @@ jobs: extra-packages: | any::rcmdcheck any::sys - - name: Install + - name: Install MacOSX 11.3 SDK + if: matrix.config == 'cran-release' + env: + SDK_TOKEN: {{ '${{ secrets.JONKEANE_MACOS_11_SDK_DOWNLOAD_TOKEN }}' }} + run: | + # Download, Confirm integrity, expand. This will fail if the hash does not match. 
+ curl -fsSL -H "Authorization: Bearer $SDK_TOKEN" \ + -H "Accept: application/vnd.github+json" \ + https://api.github.com/repos/jonkeane/crossbow_11_sdk/tarball/v0.0.1 \ + -o /tmp/MacOSX11.3.sdk.tar.gz + echo "493570e56d6c6af26128e9096de738822589cc3cdb1b29aa5854f3f4c99756ac /tmp/MacOSX11.3.sdk.tar.gz" | shasum -a 256 -c - + sudo tar -xzf /tmp/MacOSX11.3.sdk.tar.gz -C /Library/Developer/CommandLineTools/SDKs/ + # Move SDK from extracted folder (GitHub archives as {owner}-{repo}-{sha}/) + sudo mv /Library/Developer/CommandLineTools/SDKs/jonkeane-crossbow_11_sdk-*/MacOSX11.3.sdk \ + /Library/Developer/CommandLineTools/SDKs/MacOSX11.3.sdk + sudo rm -rf /Library/Developer/CommandLineTools/SDKs/jonkeane-crossbow_11_sdk-* + ls -la /Library/Developer/CommandLineTools/SDKs/ + - name: Install (cran-release) + if: matrix.config == 'cran-release' + env: + _R_CHECK_CRAN_INCOMING_: false + SDKROOT: '/Library/Developer/CommandLineTools/SDKs/MacOSX11.3.sdk' + NOT_CRAN: false + run: | + sccache --start-server || echo 'sccache not found' + cd arrow/r + R CMD INSTALL . 
--install-tests + - name: Install (cran-m1) + if: matrix.config == 'cran-m1' env: _R_CHECK_CRAN_INCOMING_: false CXX: "clang++ -mmacos-version-min=14.6" @@ -77,6 +107,6 @@ jobs: - name: Save the test output uses: actions/upload-artifact@v4 with: - name: test-output + name: test-output-{{ '${{ matrix.config }}' }} path: arrow-tests/testthat.Rout* if: always() diff --git a/dev/tasks/r/github.packages.yml b/dev/tasks/r/github.packages.yml index cedb567f2cd9..b488476cd591 100644 --- a/dev/tasks/r/github.packages.yml +++ b/dev/tasks/r/github.packages.yml @@ -81,7 +81,6 @@ jobs: env: {{ macros.github_set_sccache_envvars()|indent(8) }} MACOSX_DEPLOYMENT_TARGET: "11.6" - ARROW_S3: ON ARROW_GCS: ON ARROW_DEPENDENCY_SOURCE: BUNDLED CMAKE_GENERATOR: Ninja @@ -111,14 +110,22 @@ jobs: {{ '${{ env.PKG_FILE }}' }}.sha512 linux-cpp: - name: C++ Binary Linux - runs-on: ubuntu-latest + name: C++ Binary Linux {{ '${{ matrix.arch }}' }} + runs-on: {{ '${{ matrix.runs-on }}' }} needs: source strategy: fail-fast: false + matrix: + include: + - arch: x86_64 + runs-on: ubuntu-latest + ubuntu: "22.04" + - arch: arm64 + runs-on: ubuntu-24.04-arm + ubuntu: "22.04" env: - PKG_ID: r-libarrow-linux-x86_64 - PKG_FILE: r-libarrow-linux-x86_64-{{ '${{ needs.source.outputs.pkg_version }}' }}.zip + PKG_ID: r-libarrow-linux-{{ '${{ matrix.arch }}' }} + PKG_FILE: r-libarrow-linux-{{ '${{ matrix.arch }}' }}-{{ '${{ needs.source.outputs.pkg_version }}' }}.zip steps: {{ macros.github_checkout_arrow()|indent }} {{ macros.github_change_r_pkg_version(is_fork, '${{ needs.source.outputs.pkg_version }}')|indent }} @@ -126,7 +133,8 @@ jobs: - name: Build libarrow shell: bash env: - UBUNTU: "22.04" + ARCH: {{ "${{ matrix.arch == 'x86_64' && 'amd64' || 'arm64v8' }}" }} + UBUNTU: {{ '${{ matrix.ubuntu }}' }} {{ macros.github_set_sccache_envvars()|indent(8) }} run: | source arrow/ci/scripts/util_enable_core_dumps.sh @@ -292,8 +300,8 @@ jobs: path: arrow_* test-linux-binary: needs: [source, linux-cpp] - name: Test 
binary {{ '${{ matrix.config.image }}' }} - runs-on: ubuntu-latest + name: Test binary {{ '${{ matrix.config.image }}' }} {{ '${{ matrix.config.runner }}' }} + runs-on: {{ '${{ matrix.config.runner }}' }} container: {{ '${{ matrix.config.image }}' }} strategy: fail-fast: false @@ -304,13 +312,18 @@ jobs: # an OS that is not in the allowlist, so we have to opt-in to use the # binary. Other env vars used in r_docker_configure.sh can be added # here and wired up in the later steps. - - {image: "rhub/ubuntu-clang", libarrow_binary: "TRUE"} + # x86_64 tests + - {image: "rhub/ubuntu-clang", libarrow_binary: "TRUE", runner: "ubuntu-latest"} # fedora-clang-devel cannot use binaries bc of libc++ (uncomment to see the error) - # - {image: "rhub/fedora-clang-devel", libarrow_binary: "TRUE"} - - {image: "rhub/ubuntu-release"} # currently ubuntu-22.04 - - {image: "rstudio/r-base:4.1-jammy"} - - {image: "rstudio/r-base:4.2-jammy"} - - {image: "rstudio/r-base:4.3-noble"} + # - {image: "rhub/fedora-clang-devel", libarrow_binary: "TRUE", runner: "ubuntu-latest"} + - {image: "rhub/ubuntu-release", runner: "ubuntu-latest"} # currently ubuntu-24.04 + - {image: "posit/r-base:4.3-noble", runner: "ubuntu-latest"} + - {image: "posit/r-base:4.4-noble", runner: "ubuntu-latest"} + - {image: "posit/r-base:4.5-noble", runner: "ubuntu-latest"} + # ARM64 tests + - {image: "posit/r-base:4.3-noble", runner: "ubuntu-24.04-arm"} + - {image: "posit/r-base:4.4-noble", runner: "ubuntu-24.04-arm"} + - {image: "posit/r-base:4.5-noble", runner: "ubuntu-24.04-arm"} steps: # Get the arrow checkout just for the docker config scripts # Don't need submodules for this (hence false arg to macro): they fail on @@ -339,41 +352,6 @@ jobs: Rscript -e ' {{ macros.github_test_r_src_pkg()|indent(8) }} ' - - name: Upload binary artifact - if: matrix.config.devtoolset - uses: actions/upload-artifact@v4 - with: - name: r-pkg_centos7 - path: arrow_* - - test-centos-binary: - # arrow binary package not on ppm currently see 
#37922 - if: false - needs: test-linux-binary - runs-on: ubuntu-latest - container: "rstudio/r-base:4.2-centos7" - steps: - - uses: actions/download-artifact@v4 - with: - name: r-pkg_centos7 - - name: Install DTS Package - shell: Rscript {0} - run: | - pkg <- list.files(pattern = "arrow_*") - if(length(pkg) > 1) { - pkg <- pkg[[1]] - warning("Multiple packages found! Using first one.") - } - - # Install dependencies from RSPM - install.packages("arrow", repos = "https://packagemanager.rstudio.com/all/__linux__/centos7/latest") - remove.packages("arrow") - - install.packages(pkg) - library(arrow) - read_parquet(system.file("v0.7.1.parquet", package = "arrow")) - print(arrow_info()) - test-source: needs: source name: Test {{ '${{ matrix.platform.name }}' }} source build diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 931b6da784d0..97843d2ef0cb 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -451,7 +451,7 @@ tasks: flags: -e CC=gcc-14 -e CXX=g++-14 -e RapidJSON_SOURCE=BUNDLED image: ubuntu-cpp -{% for debian_version in ["12"] %} +{% for debian_version in ["13"] %} test-debian-{{ debian_version }}-cpp-amd64: ci: github template: docker-tests/github.linux.yml @@ -589,23 +589,25 @@ tasks: UBUNTU: 22.04 image: ubuntu-python-313-freethreading - test-debian-12-python-3-amd64: +{% for debian_version in ["13"] %} + test-debian-{{ debian_version }}-python-3-amd64: ci: github template: docker-tests/github.linux.yml params: env: - DEBIAN: 12 + DEBIAN: "{{ debian_version }}" image: debian-python - test-debian-12-python-3-i386: + test-debian-{{ debian_version }}-python-3-i386: ci: github template: docker-tests/github.linux.yml params: env: ARCH: i386 - DEBIAN: 12 + DEBIAN: "{{ debian_version }}" flags: "-e ARROW_S3=OFF -e ARROW_GANDIVA=OFF" image: debian-python +{% endfor %} test-ubuntu-22.04-python-3: ci: github @@ -756,7 +758,7 @@ tasks: template: r/github.macos.m1san.yml # be sure to update binary-task.rb when upgrading Debian - test-debian-12-docs: + 
test-debian-13-docs: ci: github template: docs/github.linux.yml params: diff --git a/docs/source/_static/versions.json b/docs/source/_static/versions.json index 4a4d2c948c0b..0e6075a52be6 100644 --- a/docs/source/_static/versions.json +++ b/docs/source/_static/versions.json @@ -128,6 +128,6 @@ { "name": "1.0", "version": "1.0/", - "url": "https://arrow.apache.org/docs/dev/" + "url": "https://arrow.apache.org/docs/1.0/" } ] diff --git a/docs/source/cpp/env_vars.rst b/docs/source/cpp/env_vars.rst index 20df98c5eccf..6ee6993e2ba7 100644 --- a/docs/source/cpp/env_vars.rst +++ b/docs/source/cpp/env_vars.rst @@ -87,6 +87,18 @@ that changing their value later will have an effect. ``libhdfs.dylib`` on macOS, ``libhdfs.so`` on other platforms). Alternatively, one can set :envvar:`HADOOP_HOME`. +.. envvar:: ARROW_REGISTER_ATFORK + + **Experimental**. An integer value to enable or disable the registration + of at-fork handlers. These are enabled by default or explicitly using the + value "1"; use "0" to disable. + + If enabled, at-fork handlers make Arrow C++ compatible with the use of the + ``fork()`` system call, such as by Python's :python:mod:`multiprocessing`, + but at the expense of executing + `potentially unsafe code `__ + in a forked child process if the parent process is multi-threaded. + .. envvar:: ARROW_S3_LOG_LEVEL Controls the verbosity of logging produced by S3 calls. 
Defaults to ``FATAL`` diff --git a/docs/source/developers/cpp/fuzzing.rst b/docs/source/developers/cpp/fuzzing.rst index 7c8b346074a3..4df5455de220 100644 --- a/docs/source/developers/cpp/fuzzing.rst +++ b/docs/source/developers/cpp/fuzzing.rst @@ -26,10 +26,10 @@ Fuzzing Arrow C++ To make the handling of invalid input more robust, we have enabled fuzz testing on several parts of the Arrow C++ feature set, currently: -* the IPC stream format -* the IPC file format -* the Parquet file format -* the CSV file format +* the IPC stream reader +* the IPC file reader +* the Parquet file reader +* the CSV file reader We welcome any contribution to expand the scope of fuzz testing and cover areas ingesting potentially invalid or malicious data. @@ -110,3 +110,23 @@ dependencies, you may need to install these before building the fuzz targets: $ conda install clang clangxx compiler-rt $ cmake .. --preset=fuzzing + + +.. _fuzz-regression-files: + +Regression files +================ + +When a fuzzer-detected bug is found and fixed, the corresponding reproducer +must be stored in the `arrow-testing `__ +repository to ensure that the code doesn't regress. + +The locations for these files are as follows: + +* IPC streams: in the ``data/arrow-ipc-stream`` `directory `__ +* IPC files: in the ``data/arrow-ipc-file`` `directory `__ +* Parquet files: in the ``data/parquet/fuzzing`` `directory `__ +* CSV files: in the ``data/csv/fuzzing`` `directory `__ + +Most of those files are invalid files for their respective formats and stress +proper error detection and reporting in the implementation code. diff --git a/docs/source/developers/guide/index.rst b/docs/source/developers/guide/index.rst index 0ed27a0ddc54..c8d3103ca78a 100644 --- a/docs/source/developers/guide/index.rst +++ b/docs/source/developers/guide/index.rst @@ -141,7 +141,9 @@ of adding a basic feature. #. 
**Push the branch on your fork and create a Pull Request** - See detailed instructions on :ref:`create_pr` + See detailed instructions on :ref:`create_pr`. If you have used AI tools + to help generate your contribution, please also read our guidance on + :ref:`ai-generated-code`. If you are ready you can start with building Arrow or choose to follow diff --git a/docs/source/developers/overview.rst b/docs/source/developers/overview.rst index 7e38dcb8ebc8..a6445aaccded 100644 --- a/docs/source/developers/overview.rst +++ b/docs/source/developers/overview.rst @@ -146,6 +146,45 @@ will merge the pull request. This is done with a description, a link back to the pull request, and attribution to the contributor and any co-authors. +.. _ai-generated-code: + +AI-generated code ++++++++++++++++++ + +We recognise that AI coding assistants are now a regular part of many +developers' workflows and can improve productivity. Thoughtful use of these +tools can be beneficial, but AI-generated PRs can sometimes lead to +undesirable additional maintainer burden. PRs that appear to be fully +generated by AI with little to no engagement from the author may be closed +without further review. + +Human-generated mistakes tend to be easier to spot and reason about, and +code review is intended to be a collaborative learning experience that +benefits both submitter and reviewer. When a PR appears to have been +generated without much engagement from the submitter, reviewers with access +to AI tools could more efficiently generate the code directly, and since +the submitter is not likely to learn from the review process, their time is +more productively spent researching and reporting on the issue. 
+ +We are not opposed to the use of AI tools in generating PRs, but recommend +the following: + +* Only submit a PR if you are able to debug and own the changes yourself - + review all generated code to understand every detail +* Match the style and conventions used in the rest of the codebase, including + PR titles and descriptions +* Be upfront about AI usage and summarise what was AI-generated +* If there are parts you don't fully understand, leave comments on your own PR + explaining what steps you took to verify correctness +* Watch for AI's tendency to generate overly verbose comments, unnecessary + test cases, and incorrect fixes +* Break down large PRs into smaller ones to make review easier + +PR authors are also responsible for disclosing any copyrighted materials in +submitted contributions. See the `ASF's guidance on AI-generated code +`_ for further +information on licensing considerations. + .. Section on Experimental repositories: .. include:: experimental_repos.rst diff --git a/docs/source/developers/python/development.rst b/docs/source/developers/python/development.rst index d03b2439b10e..5529ad25a294 100644 --- a/docs/source/developers/python/development.rst +++ b/docs/source/developers/python/development.rst @@ -101,6 +101,74 @@ The test groups currently include: * ``s3``: Tests for Amazon S3 * ``tensorflow``: Tests that involve TensorFlow +Type Checking +============= + +PyArrow provides type stubs (``*.pyi`` files) for static type checking. These +stubs are located in the ``pyarrow-stubs/`` directory and are automatically +included in the distributed wheel packages. + +Running Type Checkers +--------------------- + +We support multiple type checkers. Their configurations are in +``pyproject.toml``. + +**mypy** + +To run mypy on the PyArrow codebase: + +.. code-block:: + + $ cd arrow/python + $ mypy + +The mypy configuration is in the ``[tool.mypy]`` section of ``pyproject.toml``. + +**pyright** + +To run pyright: + +.. 
code-block:: + + $ cd arrow/python + $ pyright + +The pyright configuration is in the ``[tool.pyright]`` section of ``pyproject.toml``. + +**ty** + +To run ty (note: currently only partially configured): + +.. code-block:: + + $ cd arrow/python + $ ty check + +Maintaining Type Stubs +----------------------- + +Type stubs for PyArrow are maintained in the ``pyarrow-stubs/`` +directory. These stubs mirror the structure of the main ``pyarrow/`` package. + +When adding or modifying public APIs: + +1. **Update the corresponding ``.pyi`` stub file** in ``pyarrow-stubs/`` + to reflect the new or changed function/class signatures. + +2. **Include type annotations** where possible. For Cython modules or + dynamically generated APIs such as compute kernels add the corresponding + stub in ``pyarrow-stubs/``. + +3. **Run type checkers** to ensure the stubs are correct and complete. + +The stub files are automatically copied into the built wheel during the build +process and will be included when users install PyArrow, enabling type checking +in downstream projects and for users' IDEs. + +Note: ``py.typed`` marker file in the ``pyarrow/`` directory indicates to type +checkers that PyArrow supports type checking according to :pep:`561`. + Doctest ======= @@ -127,6 +195,24 @@ for ``.py`` files or for ``.pyx`` and ``.pxi`` files. In this case you will also need to install the `pytest-cython `_ plugin. +Testing Documentation Examples +------------------------------- + +Documentation examples in ``.rst`` files under ``docs/source/python/`` use +doctest syntax and can be tested locally using: + +.. code-block:: + + $ pushd arrow/python + $ pytest --doctest-glob="*.rst" docs/source/python/file.rst # checking single file + $ pytest --doctest-glob="*.rst" docs/source/python # checking entire directory + $ popd + +The examples use standard doctest syntax with ``>>>`` for Python prompts and +``...`` for continuation lines. 
The ``conftest.py`` fixture automatically +handles temporary directory setup for examples that create files. + + Debugging ========= @@ -168,4 +254,4 @@ Similarly, use lldb when debugging on macOS. Benchmarking ============ -For running the benchmarks, see :ref:`python-benchmarks`. +For running the benchmarks, see :ref:`benchmarks`. diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 697e7627d89d..41b94aa0a83a 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -285,6 +285,8 @@ UUID A specific UUID version is not required or guaranteed. This extension represents UUIDs as FixedSizeBinary(16) with big-endian notation and does not interpret the bytes in any way. +.. _opaque_extension: + Opaque ====== diff --git a/docs/source/format/Integration.rst b/docs/source/format/Integration.rst index ca88a825dc83..5038132241c3 100644 --- a/docs/source/format/Integration.rst +++ b/docs/source/format/Integration.rst @@ -561,6 +561,8 @@ in ``datagen.py``): * Extension Types +.. _format-gold-integration-files: + Gold File Integration Tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/format/Security.rst b/docs/source/format/Security.rst new file mode 100644 index 000000000000..0c117fe1e21d --- /dev/null +++ b/docs/source/format/Security.rst @@ -0,0 +1,277 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. 
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _format_security: + +*********************** +Security Considerations +*********************** + +This document describes security considerations when reading Arrow +data from untrusted sources. It focuses specifically on data passed in a +standardized serialized form (such as an IPC stream), as opposed to an +implementation-specific native representation (such as ``arrow::Array`` in C++). + +.. note:: + Implementation-specific concerns, such as bad API usage, are out of scope + for this document. Please refer to the implementation's own documentation. + + +Who should read this +==================== + +You should read this document if you belong to either of these two categories: + +1. *users* of Arrow: that is, developers of third-party libraries or applications + that don't directly implement the Arrow formats or protocols, but instead + call language-specific APIs provided by an Arrow library (as defined below); + +2. *implementors* of Arrow libraries: that is, libraries that provide APIs + abstracting away from the details of the Arrow formats and protocols; such + libraries include, but are not limited to, the official Arrow implementations + documented on https://arrow.apache.org. + + +Columnar Format +=============== + +Invalid data +------------ + +The Arrow :ref:`columnar format <format_columnar>` is an efficient binary +representation with a focus on performance and efficiency. While the format +does not store raw pointers, the contents of Arrow buffers are often +combined and converted to pointers into the process' address space. +Invalid Arrow data may therefore cause invalid memory accesses +(potentially crashing the process) or access to non-Arrow data +(potentially allowing an attacker to exfiltrate confidential information).
+ +For instance, to read a value from a Binary array, one needs to 1) read the +values' offsets from the array's offsets buffer, and 2) read the range of bytes +delimited by these offsets in the array's data buffer. If the offsets are +invalid (deliberately or not), then step 2) can access memory outside of the +data buffer's range. + +Another instance of invalid data lies in the values themselves. For example, +a String array is only allowed to contain valid UTF-8 data, but an untrusted +source might have emitted invalid UTF-8 under the disguise of a String array. +An unsuspecting algorithm that is only specified for valid UTF-8 inputs might +lead to dangerous behavior (for example by reading memory out of bounds when +looking for an UTF-8 character boundary). + +Fortunately, knowing its schema, it is possible to validate Arrow data up front, +so that reading this data will not pose any danger later on. + +.. TODO: + For each layout, we should list the associated security risks and the recommended + steps to validate (perhaps in Columnar.rst) + +Advice for users +'''''''''''''''' + +Arrow implementations often assume inputs follow the specification to provide +high speed processing. It is **extremely recommended** that your application +explicitly validates any Arrow data it receives under serialized form +from untrusted sources. Many Arrow implementations provide explicit APIs to +perform such validation. + +.. TODO: link to some validation APIs for the main implementations here? + +Advice for implementors +''''''''''''''''''''''' + +It is **recommended** that you provide dedicated APIs to validate Arrow arrays +and/or record batches. Users will be able to utilize those APIs to assert whether +data coming from untrusted sources can be safely accessed. + +A typical validation API must return a well-defined error, not crash, if the +given Arrow data is invalid; it must always be safe to execute regardless of +whether the data is valid or not. 
+ +Uninitialized data +------------------ + +A less obvious pitfall is when some parts of an Arrow array are left uninitialized. +For example, if an element of a primitive Arrow array is marked null through its +validity bitmap, the corresponding value slot in the values buffer can be ignored +for all purposes. It is therefore tempting, when creating an array with null +values, to not initialize the corresponding value slots. + +However, this then introduces a serious security risk if the Arrow data is +serialized and published (e.g. using IPC or Flight) such that it can be +accessed by untrusted users. Indeed, the uninitialized value slot can +reveal data left by a previous memory allocation made in the same process. +Depending on the application, this data could contain confidential information. + +Advice for users and implementors +''''''''''''''''''''''''''''''''' + +When creating an Arrow array, it is **recommended** that you never leave any +data uninitialized in a buffer if the array might be sent to, or read by, an +untrusted third-party, even when the uninitialized data is logically +irrelevant. The easiest way to do this is to zero-initialize any buffer that +will not be populated in full. + +If it is determined, through benchmarking, that zero-initialization imposes +an excessive performance cost, a library or application may instead decide +to use uninitialized memory internally as an optimization; but it should then +ensure all such uninitialized values are cleared before passing the Arrow data +to another system. + +.. note:: + Sending Arrow data out of the current process can happen *indirectly*, + for example if you produce it over the C Data Interface and the consumer + persists it using the IPC format on some public storage. + + +C Data Interface +================ + +The C Data Interface contains raw pointers into the process' address space.
+It is generally not possible to validate that those pointers are legitimate; +reading from such a pointer may crash or access unrelated or bogus data. + +Advice for users +---------------- + +You should **never** consume a C Data Interface structure from an untrusted +producer, as it is by construction impossible to guard against dangerous +behavior in this case. + +Advice for implementors +----------------------- + +When consuming a C Data Interface structure, you can assume that it comes from +a trusted producer, for the reason explained above. However, it is still +**recommended** that you validate it for soundness (for example that the right +number of buffers is passed for a given datatype), as a trusted producer can +have bugs anyway. + + +IPC Format +========== + +The :ref:`IPC format <ipc-message-format>` is a serialization format for the +columnar format with associated metadata. Reading an IPC stream or file from +an untrusted source comes with similar caveats as reading the Arrow columnar +format. + +The additional signaling and metadata in the IPC format come with +their own risks. For example, buffer offsets and sizes encoded in IPC messages +may be out of bounds for the IPC stream; Flatbuffers-encoded metadata payloads +may carry incorrect offsets pointing outside of the designated metadata area. + +Advice for users +---------------- + +Arrow libraries will typically ensure IPC streams are structurally valid +but may not also validate the underlying Array data. It is **extremely recommended** +that you use the appropriate APIs to validate the Arrow data read from an untrusted IPC stream. + +Advice for implementors +----------------------- + +It is **extremely recommended** to run dedicated validation checks when decoding +the IPC format, to make sure that the decoding cannot induce unwanted behavior. +Failing those checks should return a well-known error to the caller, not crash.
+ + +Extension Types +=============== + +Extension types typically register a custom deserialization hook so that they +can be automatically recreated when reading from an external source (for example +using IPC). The deserialization hook has to decode the extension type's parameters +from a string or binary payload specific to the extension type. +:ref:`Typical examples ` use a bespoke JSON representation +with object fields representing the various parameters. + +When reading data from an untrusted source, any registered deserialization hook +could be called with an arbitrary payload. It is therefore of primary importance +that the hook be safe to call on invalid, potentially malicious, data. This mandates +the use of a robust metadata serialization schema (such as JSON, but not Python's +`pickle `__ or R's +`serialize() `__, +for example). + +Advice for users and implementors +--------------------------------- + +When designing an extension type, it is **extremely recommended** to choose a +metadata serialization format that is robust against potentially malicious +data. + +When implementing an extension type, it is **recommended** to ensure that the +deserialization hook is able to detect, and error out gracefully, if the +serialized metadata payload is invalid. + + +Testing for robustness +====================== + +Advice for implementors +----------------------- + +For APIs that may process untrusted inputs, it is **extremely recommended** +that your unit tests exercise your APIs against typical kinds of invalid data. +For example, your validation APIs will have to be tested against invalid Binary +or List offsets, invalid UTF-8 data in a String array, etc. + +Testing against known regression files +'''''''''''''''''''''''''''''''''''''' + +The `arrow-testing `__ repository +contains regression files for various formats, such as the IPC format. 
+
+Two categories of files are especially noteworthy and can serve to exercise
+an Arrow implementation's robustness:
+
+1. :ref:`gold integration files ` that are valid
+   files to exercise compliance with Arrow IPC features;
+2. :ref:`fuzz regression files ` that have been automatically
+   generated each time a fuzzer finds a bug triggered by a specific (usually invalid)
+   input for a given format.
+
+Fuzzing
+'''''''
+
+It is **recommended** that you go one step further and set up some kind of
+automated robustness testing against unforeseen inputs. One typical approach
+is through fuzzing, possibly coupled with a runtime instrumentation framework
+that detects dangerous behavior (such as Address Sanitizer in C++ or
+Rust).
+
+A reasonable way of setting up fuzzing for Arrow is using the IPC format as
+a binary payload; the fuzz target should not only attempt to decode the IPC
+stream as Arrow data, but it should then validate the Arrow data.
+This will strengthen both the IPC decoder and the validation routines
+against invalid, potentially malicious data. Finally, if validation comes out
+successfully, the fuzz target may exercise some important core functionality,
+such as printing the data for human display; this will help ensure that the
+validation routine did not let through invalid data that may lead to dangerous
+behavior.
+
+
+Non-Arrow formats and protocols
+===============================
+
+Arrow data can also be sent or stored using third-party formats such as Apache
+Parquet. Those formats may or may not present the same security risks as listed
+above (for example, the precautions around uninitialized data may not apply
+in a format like Parquet that does not create any value slots for null elements).
+We suggest you refer to these projects' own documentation for more concrete
+guidelines.
diff --git a/docs/source/format/index.rst b/docs/source/format/index.rst index 91912a5325d5..bbcc3ec62115 100644 --- a/docs/source/format/index.rst +++ b/docs/source/format/index.rst @@ -37,5 +37,6 @@ Specifications Flight FlightSql ADBC + Security Integration Glossary diff --git a/docs/source/implementations.rst b/docs/source/implementations.rst index daeea2c51460..44f851332135 100644 --- a/docs/source/implementations.rst +++ b/docs/source/implementations.rst @@ -121,6 +121,6 @@ The source files for the Cookbook are maintained in the R Ruby Rust - Swift + Swift nanoarrow Implementation Status diff --git a/docs/source/python/api/tables.rst b/docs/source/python/api/tables.rst index 48cc67eb6672..b795680e7151 100644 --- a/docs/source/python/api/tables.rst +++ b/docs/source/python/api/tables.rst @@ -64,3 +64,7 @@ Tensors :toctree: ../generated/ Tensor + SparseCOOTensor + SparseCSRMatrix + SparseCSCMatrix + SparseCSFTensor diff --git a/docs/source/python/benchmarks.rst b/docs/source/python/benchmarks.rst deleted file mode 100644 index 68fc03c7bcfb..000000000000 --- a/docs/source/python/benchmarks.rst +++ /dev/null @@ -1,55 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. 
_python-benchmarks: - -Benchmarks -========== - -The ``pyarrow`` package comes with a suite of benchmarks meant to -run with `ASV`_. You'll need to install the ``asv`` package first -(``pip install asv`` or ``conda install -c conda-forge asv``). - -Running the benchmarks ----------------------- - -To run the benchmarks for a locally-built Arrow, run ``asv run --python=same``. - -We use conda environments as part of running the benchmarks. To use the ``asv`` -setup, you must set the ``$CONDA_HOME`` environment variable to point to the -root of your conda installation. - -Running for arbitrary Git revisions ------------------------------------ - -ASV allows to store results and generate graphs of the benchmarks over -the project's evolution. You need to have the latest development version of ASV: - -.. code:: - - pip install git+https://github.com/airspeed-velocity/asv - -Now you should be ready to run ``asv run`` or whatever other command -suits your needs. Note that this can be quite long, as each Arrow needs -to be rebuilt for each Git revision you're running the benchmarks for. - -Compatibility -------------- - -We only expect the benchmarking setup to work on a Unix-like system with bash. - -.. _asv: https://asv.readthedocs.org/ diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index 397af9d2c517..81d12957c28c 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -26,7 +26,9 @@ Arrow supports logical compute operations over inputs of possibly varying types. The standard compute operations are provided by the :mod:`pyarrow.compute` -module and can be used directly:: +module and can be used directly: + +.. code-block:: python >>> import pyarrow as pa >>> import pyarrow.compute as pc @@ -45,14 +47,14 @@ Many compute functions support both array (chunked or not) and scalar inputs, but some will mandate either. For example, ``sort_indices`` requires its first and only input to be an array. 
-Below are a few simple examples:: +Below are a few simple examples: + +.. code-block:: python - >>> import pyarrow as pa - >>> import pyarrow.compute as pc >>> a = pa.array([1, 1, 2, 3]) >>> b = pa.array([4, 1, 2, 8]) >>> pc.equal(a, b) - + [ false, true, @@ -65,10 +67,10 @@ Below are a few simple examples:: If you are using a compute function which returns more than one value, results will be returned as a ``StructScalar``. You can extract the individual values by -calling the :meth:`pyarrow.StructScalar.values` method:: +calling the :meth:`pyarrow.StructScalar.values` method: + +.. code-block:: python - >>> import pyarrow as pa - >>> import pyarrow.compute as pc >>> a = pa.array([1, 1, 2, 3]) >>> pc.min_max(a) @@ -79,14 +81,14 @@ calling the :meth:`pyarrow.StructScalar.values` method:: These functions can do more than just element-by-element operations. -Here is an example of sorting a table:: +Here is an example of sorting a table: + +.. code-block:: python - >>> import pyarrow as pa - >>> import pyarrow.compute as pc >>> t = pa.table({'x':[1,2,3],'y':[3,2,1]}) >>> i = pc.sort_indices(t, sort_keys=[('y', 'ascending')]) >>> i - + [ 2, 1, @@ -108,28 +110,30 @@ Grouped Aggregations PyArrow supports grouped aggregations over :class:`pyarrow.Table` through the :meth:`pyarrow.Table.group_by` method. The method will return a grouping declaration -to which the hash aggregation functions can be applied:: +to which the hash aggregation functions can be applied: + +.. code-block:: python - >>> import pyarrow as pa >>> t = pa.table([ ... pa.array(["a", "a", "b", "b", "c"]), ... pa.array([1, 2, 3, 4, 5]), ... ], names=["keys", "values"]) >>> t.group_by("keys").aggregate([("values", "sum")]) pyarrow.Table - values_sum: int64 keys: string + values_sum: int64 ---- - values_sum: [[3,7,5]] keys: [["a","b","c"]] + values_sum: [[3,7,5]] The ``"sum"`` aggregation passed to the ``aggregate`` method in the previous example is the ``hash_sum`` compute function. 
Multiple aggregations can be performed at the same time by providing them -to the ``aggregate`` method:: +to the ``aggregate`` method: + +.. code-block:: python - >>> import pyarrow as pa >>> t = pa.table([ ... pa.array(["a", "a", "b", "b", "c"]), ... pa.array([1, 2, 3, 4, 5]), @@ -139,20 +143,20 @@ to the ``aggregate`` method:: ... ("keys", "count") ... ]) pyarrow.Table + keys: string values_sum: int64 keys_count: int64 - keys: string ---- + keys: [["a","b","c"]] values_sum: [[3,7,5]] keys_count: [[2,2,1]] - keys: [["a","b","c"]] Aggregation options can also be provided for each aggregation function, for example we can use :class:`CountOptions` to change how we count -null values:: +null values: + +.. code-block:: python - >>> import pyarrow as pa - >>> import pyarrow.compute as pc >>> table_with_nulls = pa.table([ ... pa.array(["a", "a", "a"]), ... pa.array([1, None, None]) @@ -161,20 +165,20 @@ null values:: ... ("values", "count", pc.CountOptions(mode="all")) ... ]) pyarrow.Table - values_count: int64 keys: string + values_count: int64 ---- - values_count: [[3]] keys: [["a"]] + values_count: [[3]] >>> table_with_nulls.group_by(["keys"]).aggregate([ ... ("values", "count", pc.CountOptions(mode="only_valid")) ... ]) pyarrow.Table - values_count: int64 keys: string + values_count: int64 ---- - values_count: [[1]] keys: [["a"]] + values_count: [[1]] Following is a list of all supported grouped aggregation functions. You can use them with or without the ``"hash_"`` prefix. @@ -212,20 +216,19 @@ on which the join should be performed: .. code-block:: python - import pyarrow as pa - - table1 = pa.table({'id': [1, 2, 3], - 'year': [2020, 2022, 2019]}) - - table2 = pa.table({'id': [3, 4], - 'n_legs': [5, 100], - 'animal': ["Brittle stars", "Centipede"]}) - - joined_table = table1.join(table2, keys="id") + >>> table1 = pa.table({'id': [1, 2, 3], + ... 'year': [2020, 2022, 2019]}) + >>> table2 = pa.table({'id': [3, 4], + ... 'n_legs': [5, 100], + ... 
'animal': ["Brittle stars", "Centipede"]}) + >>> joined_table = table1.join(table2, keys="id") The result will be a new table created by joining ``table1`` with -``table2`` on the ``id`` key with a ``left outer join``:: +``table2`` on the ``id`` key with a ``left outer join``: +.. code-block:: python + + >>> joined_table pyarrow.Table id: int64 year: int64 @@ -242,70 +245,57 @@ passing them to the ``join_type`` argument: .. code-block:: python - table1.join(table2, keys='id', join_type="full outer") - -In that case the result would be:: - + >>> table1.join(table2, keys='id', join_type="full outer").combine_chunks().sort_by('id') pyarrow.Table id: int64 year: int64 n_legs: int64 animal: string ---- - id: [[3,1,2,4]] - year: [[2019,2020,2022,null]] - n_legs: [[5,null,null,100]] - animal: [["Brittle stars",null,null,"Centipede"]] + id: [[1,2,3,4]] + year: [[2020,2022,2019,null]] + n_legs: [[null,null,5,100]] + animal: [[null,null,"Brittle stars","Centipede"]] It's also possible to provide additional join keys, so that the join happens on two keys instead of one. For example we can add an ``year`` column to ``table2`` so that we can join on ``('id', 'year')``: -.. code-block:: - - table2_withyear = table2.append_column("year", pa.array([2019, 2022])) - table1.join(table2_withyear, keys=["id", "year"]) - -The result will be a table where only entries with ``id=3`` and ``year=2019`` -have data, the rest will be ``null``:: +.. code-block:: python + >>> table2_withyear = table2.append_column("year", pa.array([2019, 2022])) + >>> table1.join(table2_withyear, keys=["id", "year"]) pyarrow.Table id: int64 year: int64 - animal: string n_legs: int64 + animal: string ---- id: [[3,1,2]] year: [[2019,2020,2022]] - animal: [["Brittle stars",null,null]] n_legs: [[5,null,null]] + animal: [["Brittle stars",null,null]] The same capabilities are available for :meth:`.Dataset.join` too, so you can take two datasets and join them: -.. 
code-block:: - - import pyarrow.dataset as ds - - ds1 = ds.dataset(table1) - ds2 = ds.dataset(table2) - - joined_ds = ds1.join(ds2, keys="id") - -The resulting dataset will be an :class:`.InMemoryDataset` containing the joined data:: +.. code-block:: python + >>> import pyarrow.dataset as ds + >>> ds1 = ds.dataset(table1) + >>> ds2 = ds.dataset(table2) + >>> joined_ds = ds1.join(ds2, keys="id") >>> joined_ds.head(5) - pyarrow.Table id: int64 year: int64 - animal: string n_legs: int64 + animal: string ---- id: [[3,1,2]] year: [[2019,2020,2022]] - animal: [["Brittle stars",null,null]] n_legs: [[5,null,null]] + animal: [["Brittle stars",null,null]] .. _py-filter-expr: @@ -328,8 +318,7 @@ in column ``"nums"`` .. code-block:: python - import pyarrow.compute as pc - even_filter = (pc.bit_wise_and(pc.field("nums"), pc.scalar(1)) == pc.scalar(0)) + >>> even_filter = (pc.bit_wise_and(pc.field("nums"), pc.scalar(1)) == pc.scalar(0)) .. note:: @@ -387,6 +376,8 @@ our ``even_filter`` with a ``pc.field("nums") > 5`` filter: The method will return an instance of :class:`.Dataset` which will lazily apply the filter as soon as actual data of the dataset is accessed: +.. code-block:: python + >>> dataset = ds.dataset(table) >>> filtered = dataset.filter(pc.field("nums") < 5).filter(pc.field("nums") > 2) >>> filtered.to_table() @@ -420,42 +411,36 @@ output type need to be defined. Using :func:`pyarrow.compute.register_scalar_fun .. code-block:: python - import numpy as np - - import pyarrow as pa - import pyarrow.compute as pc - - function_name = "numpy_gcd" - function_docs = { - "summary": "Calculates the greatest common divisor", - "description": - "Given 'x' and 'y' find the greatest number that divides\n" - "evenly into both x and y." 
- } - - input_types = { - "x" : pa.int64(), - "y" : pa.int64() - } - - output_type = pa.int64() - - def to_np(val): - if isinstance(val, pa.Scalar): - return val.as_py() - else: - return np.array(val) - - def gcd_numpy(ctx, x, y): - np_x = to_np(x) - np_y = to_np(y) - return pa.array(np.gcd(np_x, np_y)) - - pc.register_scalar_function(gcd_numpy, - function_name, - function_docs, - input_types, - output_type) + >>> import numpy as np + >>> function_name = "numpy_gcd" + >>> function_docs = { + ... "summary": "Calculates the greatest common divisor", + ... "description": + ... "Given 'x' and 'y' find the greatest number that divides\n" + ... "evenly into both x and y." + ... } + >>> input_types = { + ... "x" : pa.int64(), + ... "y" : pa.int64() + ... } + >>> output_type = pa.int64() + >>> + >>> def to_np(val): + ... if isinstance(val, pa.Scalar): + ... return val.as_py() + ... else: + ... return np.array(val) + >>> + >>> def gcd_numpy(ctx, x, y): + ... np_x = to_np(x) + ... np_y = to_np(y) + ... return pa.array(np.gcd(np_x, np_y)) + >>> + >>> pc.register_scalar_function(gcd_numpy, + ... function_name, + ... function_docs, + ... input_types, + ... output_type) The implementation of a user-defined function always takes a first *context* @@ -472,7 +457,7 @@ You can call a user-defined function directly using :func:`pyarrow.compute.call_ >>> pc.call_function("numpy_gcd", [pa.scalar(27), pa.scalar(63)]) >>> pc.call_function("numpy_gcd", [pa.scalar(27), pa.array([81, 12, 5])]) - + [ 27, 3, @@ -492,7 +477,6 @@ the GCD of one column with the scalar value 30. We will be re-using the .. 
code-block:: python - >>> import pyarrow.dataset as ds >>> data_table = pa.table({'category': ['A', 'B', 'C', 'D'], 'value': [90, 630, 1827, 2709]}) >>> dataset = ds.dataset(data_table) >>> func_args = [pc.scalar(30), ds.field("value")] diff --git a/docs/source/python/conftest.py b/docs/source/python/conftest.py new file mode 100644 index 000000000000..7ec0cc1936a5 --- /dev/null +++ b/docs/source/python/conftest.py @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pytest + + +# Save output files from doctest examples into temp dir +@pytest.fixture(autouse=True) +def _docdir(request): + # Trigger ONLY for the doctests + from _pytest.doctest import DoctestItem + is_doctest = isinstance(request.node, DoctestItem) + + if is_doctest: + # Get the fixture dynamically by its name. + tmpdir = request.getfixturevalue('tmpdir') + + # Chdir only for the duration of the test. + with tmpdir.as_cwd(): + yield + else: + yield diff --git a/docs/source/python/csv.rst b/docs/source/python/csv.rst index 5eb68e9ccdc5..2bc2ccabc996 100644 --- a/docs/source/python/csv.rst +++ b/docs/source/python/csv.rst @@ -41,12 +41,16 @@ Usage CSV reading and writing functionality is available through the :mod:`pyarrow.csv` module. 
In many cases, you will simply call the -:func:`read_csv` function with the file path you want to read from:: +:func:`read_csv` function with the file path you want to read from: + +.. code-block:: python >>> from pyarrow import csv - >>> fn = 'tips.csv.gz' - >>> table = csv.read_csv(fn) - >>> table + >>> import pyarrow as pa + >>> import pandas as pd + >>> fn = 'tips.csv.gz' # doctest: +SKIP + >>> table = csv.read_csv(fn) # doctest: +SKIP + >>> table # doctest: +SKIP pyarrow.Table total_bill: double tip: double @@ -55,10 +59,10 @@ CSV reading and writing functionality is available through the day: string time: string size: int64 - >>> len(table) + >>> len(table) # doctest: +SKIP 244 - >>> df = table.to_pandas() - >>> df.head() + >>> df = table.to_pandas() # doctest: +SKIP + >>> df.head() # doctest: +SKIP total_bill tip sex smoker day time size 0 16.99 1.01 Female No Sun Dinner 2 1 10.34 1.66 Male No Sun Dinner 3 @@ -68,10 +72,11 @@ CSV reading and writing functionality is available through the To write CSV files, just call :func:`write_csv` with a :class:`pyarrow.RecordBatch` or :class:`pyarrow.Table` and a path or -file-like object:: +file-like object: + +.. code-block:: python - >>> import pyarrow as pa - >>> import pyarrow.csv as csv + >>> table = pa.table({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}) >>> csv.write_csv(table, "tips.csv") >>> with pa.CompressedOutputStream("tips.csv.gz", "gzip") as out: ... csv.write_csv(table, out) @@ -83,15 +88,21 @@ Customized parsing To alter the default parsing settings in case of reading CSV files with an unusual structure, you should create a :class:`ParseOptions` instance -and pass it to :func:`read_csv`:: - - import pyarrow as pa - import pyarrow.csv as csv - - table = csv.read_csv('tips.csv.gz', parse_options=csv.ParseOptions( - delimiter=";", - invalid_row_handler=skip_handler - )) +and pass it to :func:`read_csv`: + +.. code-block:: python + + >>> def skip_handler(row): + ... 
pass + >>> table = csv.read_csv('tips.csv.gz', parse_options=csv.ParseOptions( + ... delimiter=";", + ... invalid_row_handler=skip_handler + ... )) + >>> table + pyarrow.Table + col1,"col2": string + ---- + col1,"col2": [["1,"a"","2,"b"","3,"c""]] Available parsing options are: @@ -113,17 +124,23 @@ Customized conversion --------------------- To alter how CSV data is converted to Arrow types and data, you should create -a :class:`ConvertOptions` instance and pass it to :func:`read_csv`:: +a :class:`ConvertOptions` instance and pass it to :func:`read_csv`: - import pyarrow as pa - import pyarrow.csv as csv +.. code-block:: python - table = csv.read_csv('tips.csv.gz', convert_options=csv.ConvertOptions( - column_types={ - 'total_bill': pa.decimal128(precision=10, scale=2), - 'tip': pa.decimal128(precision=10, scale=2), - } - )) + >>> table = csv.read_csv('tips.csv.gz', convert_options=csv.ConvertOptions( + ... column_types={ + ... 'total_bill': pa.decimal128(precision=10, scale=2), + ... 'tip': pa.decimal128(precision=10, scale=2), + ... } + ... )) + >>> table + pyarrow.Table + col1: int64 + col2: string + ---- + col1: [[1,2,3]] + col2: [["a","b","c"]] .. note:: To assign a column as ``duration``, the CSV values must be numeric strings @@ -173,15 +190,21 @@ Character encoding By default, CSV files are expected to be encoded in UTF8. Non-UTF8 data is accepted for ``binary`` columns. The encoding can be changed using -the :class:`ReadOptions` class:: +the :class:`ReadOptions` class: - import pyarrow as pa - import pyarrow.csv as csv +.. code-block:: python - table = csv.read_csv('tips.csv.gz', read_options=csv.ReadOptions( - column_names=["animals", "n_legs", "entry"], - skip_rows=1 - )) + >>> table = csv.read_csv('tips.csv.gz', read_options=csv.ReadOptions( + ... column_names=["n_legs", "entry"], + ... skip_rows=1 + ... 
)) + >>> table + pyarrow.Table + n_legs: int64 + entry: string + ---- + n_legs: [[1,2,3]] + entry: [["a","b","c"]] Available read options are: @@ -204,10 +227,10 @@ Customized writing To alter the default write settings in case of writing CSV files with different conventions, you can create a :class:`WriteOptions` instance and -pass it to :func:`write_csv`:: +pass it to :func:`write_csv`: + +.. code-block:: python - >>> import pyarrow as pa - >>> import pyarrow.csv as csv >>> # Omit the header row (include_header=True is the default) >>> options = csv.WriteOptions(include_header=False) >>> csv.write_csv(table, "data.csv", options) @@ -217,12 +240,12 @@ Incremental writing To write CSV files one batch at a time, create a :class:`CSVWriter`. This requires the output (a path or file-like object), the schema of the data to -be written, and optionally write options as described above:: +be written, and optionally write options as described above: + +.. code-block:: python - >>> import pyarrow as pa - >>> import pyarrow.csv as csv >>> with csv.CSVWriter("data.csv", table.schema) as writer: - >>> writer.write_table(table) + ... writer.write_table(table) Performance ----------- diff --git a/docs/source/python/data.rst b/docs/source/python/data.rst index 63df734163fc..22a3114fdd28 100644 --- a/docs/source/python/data.rst +++ b/docs/source/python/data.rst @@ -58,19 +58,22 @@ array data. These include: Each data type in Arrow has a corresponding factory function for creating an instance of that type object in Python: -.. ipython:: python +.. 
code-block:: python - import pyarrow as pa - t1 = pa.int32() - t2 = pa.string() - t3 = pa.binary() - t4 = pa.binary(10) - t5 = pa.timestamp('ms') - - t1 - print(t1) - print(t4) - print(t5) + >>> import pyarrow as pa + >>> t1 = pa.int32() + >>> t2 = pa.string() + >>> t3 = pa.binary() + >>> t4 = pa.binary(10) + >>> t5 = pa.timestamp('ms') + >>> t1 + DataType(int32) + >>> print(t1) + int32 + >>> print(t4) + fixed_size_binary[10] + >>> print(t5) + timestamp[ms] .. note:: Different data types might use a given physical storage. For example, @@ -83,44 +86,50 @@ input data (e.g. Python objects) may be coerced to more than one Arrow type. The :class:`~pyarrow.Field` type is a type plus a name and optional user-defined metadata: -.. ipython:: python +.. code-block:: python - f0 = pa.field('int32_field', t1) - f0 - f0.name - f0.type + >>> f0 = pa.field('int32_field', t1) + >>> f0 + pyarrow.Field + >>> f0.name + 'int32_field' + >>> f0.type + DataType(int32) Arrow supports **nested value types** like list, map, struct, and union. When creating these, you must pass types or fields to indicate the data types of the types' children. For example, we can define a list of int32 values with: -.. ipython:: python +.. code-block:: python - t6 = pa.list_(t1) - t6 + >>> t6 = pa.list_(t1) + >>> t6 + ListType(list) A ``struct`` is a collection of named fields: -.. ipython:: python - - fields = [ - pa.field('s0', t1), - pa.field('s1', t2), - pa.field('s2', t4), - pa.field('s3', t6), - ] +.. code-block:: python - t7 = pa.struct(fields) - print(t7) + >>> fields = [ + ... pa.field('s0', t1), + ... pa.field('s1', t2), + ... pa.field('s2', t4), + ... pa.field('s3', t6), + ... ] + >>> t7 = pa.struct(fields) + >>> print(t7) + struct> For convenience, you can pass ``(name, type)`` tuples directly instead of :class:`~pyarrow.Field` instances: -.. ipython:: python +.. 
code-block:: python - t8 = pa.struct([('s0', t1), ('s1', t2), ('s2', t4), ('s3', t6)]) - print(t8) - t8 == t7 + >>> t8 = pa.struct([('s0', t1), ('s1', t2), ('s2', t4), ('s3', t6)]) + >>> print(t8) + struct> + >>> t8 == t7 + True See :ref:`Data Types API ` for a full listing of data type @@ -136,13 +145,18 @@ defines the column names and types in a record batch or table data structure. The :func:`pyarrow.schema` factory function makes new Schema objects in Python: -.. ipython:: python +.. code-block:: python - my_schema = pa.schema([('field0', t1), - ('field1', t2), - ('field2', t4), - ('field3', t6)]) - my_schema + >>> my_schema = pa.schema([('field0', t1), + ... ('field1', t2), + ... ('field2', t4), + ... ('field3', t6)]) + >>> my_schema + field0: int32 + field1: string + field2: fixed_size_binary[10] + field3: list + child 0, item: int32 In some applications, you may not create schemas directly, only using the ones that are embedded in :ref:`IPC messages `. @@ -150,11 +164,16 @@ that are embedded in :ref:`IPC messages `. Schemas are immutable, which means you can't update an existing schema, but you can create a new one with updated values using :meth:`Schema.set`. -.. ipython:: python +.. code-block:: python - updated_field = pa.field('field0_new', pa.int64()) - my_schema2 = my_schema.set(0, updated_field) - my_schema2 + >>> updated_field = pa.field('field0_new', pa.int64()) + >>> my_schema2 = my_schema.set(0, updated_field) + >>> my_schema2 + field0_new: int64 + field1: string + field2: fixed_size_binary[10] + field3: list + child 0, item: int32 .. _data.array: @@ -171,47 +190,69 @@ A simple way to create arrays is with ``pyarrow.array``, which is similar to the ``numpy.array`` function. By default PyArrow will infer the data type for you: -.. ipython:: python +.. 
code-block:: python - arr = pa.array([1, 2, None, 3]) - arr + >>> arr = pa.array([1, 2, None, 3]) + >>> arr + + [ + 1, + 2, + null, + 3 + ] But you may also pass a specific data type to override type inference: -.. ipython:: python +.. code-block:: python - pa.array([1, 2], type=pa.uint16()) + >>> pa.array([1, 2], type=pa.uint16()) + + [ + 1, + 2 + ] The array's ``type`` attribute is the corresponding piece of type metadata: -.. ipython:: python +.. code-block:: python - arr.type + >>> arr.type + DataType(int64) Each in-memory array has a known length and null count (which will be 0 if there are no null values): -.. ipython:: python +.. code-block:: python - len(arr) - arr.null_count + >>> len(arr) + 4 + >>> arr.null_count + 1 Scalar values can be selected with normal indexing. ``pyarrow.array`` converts ``None`` values to Arrow nulls; we return the special ``pyarrow.NA`` value for nulls: -.. ipython:: python +.. code-block:: python - arr[0] - arr[2] + >>> arr[0] + + >>> arr[2] + Arrow data is immutable, so values can be selected but not assigned. Arrays can be sliced without copying: -.. ipython:: python +.. code-block:: python - arr[1:3] + >>> arr[1:3] + + [ + 2, + null + ] None values and NAN handling ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -234,32 +275,49 @@ List arrays ``pyarrow.array`` is able to infer the type of simple nested data structures like lists: -.. ipython:: python +.. code-block:: python - nested_arr = pa.array([[], None, [1, 2], [None, 1]]) - print(nested_arr.type) + >>> nested_arr = pa.array([[], None, [1, 2], [None, 1]]) + >>> print(nested_arr.type) + list ListView arrays ~~~~~~~~~~~~~~~ ``pyarrow.array`` can create an alternate list type called ListView: -.. ipython:: python +.. 
code-block:: python - nested_arr = pa.array([[], None, [1, 2], [None, 1]], type=pa.list_view(pa.int64())) - print(nested_arr.type) + >>> nested_arr = pa.array([[], None, [1, 2], [None, 1]], type=pa.list_view(pa.int64())) + >>> print(nested_arr.type) + list_view ListView arrays have a different set of buffers than List arrays. The ListView array has both an offsets and sizes buffer, while a List array only has an offsets buffer. This allows for ListView arrays to specify out-of-order offsets: -.. ipython:: python - - values = [1, 2, 3, 4, 5, 6] - offsets = [4, 2, 0] - sizes = [2, 2, 2] - arr = pa.ListViewArray.from_arrays(offsets, sizes, values) - arr +.. code-block:: python + + >>> values = [1, 2, 3, 4, 5, 6] + >>> offsets = [4, 2, 0] + >>> sizes = [2, 2, 2] + >>> arr = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> arr + + [ + [ + 5, + 6 + ], + [ + 3, + 4 + ], + [ + 1, + 2 + ] + ] See the format specification for more details on :ref:`listview-layout`. @@ -269,39 +327,114 @@ Struct arrays ``pyarrow.array`` is able to infer the schema of a struct type from arrays of dictionaries: -.. ipython:: python - - pa.array([{'x': 1, 'y': True}, {'z': 3.4, 'x': 4}]) +.. code-block:: python + + >>> pa.array([{'x': 1, 'y': True}, {'z': 3.4, 'x': 4}]) + + -- is_valid: all not null + -- child 0 type: int64 + [ + 1, + 4 + ] + -- child 1 type: bool + [ + true, + null + ] + -- child 2 type: double + [ + null, + 3.4 + ] Struct arrays can be initialized from a sequence of Python dicts or tuples. For tuples, you must explicitly pass the type: -.. ipython:: python - - ty = pa.struct([('x', pa.int8()), - ('y', pa.bool_())]) - pa.array([{'x': 1, 'y': True}, {'x': 2, 'y': False}], type=ty) - pa.array([(3, True), (4, False)], type=ty) +.. code-block:: python + + >>> ty = pa.struct([('x', pa.int8()), + ... 
('y', pa.bool_())]) + >>> pa.array([{'x': 1, 'y': True}, {'x': 2, 'y': False}], type=ty) + + -- is_valid: all not null + -- child 0 type: int8 + [ + 1, + 2 + ] + -- child 1 type: bool + [ + true, + false + ] + >>> pa.array([(3, True), (4, False)], type=ty) + + -- is_valid: all not null + -- child 0 type: int8 + [ + 3, + 4 + ] + -- child 1 type: bool + [ + true, + false + ] When initializing a struct array, nulls are allowed both at the struct level and at the individual field level. If initializing from a sequence of Python dicts, a missing dict key is handled as a null value: -.. ipython:: python - - pa.array([{'x': 1}, None, {'y': None}], type=ty) +.. code-block:: python + + >>> pa.array([{'x': 1}, None, {'y': None}], type=ty) + + -- is_valid: + [ + true, + false, + true + ] + -- child 0 type: int8 + [ + 1, + 0, + null + ] + -- child 1 type: bool + [ + null, + false, + null + ] You can also construct a struct array from existing arrays for each of the struct's components. In this case, data storage will be shared with the individual arrays, and no copy is involved: -.. ipython:: python - - xs = pa.array([5, 6, 7], type=pa.int16()) - ys = pa.array([False, True, True]) - arr = pa.StructArray.from_arrays((xs, ys), names=('x', 'y')) - arr.type - arr +.. code-block:: python + + >>> xs = pa.array([5, 6, 7], type=pa.int16()) + >>> ys = pa.array([False, True, True]) + >>> arr = pa.StructArray.from_arrays((xs, ys), names=('x', 'y')) + >>> arr.type + StructType(struct) + >>> arr + + -- is_valid: all not null + -- child 0 type: int16 + [ + 5, + 6, + 7 + ] + -- child 1 type: bool + [ + false, + true, + true + ] Map arrays ~~~~~~~~~~ @@ -309,11 +442,34 @@ Map arrays Map arrays can be constructed from lists of lists of tuples (key-item pairs), but only if the type is explicitly passed into :meth:`array`: -.. ipython:: python - - data = [[('x', 1), ('y', 0)], [('a', 2), ('b', 45)]] - ty = pa.map_(pa.string(), pa.int64()) - pa.array(data, type=ty) +.. 
code-block:: python + + >>> data = [[('x', 1), ('y', 0)], [('a', 2), ('b', 45)]] + >>> ty = pa.map_(pa.string(), pa.int64()) + >>> pa.array(data, type=ty) + + [ + keys: + [ + "x", + "y" + ] + values: + [ + 1, + 0 + ], + keys: + [ + "a", + "b" + ] + values: + [ + 2, + 45 + ] + ] MapArrays can also be constructed from offset, key, and item arrays. Offsets represent the starting position of each map. Note that the :attr:`MapArray.keys` and :attr:`MapArray.items` @@ -321,13 +477,45 @@ properties give the *flattened* keys and items. To keep the keys and items assoc their row, use the :meth:`ListArray.from_arrays` constructor with the :attr:`MapArray.offsets` property. -.. ipython:: python +.. code-block:: python - arr = pa.MapArray.from_arrays([0, 2, 3], ['x', 'y', 'z'], [4, 5, 6]) - arr.keys - arr.items - pa.ListArray.from_arrays(arr.offsets, arr.keys) - pa.ListArray.from_arrays(arr.offsets, arr.items) + >>> arr = pa.MapArray.from_arrays([0, 2, 3], ['x', 'y', 'z'], [4, 5, 6]) + >>> arr.keys + + [ + "x", + "y", + "z" + ] + >>> arr.items + + [ + 4, + 5, + 6 + ] + >>> pa.ListArray.from_arrays(arr.offsets, arr.keys) + + [ + [ + "x", + "y" + ], + [ + "z" + ] + ] + >>> pa.ListArray.from_arrays(arr.offsets, arr.items) + + [ + [ + 4, + 5 + ], + [ + 6 + ] + ] Union arrays ~~~~~~~~~~~~ @@ -341,28 +529,76 @@ as the resulting union array. They are adjuncted with a ``int8`` "types" array that tells, for each value, from which child array it must be selected: -.. ipython:: python - - xs = pa.array([5, 6, 7]) - ys = pa.array([False, False, True]) - types = pa.array([0, 1, 1], type=pa.int8()) - union_arr = pa.UnionArray.from_sparse(types, [xs, ys]) - union_arr.type - union_arr +.. 
code-block:: python + + >>> xs = pa.array([5, 6, 7]) + >>> ys = pa.array([False, False, True]) + >>> types = pa.array([0, 1, 1], type=pa.int8()) + >>> union_arr = pa.UnionArray.from_sparse(types, [xs, ys]) + >>> union_arr.type + SparseUnionType(sparse_union<0: int64=0, 1: bool=1>) + >>> union_arr + + -- is_valid: all not null + -- type_ids: [ + 0, + 1, + 1 + ] + -- child 0 type: int64 + [ + 5, + 6, + 7 + ] + -- child 1 type: bool + [ + false, + false, + true + ] In a dense union array, you also pass, in addition to the ``int8`` "types" array, a ``int32`` "offsets" array that tells, for each value, at each offset in the selected child array it can be found: -.. ipython:: python - - xs = pa.array([5, 6, 7]) - ys = pa.array([False, True]) - types = pa.array([0, 1, 1, 0, 0], type=pa.int8()) - offsets = pa.array([0, 0, 1, 1, 2], type=pa.int32()) - union_arr = pa.UnionArray.from_dense(types, offsets, [xs, ys]) - union_arr.type - union_arr +.. code-block:: python + + >>> xs = pa.array([5, 6, 7]) + >>> ys = pa.array([False, True]) + >>> types = pa.array([0, 1, 1, 0, 0], type=pa.int8()) + >>> offsets = pa.array([0, 0, 1, 1, 2], type=pa.int32()) + >>> union_arr = pa.UnionArray.from_dense(types, offsets, [xs, ys]) + >>> union_arr.type + DenseUnionType(dense_union<0: int64=0, 1: bool=1>) + >>> union_arr + + -- is_valid: all not null + -- type_ids: [ + 0, + 1, + 1, + 0, + 0 + ] + -- value_offsets: [ + 0, + 0, + 1, + 1, + 2 + ] + -- child 0 type: int64 + [ + 5, + 6, + 7 + ] + -- child 1 type: bool + [ + false, + true + ] .. _data.dictionary: @@ -380,28 +616,75 @@ they appear in C++ and Python is slightly different. We define a special :class:`~.DictionaryArray` type with a corresponding dictionary type. Let's consider an example: -.. ipython:: python - - indices = pa.array([0, 1, 0, 1, 2, 0, None, 2]) - dictionary = pa.array(['foo', 'bar', 'baz']) - - dict_array = pa.DictionaryArray.from_arrays(indices, dictionary) - dict_array +.. 
code-block:: python + + >>> indices = pa.array([0, 1, 0, 1, 2, 0, None, 2]) + >>> dictionary = pa.array(['foo', 'bar', 'baz']) + >>> + >>> dict_array = pa.DictionaryArray.from_arrays(indices, dictionary) + >>> dict_array + + ... + -- dictionary: + [ + "foo", + "bar", + "baz" + ] + -- indices: + [ + 0, + 1, + 0, + 1, + 2, + 0, + null, + 2 + ] Here we have: -.. ipython:: python - - print(dict_array.type) - dict_array.indices - dict_array.dictionary +.. code-block:: python + + >>> print(dict_array.type) + dictionary + >>> dict_array.indices + + [ + 0, + 1, + 0, + 1, + 2, + 0, + null, + 2 + ] + >>> dict_array.dictionary + + [ + "foo", + "bar", + "baz" + ] When using :class:`~.DictionaryArray` with pandas, the analogue is ``pandas.Categorical`` (more on this later): -.. ipython:: python +.. code-block:: python - dict_array.to_pandas() + >>> dict_array.to_pandas() + 0 foo + 1 bar + 2 foo + 3 bar + 4 baz + 5 foo + 6 NaN + 7 baz + dtype: category + Categories (3, str): ['foo', 'bar', 'baz'] .. _data.record_batch: @@ -411,32 +694,50 @@ Record Batches A **Record Batch** in Apache Arrow is a collection of equal-length array instances. Let's consider a collection of arrays: -.. ipython:: python +.. code-block:: python - data = [ - pa.array([1, 2, 3, 4]), - pa.array(['foo', 'bar', 'baz', None]), - pa.array([True, None, False, True]) - ] + >>> data = [ + ... pa.array([1, 2, 3, 4]), + ... pa.array(['foo', 'bar', 'baz', None]), + ... pa.array([True, None, False, True]) + ... ] A record batch can be created from this list of arrays using ``RecordBatch.from_arrays``: -.. ipython:: python - - batch = pa.RecordBatch.from_arrays(data, ['f0', 'f1', 'f2']) - batch.num_columns - batch.num_rows - batch.schema - - batch[1] +.. 
code-block:: python + + >>> batch = pa.RecordBatch.from_arrays(data, ['f0', 'f1', 'f2']) + >>> batch.num_columns + 3 + >>> batch.num_rows + 4 + >>> batch.schema + f0: int64 + f1: string + f2: bool + >>> + >>> batch[1] + + [ + "foo", + "bar", + "baz", + null + ] A record batch can be sliced without copying memory like an array: -.. ipython:: python +.. code-block:: python - batch2 = batch.slice(1, 3) - batch2[1] + >>> batch2 = batch.slice(1, 3) + >>> batch2[1] + + [ + "bar", + "baz", + null + ] .. _data.table: @@ -453,40 +754,96 @@ object makes this efficient without requiring additional memory copying. Considering the record batch we created above, we can create a Table containing one or more copies of the batch using ``Table.from_batches``: -.. ipython:: python - - batches = [batch] * 5 - table = pa.Table.from_batches(batches) - table - table.num_rows +.. code-block:: python + + >>> batches = [batch] * 5 + >>> table = pa.Table.from_batches(batches) + >>> table + pyarrow.Table + f0: int64 + f1: string + f2: bool + ---- + f0: [[1,2,3,4],[1,2,3,4],...,[1,2,3,4],[1,2,3,4]] + f1: [["foo","bar","baz",null],...,["foo","bar","baz",null]] + f2: [[true,null,false,true],...,[true,null,false,true]] + >>> table.num_rows + 20 The table's columns are instances of :class:`~.ChunkedArray`, which is a container for one or more arrays of the same type. -.. ipython:: python - - c = table[0] - c - c.num_chunks - c.chunk(0) +.. code-block:: python + + >>> c = table[0] + >>> c + + [ + [ + 1, + 2, + 3, + 4 + ], + ... + [ + 1, + 2, + 3, + 4 + ] + ] + >>> c.num_chunks + 5 + >>> c.chunk(0) + + [ + 1, + 2, + 3, + 4 + ] As you'll see in the :ref:`pandas section `, we can convert these objects to contiguous NumPy arrays for use in pandas: -.. ipython:: python - - c.to_pandas() +.. 
code-block:: python + + >>> c.to_pandas() + 0 1 + 1 2 + 2 3 + 3 4 + 4 1 + 5 2 + 6 3 + 7 4 + 8 1 + 9 2 + 10 3 + 11 4 + 12 1 + 13 2 + 14 3 + 15 4 + 16 1 + 17 2 + 18 3 + 19 4 + Name: f0, dtype: int64 Multiple tables can also be concatenated together to form a single table using ``pyarrow.concat_tables``, if the schemas are equal: -.. ipython:: python +.. code-block:: python - tables = [table] * 2 - table_all = pa.concat_tables(tables) - table_all.num_rows - c = table_all[0] - c.num_chunks + >>> tables = [table] * 2 + >>> table_all = pa.concat_tables(tables) + >>> table_all.num_rows + 40 + >>> c = table_all[0] + >>> c.num_chunks + 10 This is similar to ``Table.from_batches``, but uses tables as input instead of record batches. Record batches can be made into tables, but not the other way @@ -508,21 +865,23 @@ Note that this metadata is preserved in :ref:`ipc` processes. To customize the schema metadata of an existing table you can use :meth:`Table.replace_schema_metadata`: -.. ipython:: python +.. code-block:: python - table.schema.metadata # empty - table = table.replace_schema_metadata({"f0": "First dose"}) - table.schema.metadata + >>> table.schema.metadata + >>> table = table.replace_schema_metadata({"f0": "First dose"}) + >>> table.schema.metadata + {b'f0': b'First dose'} To customize the metadata of the field from the table schema you can use :meth:`Field.with_metadata`: -.. ipython:: python +.. code-block:: python - field_f1 = table.schema.field("f1") - field_f1.metadata # empty - field_f1 = field_f1.with_metadata({"f1": "Second dose"}) - field_f1.metadata + >>> field_f1 = table.schema.field("f1") + >>> field_f1.metadata + >>> field_f1 = field_f1.with_metadata({"f1": "Second dose"}) + >>> field_f1.metadata + {b'f1': b'Second dose'} Both options create a shallow copy of the data and do not in fact change the Schema which is immutable. 
To change the metadata in the schema of the table @@ -531,17 +890,20 @@ we created a new object when calling :meth:`Table.replace_schema_metadata`. To change the metadata of the field in the schema we would need to define a new schema and cast the data to this schema: -.. ipython:: python - - my_schema2 = pa.schema([ - pa.field('f0', pa.int64(), metadata={"name": "First dose"}), - pa.field('f1', pa.string(), metadata={"name": "Second dose"}), - pa.field('f2', pa.bool_())], - metadata={"f2": "booster"}) - t2 = table.cast(my_schema2) - t2.schema.field("f0").metadata - t2.schema.field("f1").metadata - t2.schema.metadata +.. code-block:: python + + >>> my_schema2 = pa.schema([ + ... pa.field('f0', pa.int64(), metadata={"name": "First dose"}), + ... pa.field('f1', pa.string(), metadata={"name": "Second dose"}), + ... pa.field('f2', pa.bool_())], + ... metadata={"f2": "booster"}) + >>> t2 = table.cast(my_schema2) + >>> t2.schema.field("f0").metadata + {b'name': b'First dose'} + >>> t2.schema.field("f1").metadata + {b'name': b'Second dose'} + >>> t2.schema.metadata + {b'f2': b'booster'} Metadata key and value pairs are ``std::string`` objects in the C++ implementation and so they are bytes objects (``b'...'``) in Python. @@ -551,22 +913,29 @@ Record Batch Readers Many functions in PyArrow either return or take as an argument a :class:`RecordBatchReader`. It can be used like any iterable of record batches, but also provides their common -schema without having to get any of the batches.:: +schema without having to get any of the batches. + +.. code-block:: python >>> schema = pa.schema([('x', pa.int64())]) + >>> >>> def iter_record_batches(): ... for i in range(2): ... yield pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], schema=schema) + >>> >>> reader = pa.RecordBatchReader.from_batches(schema, iter_record_batches()) >>> print(reader.schema) - pyarrow.Schema x: int64 >>> for batch in reader: ... 
print(batch) pyarrow.RecordBatch x: int64 + ---- + x: [1,2,3] pyarrow.RecordBatch x: int64 + ---- + x: [1,2,3] It can also be sent between languages using the :ref:`C stream interface `. @@ -584,31 +953,33 @@ to efficiently convert tabular columnar data into a tensor. Data types supported in this conversion are unsigned, signed integer and float types. Currently only column-major conversion is supported. - >>> import pyarrow as pa - >>> arr1 = [1, 2, 3, 4, 5] - >>> arr2 = [10, 20, 30, 40, 50] - >>> batch = pa.RecordBatch.from_arrays( +.. code-block:: python + + >>> arr1 = [1, 2, 3, 4, 5] + >>> arr2 = [10, 20, 30, 40, 50] + >>> batch = pa.RecordBatch.from_arrays( ... [ ... pa.array(arr1, type=pa.uint16()), ... pa.array(arr2, type=pa.int16()), ... ], ["a", "b"] ... ) - >>> batch.to_tensor() + >>> batch.to_tensor() type: int32 - shape: (9, 2) - strides: (4, 36) - >>> batch.to_tensor().to_numpy() + shape: (5, 2) + strides: (8, 4) + >>> batch.to_tensor().to_numpy() array([[ 1, 10], - [ 2, 20], - [ 3, 30], - [ 4, 40], - [ 5, 50]], dtype=int32) + [ 2, 20], + [ 3, 30], + [ 4, 40], + [ 5, 50]], dtype=int32) With ``null_to_nan`` set to ``True`` one can also convert data with nulls. They will be converted to ``NaN``: - >>> import pyarrow as pa +.. code-block:: python + >>> batch = pa.record_batch( ... [ ... pa.array([1, 2, 3, 4, None], type=pa.int32()), @@ -617,7 +988,7 @@ nulls. They will be converted to ``NaN``: ... ) >>> batch.to_tensor(null_to_nan=True).to_numpy() array([[ 1., 10.], - [ 2., 20.], - [ 3., 30.], - [ 4., 40.], - [nan, nan]]) + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) diff --git a/docs/source/python/dataset.rst b/docs/source/python/dataset.rst index de4ff7be4c79..4e18ea0a51cd 100644 --- a/docs/source/python/dataset.rst +++ b/docs/source/python/dataset.rst @@ -15,17 +15,6 @@ .. specific language governing permissions and limitations .. under the License. -.. 
ipython:: python - :suppress: - - # set custom tmp working directory for files that create data - import os - import tempfile - - orig_working_dir = os.getcwd() - temp_working_dir = tempfile.mkdtemp(prefix="pyarrow-") - os.chdir(temp_working_dir) - .. currentmodule:: pyarrow.dataset .. _dataset: @@ -64,23 +53,24 @@ Reading Datasets For the examples below, let's create a small dataset consisting of a directory with two parquet files: -.. ipython:: python - - import tempfile - import pathlib - import pyarrow as pa - import pyarrow.parquet as pq - import numpy as np - - base = pathlib.Path(tempfile.mkdtemp(prefix="pyarrow-")) - (base / "parquet_dataset").mkdir(exist_ok=True) - - # creating an Arrow Table - table = pa.table({'a': range(10), 'b': np.random.randn(10), 'c': [1, 2] * 5}) +.. code-block:: python - # writing it into two parquet files - pq.write_table(table.slice(0, 5), base / "parquet_dataset/data1.parquet") - pq.write_table(table.slice(5, 10), base / "parquet_dataset/data2.parquet") + >>> import tempfile + >>> import pathlib + >>> import pyarrow as pa + >>> import pyarrow.parquet as pq + >>> import numpy as np + >>> + >>> base = pathlib.Path(tempfile.mkdtemp(prefix="pyarrow-")) + >>> (base / "parquet_dataset").mkdir(exist_ok=True) + >>> + >>> # creating an Arrow Table + >>> np.random.seed(0) + >>> table = pa.table({'a': range(10), 'b': np.random.randn(10), 'c': [1, 2] * 5}) + >>> + >>> # writing it into two parquet files + >>> pq.write_table(table.slice(0, 5), base / "parquet_dataset/data1.parquet") + >>> pq.write_table(table.slice(5, 10), base / "parquet_dataset/data2.parquet") Dataset discovery ~~~~~~~~~~~~~~~~~ @@ -88,11 +78,12 @@ Dataset discovery A :class:`Dataset` object can be created with the :func:`dataset` function. We can pass it the path to the directory containing the data files: -.. ipython:: python +.. 
code-block:: python - import pyarrow.dataset as ds - dataset = ds.dataset(base / "parquet_dataset", format="parquet") - dataset + >>> import pyarrow.dataset as ds + >>> dataset = ds.dataset(base / "parquet_dataset", format="parquet") + >>> dataset + In addition to searching a base directory, :func:`dataset` accepts a path to a single file or a list of file paths. @@ -100,25 +91,48 @@ single file or a list of file paths. Creating a :class:`Dataset` object does not begin reading the data itself. If needed, it only crawls the directory to find all the files: -.. ipython:: python +.. code-block:: python - dataset.files + >>> dataset.files + ['.../parquet_dataset/data1.parquet', '.../parquet_dataset/data2.parquet'] ... and infers the dataset's schema (by default from the first file): -.. ipython:: python +.. code-block:: python - print(dataset.schema.to_string(show_field_metadata=False)) + >>> print(dataset.schema.to_string(show_field_metadata=False)) + a: int64 + b: double + c: int64 Using the :meth:`Dataset.to_table` method we can read the dataset (or a portion of it) into a pyarrow Table (note that depending on the size of your dataset this can require a lot of memory, see below on filtering / iterative loading): -.. ipython:: python +.. code-block:: python - dataset.to_table() - # converting to pandas to see the contents of the scanned table - dataset.to_table().to_pandas() + >>> dataset.to_table() + pyarrow.Table + a: int64 + b: double + c: int64 + ---- + a: [[0,1,2,3,4],[5,6,7,8,9]] + b: [[...],[...]] + c: [[1,2,1,2,1],[2,1,2,1,2]] + >>> # converting to pandas to see the contents of the scanned table + >>> dataset.to_table().to_pandas() + a b c + 0 0 1.764052 1 + 1 1 0.400157 2 + 2 2 0.978738 1 + 3 3 2.240893 2 + 4 4 1.867558 1 + 5 5 -0.977278 2 + 6 6 0.950088 1 + 7 7 -0.151357 2 + 8 8 -0.103219 1 + 9 9 0.410599 2 Reading different file formats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -130,19 +144,25 @@ supported; more formats are planned in the future. 
If we save the table as Feather files instead of Parquet files: -.. ipython:: python - - import pyarrow.feather as feather +.. code-block:: python - feather.write_feather(table, base / "data.feather") + >>> import pyarrow.feather as feather + >>> + >>> feather.write_feather(table, base / "data.feather") …then we can read the Feather file using the same functions, but with specifying ``format="feather"``: -.. ipython:: python +.. code-block:: python - dataset = ds.dataset(base / "data.feather", format="feather") - dataset.to_table().to_pandas().head() + >>> dataset = ds.dataset(base / "data.feather", format="feather") + >>> dataset.to_table().to_pandas().head() + a b c + 0 0 1.764052 1 + 1 1 0.400157 2 + 2 2 0.978738 1 + 3 3 2.240893 2 + 4 4 1.867558 1 Customizing file formats ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -172,19 +192,40 @@ To avoid reading all data when only needing a subset, the ``columns`` and The ``columns`` keyword can be used to only read the specified columns: -.. ipython:: python +.. code-block:: python - dataset = ds.dataset(base / "parquet_dataset", format="parquet") - dataset.to_table(columns=['a', 'b']).to_pandas() + >>> dataset = ds.dataset(base / "parquet_dataset", format="parquet") + >>> dataset.to_table(columns=['a', 'b']).to_pandas() + a b + 0 0 1.764052 + 1 1 0.400157 + 2 2 0.978738 + 3 3 2.240893 + 4 4 1.867558 + 5 5 -0.977278 + 6 6 0.950088 + 7 7 -0.151357 + 8 8 -0.103219 + 9 9 0.410599 With the ``filter`` keyword, rows which do not match the filter predicate will not be included in the returned table. The keyword expects a boolean :class:`Expression` referencing at least one of the columns: -.. ipython:: python +.. 
code-block:: python - dataset.to_table(filter=ds.field('a') >= 7).to_pandas() - dataset.to_table(filter=ds.field('c') == 2).to_pandas() + >>> dataset.to_table(filter=ds.field('a') >= 7).to_pandas() + a b c + 0 7 -0.151357 2 + 1 8 -0.103219 1 + 2 9 0.410599 2 + >>> dataset.to_table(filter=ds.field('c') == 2).to_pandas() + a b c + 0 1 0.400157 2 + 1 3 2.240893 2 + 2 5 -0.977278 2 + 3 7 -0.151357 2 + 4 9 0.410599 2 The easiest way to construct those :class:`Expression` objects is by using the :func:`field` helper function. Any column - not just partition columns - can be @@ -193,11 +234,18 @@ referenced using the :func:`field` function (which creates a including the comparisons (equal, larger/less than, etc), set membership testing, and boolean combinations (``&``, ``|``, ``~``): -.. ipython:: python +.. code-block:: python - ds.field('a') != 3 - ds.field('a').isin([1, 2, 3]) - (ds.field('a') > ds.field('b')) & (ds.field('b') > 1) + >>> ds.field('a') != 3 + <pyarrow.compute.Expression (a != 3)> + >>> ds.field('a').isin([1, 2, 3]) + <pyarrow.compute.Expression is_in(a, {value_set=int64:[ + 1, + 2, + 3 + ], null_matching_behavior=MATCH})> + >>> (ds.field('a') > ds.field('b')) & (ds.field('b') > 1) + <pyarrow.compute.Expression ((a > b) and (b > 1))> Note that :class:`Expression` objects can **not** be combined by python logical operators ``and``, ``or`` and ``not``. @@ -213,25 +261,37 @@ In this case, we pass it a dictionary with the keys being the resulting column names and the values the expression that is used to construct the column values: -.. ipython:: python +.. code-block:: python - projection = { - "a_renamed": ds.field("a"), - "b_as_float32": ds.field("b").cast("float32"), - "c_1": ds.field("c") == 1, - } - dataset.to_table(columns=projection).to_pandas().head() + >>> projection = { + ... "a_renamed": ds.field("a"), + ... "b_as_float32": ds.field("b").cast("float32"), + ... "c_1": ds.field("c") == 1, + ... 
} + >>> dataset.to_table(columns=projection).to_pandas().head() + a_renamed b_as_float32 c_1 + 0 0 1.764052 True + 1 1 0.400157 False + 2 2 0.978738 True + 3 3 2.240893 False + 4 4 1.867558 True The dictionary also determines the column selection (only the keys in the dictionary will be present as columns in the resulting table). If you want to include a derived column in *addition* to the existing columns, you can build up the dictionary from the dataset schema: -.. ipython:: python +.. code-block:: python - projection = {col: ds.field(col) for col in dataset.schema.names} - projection.update({"b_large": ds.field("b") > 1}) - dataset.to_table(columns=projection).to_pandas().head() + >>> projection = {col: ds.field(col) for col in dataset.schema.names} + >>> projection.update({"b_large": ds.field("b") > 1}) + >>> dataset.to_table(columns=projection).to_pandas().head() + a b c b_large + 0 0 1.764052 1 True + 1 1 0.400157 2 False + 2 2 0.978738 1 False + 3 3 2.240893 2 True + 4 4 1.867558 1 True Reading partitioned data @@ -269,12 +329,12 @@ in Apache Hive. Let's create a small partitioned dataset. The :func:`~pyarrow.parquet.write_to_dataset` function can write such hive-like partitioned datasets. -.. ipython:: python +.. code-block:: python - table = pa.table({'a': range(10), 'b': np.random.randn(10), 'c': [1, 2] * 5, - 'part': ['a'] * 5 + ['b'] * 5}) - pq.write_to_dataset(table, "parquet_dataset_partitioned", - partition_cols=['part']) + >>> table = pa.table({'a': range(10), 'b': np.random.randn(10), 'c': [1, 2] * 5, + ... 'part': ['a'] * 5 + ['b'] * 5}) + >>> pq.write_to_dataset(table, "parquet_dataset_partitioned", + ... partition_cols=['part']) The above created a directory with two subdirectories ("part=a" and "part=b"), and the Parquet files written in those directories no longer include the "part" @@ -283,25 +343,36 @@ column. 
Reading this dataset with :func:`dataset`, we now specify that the dataset should use a hive-like partitioning scheme with the ``partitioning`` keyword: -.. ipython:: python +.. code-block:: python - dataset = ds.dataset("parquet_dataset_partitioned", format="parquet", - partitioning="hive") - dataset.files + >>> dataset = ds.dataset("parquet_dataset_partitioned", format="parquet", + ... partitioning="hive") + >>> dataset.files + ['parquet_dataset_partitioned/part=a/...-0.parquet', 'parquet_dataset_partitioned/part=b/...-0.parquet'] Although the partition fields are not included in the actual Parquet files, they will be added back to the resulting table when scanning this dataset: -.. ipython:: python +.. code-block:: python - dataset.to_table().to_pandas().head(3) + >>> dataset.to_table().to_pandas().head(3) + a b c part + 0 0 0.144044 1 a + 1 1 1.454274 2 a + 2 2 0.761038 1 a We can now filter on the partition keys, which avoids loading files altogether if they do not match the filter: -.. ipython:: python +.. code-block:: python - dataset.to_table(filter=ds.field("part") == "b").to_pandas() + >>> dataset.to_table(filter=ds.field("part") == "b").to_pandas() + a b c part + 0 5 0.333674 2 b + 1 6 1.494079 1 b + 2 7 -0.205158 2 b + 3 8 0.313068 1 b + 4 9 -0.854096 2 b Different partitioning schemes @@ -316,11 +387,11 @@ using the :func:`partitioning` function. For example: .. code-block:: python - part = ds.partitioning( - pa.schema([("year", pa.int16()), ("month", pa.int8()), ("day", pa.int32())]), - flavor="hive" - ) - dataset = ds.dataset(..., partitioning=part) + >>> part = ds.partitioning( # doctest: +SKIP + ... pa.schema([("year", pa.int16()), ("month", pa.int8()), ("day", pa.int32())]), + ... flavor="hive" + ... 
) + >>> dataset = ds.dataset(..., partitioning=part) # doctest: +SKIP "Directory partitioning" is also supported, where the segments in the file path represent the values of the partition keys without including the name (the @@ -332,7 +403,7 @@ when constructing a directory partitioning: .. code-block:: python - part = ds.partitioning(field_names=["year", "month", "day"]) + >>> part = ds.partitioning(field_names=["year", "month", "day"]) # doctest: +SKIP Directory partitioning also supports providing a full schema rather than inferring types from file paths. @@ -350,17 +421,16 @@ specifying a S3 path: .. code-block:: python - dataset = ds.dataset("s3://arrow-datasets/nyc-taxi/") + >>> dataset = ds.dataset("s3://arrow-datasets/nyc-taxi/") # doctest: +SKIP Typically, you will want to customize the connection parameters, and then a file system object can be created and passed to the ``filesystem`` keyword: .. code-block:: python - from pyarrow import fs - - s3 = fs.S3FileSystem(region="us-east-1") - dataset = ds.dataset("arrow-datasets/nyc-taxi/", filesystem=s3) + >>> from pyarrow import fs + >>> s3 = fs.S3FileSystem(region="us-east-1") + >>> dataset = ds.dataset("arrow-datasets/nyc-taxi/", filesystem=s3) # doctest: +SKIP The currently available classes are :class:`~pyarrow.fs.S3FileSystem` and :class:`~pyarrow.fs.HadoopFileSystem`. See the :ref:`filesystem` docs for more @@ -377,11 +447,9 @@ useful for testing or benchmarking. .. code-block:: python - from pyarrow import fs - - # By default, MinIO will listen for unencrypted HTTP traffic. - minio = fs.S3FileSystem(scheme="http", endpoint_override="localhost:9000") - dataset = ds.dataset("arrow-datasets/nyc-taxi/", filesystem=minio) + >>> # By default, MinIO will listen for unencrypted HTTP traffic. 
+ >>> minio = fs.S3FileSystem(scheme="http", endpoint_override="localhost:9000") + >>> dataset = ds.dataset("arrow-datasets/nyc-taxi/", filesystem=minio) # doctest: +SKIP Working with Parquet Datasets @@ -401,7 +469,7 @@ dataset with a ``_metadata`` file: .. code-block:: python - dataset = ds.parquet_dataset("/path/to/dir/_metadata") + >>> dataset = ds.parquet_dataset("/path/to/dir/_metadata") # doctest: +SKIP By default, the constructed :class:`Dataset` object for Parquet datasets maps each fragment to a single Parquet file. If you want fragments mapping to each @@ -410,8 +478,8 @@ the fragments: .. code-block:: python - fragments = list(dataset.get_fragments()) - fragments[0].split_by_row_group() + >>> fragments = list(dataset.get_fragments()) # doctest: +SKIP + >>> fragments[0].split_by_row_group() # doctest: +SKIP This method returns a list of new Fragments mapping to each row group of the original Fragment (Parquet file). Both ``get_fragments()`` and @@ -432,35 +500,44 @@ automatic discovery or inference. For the example here, we are going to use a dataset where the file names contain additional partitioning information: -.. ipython:: python +.. 
code-block:: python - # creating a dummy dataset: directory with two files - table = pa.table({'col1': range(3), 'col2': np.random.randn(3)}) - (base / "parquet_dataset_manual").mkdir(exist_ok=True) - pq.write_table(table, base / "parquet_dataset_manual" / "data_2018.parquet") - pq.write_table(table, base / "parquet_dataset_manual" / "data_2019.parquet") + >>> # creating a dummy dataset: directory with two files + >>> table = pa.table({'col1': range(3), 'col2': np.random.randn(3)}) + >>> (base / "parquet_dataset_manual").mkdir(exist_ok=True) + >>> pq.write_table(table, base / "parquet_dataset_manual" / "data_2018.parquet") + >>> pq.write_table(table, base / "parquet_dataset_manual" / "data_2019.parquet") To create a Dataset from a list of files, we need to specify the paths, schema, format, filesystem, and partition expressions manually: -.. ipython:: python - - from pyarrow import fs - - schema = pa.schema([("year", pa.int64()), ("col1", pa.int64()), ("col2", pa.float64())]) +.. code-block:: python - dataset = ds.FileSystemDataset.from_paths( - ["data_2018.parquet", "data_2019.parquet"], schema=schema, format=ds.ParquetFileFormat(), - filesystem=fs.SubTreeFileSystem(str(base / "parquet_dataset_manual"), fs.LocalFileSystem()), - partitions=[ds.field('year') == 2018, ds.field('year') == 2019]) + >>> schema = pa.schema([("year", pa.int64()), ("col1", pa.int64()), ("col2", pa.float64())]) + >>> + >>> dataset = ds.FileSystemDataset.from_paths( + ... ["data_2018.parquet", "data_2019.parquet"], schema=schema, format=ds.ParquetFileFormat(), + ... filesystem=fs.SubTreeFileSystem(str(base / "parquet_dataset_manual"), fs.LocalFileSystem()), + ... partitions=[ds.field('year') == 2018, ds.field('year') == 2019]) Since we specified the "partition expressions" for our files, this information is materialized as columns when reading the data and can be used for filtering: -.. ipython:: python +.. 
code-block:: python - dataset.to_table().to_pandas() - dataset.to_table(filter=ds.field('year') == 2019).to_pandas() + >>> dataset.to_table().to_pandas() + year col1 col2 + 0 2018 0 -2.552990 + 1 2018 1 0.653619 + 2 2018 2 0.864436 + 3 2019 0 -2.552990 + 4 2019 1 0.653619 + 5 2019 2 0.864436 + >>> dataset.to_table(filter=ds.field('year') == 2019).to_pandas() + year col1 col2 + 0 2019 0 -2.552990 + 1 2019 1 0.653619 + 2 2019 2 0.864436 Another benefit of manually listing the files is that the order of the files controls the order of the data. When performing an ordered read (or a read to @@ -481,16 +558,16 @@ The easiest way to do this is to use the method :meth:`Dataset.to_batches`. Thi method returns an iterator of record batches. For example, we can use this method to calculate the average of a column without loading the entire column into memory: -.. ipython:: python - - import pyarrow.compute as pc +.. code-block:: python - col2_sum = 0 - count = 0 - for batch in dataset.to_batches(columns=["col2"], filter=~ds.field("col2").is_null()): - col2_sum += pc.sum(batch.column("col2")).as_py() - count += batch.num_rows - mean_a = col2_sum/count + >>> import pyarrow.compute as pc + >>> + >>> col2_sum = 0 + >>> count = 0 + >>> for batch in dataset.to_batches(columns=["col2"], filter=~ds.field("col2").is_null()): + ... col2_sum += pc.sum(batch.column("col2")).as_py() + ... count += batch.num_rows + >>> mean_a = col2_sum/count Customizing the batch size ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -537,10 +614,10 @@ you want to partition your data or you need to write a large amount of data. A basic dataset write is similar to writing a table except that you specify a directory instead of a filename. -.. ipython:: python +.. 
code-block:: python - table = pa.table({"a": range(10), "b": np.random.randn(10), "c": [1, 2] * 5}) - ds.write_dataset(table, "sample_dataset", format="parquet") + >>> table = pa.table({"a": range(10), "b": np.random.randn(10), "c": [1, 2] * 5}) + >>> ds.write_dataset(table, "sample_dataset", format="parquet") The above example will create a single file named part-0.parquet in our sample_dataset directory. @@ -560,12 +637,12 @@ This uses the same kind of partitioning objects we used for reading datasets. T our above data out to a partitioned directory we only need to specify how we want the dataset to be partitioned. For example: -.. ipython:: python +.. code-block:: python - part = ds.partitioning( - pa.schema([("c", pa.int16())]), flavor="hive" - ) - ds.write_dataset(table, "partitioned_dataset", format="parquet", partitioning=part) + >>> part = ds.partitioning( + ... pa.schema([("c", pa.int16())]), flavor="hive" + ... ) + >>> ds.write_dataset(table, "partitioned_dataset", format="parquet", partitioning=part) This will create two files. Half our data will be in the dataset_root/c=1 directory and the other half will be in the dataset_root/c=2 directory. @@ -688,23 +765,23 @@ you may not be able to load everything into a single in-memory table. Fortunate simple, for example, to repartition a large dataset without loading the entire dataset into memory: -.. ipython:: python - - old_part = ds.partitioning( - pa.schema([("c", pa.int16())]), flavor="hive" - ) - new_part = ds.partitioning( - pa.schema([("c", pa.int16())]), flavor=None - ) - input_dataset = ds.dataset("partitioned_dataset", partitioning=old_part) - # A scanner can act as an iterator of record batches but you could also receive - # data from the network (e.g. via flight), from your own scanning, or from any - # other method that yields record batches. In addition, you can pass a dataset - # into write_dataset directly but this method is useful if you want to customize - # the scanner (e.g. 
to filter the input dataset or set a maximum batch size) - scanner = input_dataset.scanner() +.. code-block:: python - ds.write_dataset(scanner, "repartitioned_dataset", format="parquet", partitioning=new_part) + >>> old_part = ds.partitioning( + ... pa.schema([("c", pa.int16())]), flavor="hive" + ... ) + >>> new_part = ds.partitioning( + ... pa.schema([("c", pa.int16())]), flavor=None + ... ) + >>> input_dataset = ds.dataset("partitioned_dataset", partitioning=old_part) + >>> # A scanner can act as an iterator of record batches but you could also receive + >>> # data from the network (e.g. via flight), from your own scanning, or from any + >>> # other method that yields record batches. In addition, you can pass a dataset + >>> # into write_dataset directly but this method is useful if you want to customize + >>> # the scanner (e.g. to filter the input dataset or set a maximum batch size) + >>> scanner = input_dataset.scanner() + >>> + >>> ds.write_dataset(scanner, "repartitioned_dataset", format="parquet", partitioning=new_part) After the above example runs our data will be in dataset_root/1 and dataset_root/2 directories. In this simple example we are not changing the structure of the data @@ -722,17 +799,35 @@ call. For simple datasets it may be possible to know which files will be create larger or partitioned datasets it is not so easy. The ``file_visitor`` keyword can be used to supply a visitor that will be called as each file is created: -.. ipython:: python +.. code-block:: python - def file_visitor(written_file): - print(f"path={written_file.path}") - print(f"size={written_file.size} bytes") - print(f"metadata={written_file.metadata}") + >>> def file_visitor(written_file): + ... print(f"path={written_file.path}") + ... print(f"size={written_file.size} bytes") + ... print(f"metadata={written_file.metadata}") -.. ipython:: python +.. 
code-block:: python - ds.write_dataset(table, "dataset_visited", format="parquet", partitioning=part, - file_visitor=file_visitor) + >>> ds.write_dataset(table, "dataset_visited", format="parquet", partitioning=part, + ... file_visitor=file_visitor) + path=dataset_visited/c=.../part-0.parquet + size=... bytes + metadata= + created_by: parquet-cpp-arrow version ... + num_columns: 2 + num_rows: 5 + num_row_groups: 1 + format_version: 2.6 + serialized_size: 0 + path=dataset_visited/c=.../part-0.parquet + size=... bytes + metadata= + created_by: parquet-cpp-arrow version ... + num_columns: 2 + num_rows: 5 + num_row_groups: 1 + format_version: 2.6 + serialized_size: 0 This will allow you to collect the filenames that belong to the dataset and store them elsewhere which can be useful when you want to avoid scanning directories the next time you need to read @@ -746,23 +841,10 @@ In addition to the common options shared by all formats there are also format sp that are unique to a particular format. For example, to allow truncated timestamps while writing Parquet files: -.. ipython:: python - - parquet_format = ds.ParquetFileFormat() - write_options = parquet_format.make_write_options(allow_truncated_timestamps=True) - ds.write_dataset(table, "sample_dataset2", format="parquet", partitioning=part, - file_options=write_options) - - -.. ipython:: python - :suppress: - - # clean-up custom working directory - import os - import shutil +.. code-block:: python - os.chdir(orig_working_dir) - shutil.rmtree(temp_working_dir, ignore_errors=True) + >>> parquet_format = ds.ParquetFileFormat() + >>> write_options = parquet_format.make_write_options(allow_truncated_timestamps=True) + >>> ds.write_dataset(table, "sample_dataset2", format="parquet", partitioning=part, + ... 
file_options=write_options) - # also clean-up custom base directory used in some examples - shutil.rmtree(str(base), ignore_errors=True) diff --git a/docs/source/python/dlpack.rst b/docs/source/python/dlpack.rst index 9f0d3b58aa6e..6e74cd5c82c1 100644 --- a/docs/source/python/dlpack.rst +++ b/docs/source/python/dlpack.rst @@ -68,36 +68,36 @@ Examples Convert a PyArrow CPU array into a NumPy array: -.. code-block:: +.. code-block:: python >>> import pyarrow as pa + >>> import numpy as np >>> array = pa.array([2, 0, 2, 4]) - + >>> array + [ - 2, - 0, - 2, - 4 + 2, + 0, + 2, + 4 ] - - >>> import numpy as np >>> np.from_dlpack(array) array([2, 0, 2, 4]) Convert a PyArrow CPU array into a PyTorch tensor: -.. code-block:: +.. code-block:: python - >>> import torch - >>> torch.from_dlpack(array) + >>> import torch # doctest: +SKIP + >>> torch.from_dlpack(array) # doctest: +SKIP tensor([2, 0, 2, 4]) Convert a PyArrow CPU array into a JAX array: -.. code-block:: +.. code-block:: python - >>> import jax - >>> jax.numpy.from_dlpack(array) + >>> import jax # doctest: +SKIP + >>> jax.numpy.from_dlpack(array) # doctest: +SKIP Array([2, 0, 2, 4], dtype=int32) - >>> jax.dlpack.from_dlpack(array) + >>> jax.dlpack.from_dlpack(array) # doctest: +SKIP Array([2, 0, 2, 4], dtype=int32) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index 29f0ed55d03e..48262b680778 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -90,16 +90,16 @@ by implementing the ``__arrow_array__`` method (similar to numpy's ``__array__`` protocol). For example, to support conversion of your duck array class to an Arrow array, -define the ``__arrow_array__`` method to return an Arrow array:: +define the ``__arrow_array__`` method to return an Arrow array: - class MyDuckArray: - - ... +.. 
code-block:: python - def __arrow_array__(self, type=None): - # convert the underlying array values to a PyArrow Array - import pyarrow - return pyarrow.array(..., type=type) + >>> class MyDuckArray: + ... + ... def __arrow_array__(self, type=None): + ... # convert the underlying array values to a PyArrow Array + ... import pyarrow + ... return pyarrow.array(..., type=type) The ``__arrow_array__`` method takes an optional ``type`` keyword which is passed through from :func:`pyarrow.array`. The method is allowed to return either @@ -138,51 +138,55 @@ PyArrow allows you to define extension types from Python by subclassing :class:`ExtensionType` and giving the derived class its own extension name and mechanism to (de)serialize any parameters. For example, we could define a custom rational type for fractions which can be represented as a pair of -integers:: - - class RationalType(pa.ExtensionType): - - def __init__(self, data_type: pa.DataType): - if not pa.types.is_integer(data_type): - raise TypeError(f"data_type must be an integer type not {data_type}") - - super().__init__( - pa.struct( - [ - ("numer", data_type), - ("denom", data_type), - ], - ), - "my_package.rational", - ) - - def __arrow_ext_serialize__(self) -> bytes: - # No parameters are necessary - return b"" +integers: - @classmethod - def __arrow_ext_deserialize__(cls, storage_type, serialized): - # Sanity checks, not required but illustrate the method signature. - assert pa.types.is_struct(storage_type) - assert pa.types.is_integer(storage_type[0].type) - assert storage_type[0].type == storage_type[1].type - assert serialized == b"" +.. code-block:: python - # return an instance of this subclass - return RationalType(storage_type[0].type) + >>> import pyarrow as pa + >>> class RationalType(pa.ExtensionType): + ... + ... def __init__(self, data_type: pa.DataType): + ... if not pa.types.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... + ... 
super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... "my_package.rational", + ... ) + ... + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No parameters are necessary + ... return b"" + ... + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... # Sanity checks, not required but illustrate the method signature. + ... assert pa.types.is_struct(storage_type) + ... assert pa.types.is_integer(storage_type[0].type) + ... assert storage_type[0].type == storage_type[1].type + ... assert serialized == b"" + ... + ... # return an instance of this subclass + ... return RationalType(storage_type[0].type) The special methods ``__arrow_ext_serialize__`` and ``__arrow_ext_deserialize__`` define the serialization and deserialization of an extension type instance. -This can now be used to create arrays and tables holding the extension type:: +This can now be used to create arrays and tables holding the extension type: + +.. code-block:: python >>> rational_type = RationalType(pa.int32()) >>> rational_type.extension_name 'my_package.rational' >>> rational_type.storage_type StructType(struct) - >>> storage_array = pa.array( ... [ ... {"numer": 10, "denom": 17}, @@ -194,7 +198,7 @@ This can now be used to create arrays and tables holding the extension type:: >>> # or equivalently >>> arr = pa.ExtensionArray.from_storage(rational_type, storage_array) >>> arr - + -- is_valid: all not null -- child 0 type: int32 [ @@ -210,23 +214,29 @@ This can now be used to create arrays and tables holding the extension type:: This array can be included in RecordBatches, sent over IPC and received in another Python process. The receiving process must explicitly register the extension type for deserialization, otherwise it will fall back to the -storage type:: +storage type: + +.. 
code-block:: python >>> pa.register_extension_type(RationalType(pa.int32())) For example, creating a RecordBatch and writing it to a stream using the -IPC protocol:: +IPC protocol: + +.. code-block:: python >>> batch = pa.RecordBatch.from_arrays([arr], ["ext"]) >>> sink = pa.BufferOutputStream() >>> with pa.RecordBatchStreamWriter(sink, batch.schema) as writer: - ... writer.write_batch(batch) + ... writer.write_batch(batch) >>> buf = sink.getvalue() -and then reading it back yields the proper type:: +and then reading it back yields the proper type: + +.. code-block:: python >>> with pa.ipc.open_stream(buf) as reader: - ... result = reader.read_all() + ... result = reader.read_all() >>> result.column("ext").type RationalType(StructType(struct)) @@ -234,7 +244,9 @@ Further, note that while we registered the concrete type ``RationalType(pa.int32())``, the same extension name (``"my_package.rational"``) is used by ``RationalType(integer_type)`` for *all* Arrow integer types. As such, the above code also allows users to -(de)serialize these data types:: +(de)serialize these data types: + +.. code-block:: python >>> big_rational_type = RationalType(pa.int64()) >>> storage_array = pa.array( @@ -248,10 +260,10 @@ for *all* Arrow integer types. As such, the above code also allows users to >>> batch = pa.RecordBatch.from_arrays([arr], ["ext"]) >>> sink = pa.BufferOutputStream() >>> with pa.RecordBatchStreamWriter(sink, batch.schema) as writer: - ... writer.write_batch(batch) + ... writer.write_batch(batch) >>> buf = sink.getvalue() >>> with pa.ipc.open_stream(buf) as reader: - ... result = reader.read_all() + ... result = reader.read_all() >>> result.column("ext").type RationalType(StructType(struct)) @@ -273,31 +285,31 @@ representing time spans (e.g., a frequency of a day, a month, a quarter, etc). It is stored as an int64 array which is interpreted as the number of time spans of the given frequency since 1970. 
-:: - - class PeriodType(pa.ExtensionType): - - def __init__(self, freq): - # attributes need to be set first before calling - # super init (as that calls serialize) - self._freq = freq - super().__init__(pa.int64(), "my_package.period") - - @property - def freq(self): - return self._freq - - def __arrow_ext_serialize__(self): - return "freq={}".format(self.freq).encode() +.. code-block:: python - @classmethod - def __arrow_ext_deserialize__(cls, storage_type, serialized): - # Return an instance of this subclass given the serialized - # metadata. - serialized = serialized.decode() - assert serialized.startswith("freq=") - freq = serialized.split("=")[1] - return PeriodType(freq) + >>> class PeriodType(pa.ExtensionType): + ... + ... def __init__(self, freq): + ... # attributes need to be set first before calling + ... # super init (as that calls serialize) + ... self._freq = freq + ... super().__init__(pa.int64(), "my_package.period") + ... + ... @property + ... def freq(self): + ... return self._freq + ... + ... def __arrow_ext_serialize__(self): + ... return "freq={}".format(self.freq).encode() + ... + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... # Return an instance of this subclass given the serialized + ... # metadata. + ... serialized = serialized.decode() + ... assert serialized.startswith("freq=") + ... freq = serialized.split("=")[1] + ... return PeriodType(freq) Here, we ensure to store all information in the serialized metadata that is needed to reconstruct the instance (in the ``__arrow_ext_deserialize__`` class @@ -318,51 +330,55 @@ definition of the extension type. For instance, let us consider the example from the `Numpy Quickstart `_ of points in 3D space. 
We can store these as a fixed-size list, where we wish to be able to extract -the data as a 2-D Numpy array ``(N, 3)`` without any copy:: +the data as a 2-D Numpy array ``(N, 3)`` without any copy: - class Point3DArray(pa.ExtensionArray): - def to_numpy_array(self): - return self.storage.flatten().to_numpy().reshape((-1, 3)) - - - class Point3DType(pa.ExtensionType): - def __init__(self): - super().__init__(pa.list_(pa.float32(), 3), "my_package.Point3DType") - - def __arrow_ext_serialize__(self): - return b"" - - @classmethod - def __arrow_ext_deserialize__(cls, storage_type, serialized): - return Point3DType() +.. code-block:: python - def __arrow_ext_class__(self): - return Point3DArray + >>> class Point3DArray(pa.ExtensionArray): + ... def to_numpy_array(self): + ... return self.storage.flatten().to_numpy().reshape((-1, 3)) + >>> class Point3DType(pa.ExtensionType): + ... def __init__(self): + ... super().__init__(pa.list_(pa.float32(), 3), "my_package.Point3DType") + ... + ... def __arrow_ext_serialize__(self): + ... return b"" + ... + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... return Point3DType() + ... + ... def __arrow_ext_class__(self): + ... return Point3DArray + +Arrays built using this extension type now have the expected custom array class: -Arrays built using this extension type now have the expected custom array class:: +.. code-block:: python >>> storage = pa.array([[1, 2, 3], [4, 5, 6]], pa.list_(pa.float32(), 3)) >>> arr = pa.ExtensionArray.from_storage(Point3DType(), storage) >>> arr - <__main__.Point3DArray object at 0x7f40dea80670> + <__main__.Point3DArray object at ...> [ - [ - 1, - 2, - 3 - ], - [ - 4, - 5, - 6 - ] + [ + 1, + 2, + 3 + ], + [ + 4, + 5, + 6 + ] ] -The additional methods in the extension class are then available to the user:: +The additional methods in the extension class are then available to the user: + +.. 
code-block:: python >>> arr.to_numpy_array() array([[1., 2., 3.], - [4., 5., 6.]], dtype=float32) + [4., 5., 6.]], dtype=float32) This array can be sent over IPC, received in another Python process, and the custom @@ -378,37 +394,37 @@ If you want scalars of your custom extension type to convert to a custom type wh :meth:`ExtensionScalar.as_py()` is called, you can override the :meth:`ExtensionScalar.as_py()` method by subclassing :class:`ExtensionScalar`. For example, if we wanted the above example 3D point type to return a custom -3D point class instead of a list, we would implement:: - - from collections import namedtuple +3D point class instead of a list, we would implement: - Point3D = namedtuple("Point3D", ["x", "y", "z"]) - - class Point3DScalar(pa.ExtensionScalar): - def as_py(self) -> Point3D: - return Point3D(*self.value.as_py()) - - class Point3DType(pa.ExtensionType): - def __init__(self): - super().__init__(pa.list_(pa.float32(), 3), "my_package.Point3DType") - - def __arrow_ext_serialize__(self): - return b"" - - @classmethod - def __arrow_ext_deserialize__(cls, storage_type, serialized): - return Point3DType() +.. code-block:: python - def __arrow_ext_scalar_class__(self): - return Point3DScalar + >>> from collections import namedtuple + >>> Point3D = namedtuple("Point3D", ["x", "y", "z"]) + >>> class Point3DScalar(pa.ExtensionScalar): + ... def as_py(self, **kwargs) -> Point3D: + ... return Point3D(*self.value.as_py(**kwargs)) + >>> class Point3DType(pa.ExtensionType): + ... def __init__(self): + ... super().__init__(pa.list_(pa.float32(), 3), "my_package.Point3DType") + ... + ... def __arrow_ext_serialize__(self): + ... return b"" + ... + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... return Point3DType() + ... + ... def __arrow_ext_scalar_class__(self): + ... 
return Point3DScalar + +Arrays built using this extension type now provide scalars that convert to our ``Point3D`` class: -Arrays built using this extension type now provide scalars that convert to our ``Point3D`` class:: +.. code-block:: python >>> storage = pa.array([[1, 2, 3], [4, 5, 6]], pa.list_(pa.float32(), 3)) >>> arr = pa.ExtensionArray.from_storage(Point3DType(), storage) >>> arr[0].as_py() Point3D(x=1.0, y=2.0, z=3.0) - >>> arr.to_pylist() [Point3D(x=1.0, y=2.0, z=3.0), Point3D(x=4.0, y=5.0, z=6.0)] @@ -426,26 +442,28 @@ For this, the :meth:`ExtensionType.to_pandas_dtype` method needs to be implemented, and should return a ``pandas.api.extensions.ExtensionDtype`` subclass instance. -Using the pandas period type from above as example, this would look like:: +Using the pandas period type from above as example, this would look like: - class PeriodType(pa.ExtensionType): - ... +.. code-block:: python - def to_pandas_dtype(self): - import pandas as pd - return pd.PeriodDtype(freq=self.freq) + >>> class PeriodType(pa.ExtensionType): + ... + ... def to_pandas_dtype(self): + ... import pandas as pd + ... return pd.PeriodDtype(freq=self.freq) Secondly, the pandas ``ExtensionDtype`` on its turn needs to have the ``__from_arrow__`` method implemented: a method that given a PyArrow Array or ChunkedArray of the extension type can construct the corresponding -pandas ``ExtensionArray``. This method should have the following signature:: - +pandas ``ExtensionArray``. This method should have the following signature: - class MyExtensionDtype(pd.api.extensions.ExtensionDtype): - ... +.. code-block:: python - def __from_arrow__(self, array: pyarrow.Array/ChunkedArray) -> pandas.ExtensionArray: - ... + >>> import pandas as pd + >>> class MyExtensionDtype(pd.api.extensions.ExtensionDtype): + ... + ... def __from_arrow__(self, array): # pyarrow.Array/ChunkedArray -> pandas.ExtensionArray + ... 
pass This way, you can control the conversion of a PyArrow ``Array`` of your PyArrow extension type to a pandas ``ExtensionArray`` that can be stored in a DataFrame. @@ -530,12 +548,14 @@ in the numpy ndarray: >>> numpy_tensor array([[[ 1., 2.], [ 3., 4.]], + [[ 10., 20.], [ 30., 40.]], + [[100., 200.], - [300., 400.]]]) + [300., 400.]]], dtype=float32) >>> numpy_tensor.shape - (3, 2, 2) + (3, 2, 2) .. note:: @@ -594,13 +614,13 @@ example .. code-block:: python - >>> tensor_type = pa.fixed_shape_tensor(pa.float64(), [2, 2, 3], permutation=[0, 2, 1]) + >>> tensor_type = pa.fixed_shape_tensor(pa.float64(), [2, 2, 3], permutation=[0, 2, 1]) or .. code-block:: python - >>> tensor_type = pa.fixed_shape_tensor(pa.bool_(), [2, 2, 3], dim_names=["C", "H", "W"]) + >>> tensor_type = pa.fixed_shape_tensor(pa.bool_(), [2, 2, 3], dim_names=["C", "H", "W"]) for ``NCHW`` format where: diff --git a/docs/source/python/filesystems.rst b/docs/source/python/filesystems.rst index ebb3664d82eb..196a1ed21a21 100644 --- a/docs/source/python/filesystems.rst +++ b/docs/source/python/filesystems.rst @@ -56,17 +56,22 @@ Instantiating a filesystem ~~~~~~~~~~~~~~~~~~~~~~~~~~ A FileSystem object can be created with one of the constructors (and check the -respective constructor for its options):: +respective constructor for its options): + +.. code-block:: python >>> from pyarrow import fs + >>> import pyarrow as pa >>> local = fs.LocalFileSystem() -or alternatively inferred from a URI:: +or alternatively inferred from a URI: + +.. code-block:: python - >>> s3, path = fs.FileSystem.from_uri("s3://my-bucket") - >>> s3 - - >>> path + >>> s3, path = fs.FileSystem.from_uri("s3://my-bucket") # doctest: +SKIP + >>> s3 # doctest: +SKIP + + >>> path # doctest: +SKIP 'my-bucket' @@ -76,27 +81,28 @@ Reading and writing files Several of the IO-related functions in PyArrow accept either a URI (and infer the filesystem) or an explicit ``filesystem`` argument to specify the filesystem to read or write from. 
For example, the :meth:`pyarrow.parquet.read_table` -function can be used in the following ways:: +function can be used in the following ways: - import pyarrow.parquet as pq +.. code-block:: python - # using a URI -> filesystem is inferred - pq.read_table("s3://my-bucket/data.parquet") - # using a path and filesystem - s3 = fs.S3FileSystem(..) - pq.read_table("my-bucket/data.parquet", filesystem=s3) + >>> import pyarrow.parquet as pq + >>> # using a URI -> filesystem is inferred + >>> pq.read_table("s3://my-bucket/data.parquet") # doctest: +SKIP + >>> # using a path and filesystem + >>> s3 = fs.S3FileSystem(..) # doctest: +SKIP + >>> pq.read_table("my-bucket/data.parquet", filesystem=s3) # doctest: +SKIP The filesystem interface further allows to open files for reading (input) or writing (output) directly, which can be combined with functions that work with -file-like objects. For example:: - - import pyarrow as pa +file-like objects. For example: - local = fs.LocalFileSystem() +.. code-block:: python - with local.open_output_stream("test.arrow") as file: - with pa.RecordBatchFileWriter(file, table.schema) as writer: - writer.write_table(table) + >>> table = pa.table({'col1': [1, 2, 3]}) + >>> local = fs.LocalFileSystem() + >>> with local.open_output_stream("test.arrow") as file: + ... with pa.RecordBatchFileWriter(file, table.schema) as writer: + ... writer.write_table(table) Listing files @@ -104,9 +110,11 @@ Listing files Inspecting the directories and files on a filesystem can be done with the :meth:`FileSystem.get_file_info` method. To list the contents of a directory, -use the :class:`FileSelector` object to specify the selection:: +use the :class:`FileSelector` object to specify the selection: - >>> local.get_file_info(fs.FileSelector("dataset/", recursive=True)) +.. 
code-block:: python + + >>> local.get_file_info(fs.FileSelector("dataset/", recursive=True)) # doctest: +SKIP [, , , @@ -116,11 +124,12 @@ This returns a list of :class:`FileInfo` objects, containing information about the type (file or directory), the size, the date last modified, etc. You can also get this information for a single explicit path (or list of -paths):: +paths): - >>> local.get_file_info('test.arrow') - +.. code-block:: python + >>> local.get_file_info('test.arrow') + >>> local.get_file_info('non_existent') @@ -132,15 +141,16 @@ Local FS The :class:`LocalFileSystem` allows you to access files on the local machine. -Example how to write to disk and read it back:: +Example how to write to disk and read it back: + +.. code-block:: python - >>> from pyarrow import fs >>> local = fs.LocalFileSystem() - >>> with local.open_output_stream('/tmp/pyarrowtest.dat') as stream: - stream.write(b'data') + >>> with local.open_output_stream('pyarrowtest.dat') as stream: + ... stream.write(b'data') 4 - >>> with local.open_input_stream('/tmp/pyarrowtest.dat') as stream: - print(stream.readall()) + >>> with local.open_input_stream('pyarrowtest.dat') as stream: + ... print(stream.readall()) b'data' @@ -159,13 +169,13 @@ supported by AWS (such as the ``AWS_ACCESS_KEY_ID`` and and EC2 Instance Metadata Service for EC2 nodes). -Example how you can read contents from a S3 bucket:: +Example how you can read contents from a S3 bucket: - >>> from pyarrow import fs - >>> s3 = fs.S3FileSystem(region='eu-west-3') +.. 
code-block:: python - # List all contents in a bucket, recursively - >>> s3.get_file_info(fs.FileSelector('my-test-bucket', recursive=True)) + >>> s3 = fs.S3FileSystem(region='eu-west-3') # doctest: +SKIP + >>> # List all contents in a bucket, recursively + >>> s3.get_file_info(fs.FileSelector('my-test-bucket', recursive=True)) # doctest: +SKIP [, , , @@ -175,10 +185,9 @@ Example how you can read contents from a S3 bucket:: , , ] - - # Open a file for reading and download its contents - >>> f = s3.open_input_stream('my-test-bucket/Dir1/File2') - >>> f.readall() + >>> # Open a file for reading and download its contents + >>> f = s3.open_input_stream('my-test-bucket/Dir1/File2') # doctest: +SKIP + >>> f.readall() # doctest: +SKIP b'some data' @@ -192,13 +201,13 @@ It is also possible to resolve the region from the bucket name for :class:`S3FileSystem` by using :func:`pyarrow.fs.resolve_s3_region` or :func:`pyarrow.fs.S3FileSystem.from_uri`. -Here are a couple examples in code:: +Here are a couple examples in code: - >>> from pyarrow import fs - >>> s3 = fs.S3FileSystem(region=fs.resolve_s3_region('my-test-bucket')) +.. code-block:: python - # Or via URI: - >>> s3, path = fs.S3FileSystem.from_uri('s3://[access_key:secret_key@]bucket/path]') + >>> s3 = fs.S3FileSystem(region=fs.resolve_s3_region('my-test-bucket')) # doctest: +SKIP + >>> # Or via URI: + >>> s3, path = fs.S3FileSystem.from_uri('s3://[access_key:secret_key@]bucket/path]') # doctest: +SKIP .. seealso:: @@ -237,19 +246,18 @@ To connect to a public bucket without using any credentials, you must pass will report ``Couldn't resolve host name`` since there are different host names for authenticated and public access. 
-Example showing how you can read contents from a GCS bucket:: - - >>> from datetime import timedelta - >>> from pyarrow import fs - >>> gcs = fs.GcsFileSystem(anonymous=True, retry_time_limit=timedelta(seconds=15)) +Example showing how you can read contents from a GCS bucket: - # List all contents in a bucket, recursively - >>> uri = "gcp-public-data-landsat/LC08/01/001/003/" - >>> file_list = gcs.get_file_info(fs.FileSelector(uri, recursive=True)) +.. code-block:: python - # Open a file for reading and download its contents - >>> f = gcs.open_input_stream(file_list[0].path) - >>> f.read(64) + >>> from datetime import timedelta + >>> gcs = fs.GcsFileSystem(anonymous=True, retry_time_limit=timedelta(seconds=15)) # doctest: +SKIP + >>> # List all contents in a bucket, recursively + >>> uri = "gcp-public-data-landsat/LC08/01/001/003/" # doctest: +SKIP + >>> file_list = gcs.get_file_info(fs.FileSelector(uri, recursive=True)) # doctest: +SKIP + >>> # Open a file for reading and download its contents + >>> f = gcs.open_input_stream(file_list[0].path) # doctest: +SKIP + >>> f.read(64) # doctest: +SKIP b'GROUP = FILE_HEADER\n LANDSAT_SCENE_ID = "LC80010032013082LGN03"\n S' .. seealso:: @@ -270,8 +278,7 @@ using the :class:`HadoopFileSystem` constructor: .. code-block:: python - from pyarrow import fs - hdfs = fs.HadoopFileSystem(host, port, user=user, kerb_ticket=ticket_cache_path) + >>> hdfs = fs.HadoopFileSystem(host, port, user=user, kerb_ticket=ticket_cache_path) # doctest: +SKIP The ``libhdfs`` library is loaded **at runtime** (rather than at link / library load time, since the library may not be in your LD_LIBRARY_PATH), and relies on @@ -289,9 +296,9 @@ some environment variables. .. 
code-block:: shell - export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` - # or on Windows - %HADOOP_HOME%/bin/hadoop classpath --glob > %CLASSPATH% + >>> export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` # doctest: +SKIP + >>> # or on Windows + >>> %HADOOP_HOME%/bin/hadoop classpath --glob > %CLASSPATH% # doctest: +SKIP In contrast to the legacy HDFS filesystem with ``pa.hdfs.connect``, setting ``CLASSPATH`` is not optional (pyarrow will not attempt to infer it). @@ -312,21 +319,20 @@ is used for authentication. This means it will try several types of authenticati and go with the first one that works. If any authentication parameters are provided when initialising the FileSystem, they will be used instead of the default credential. -Example showing how you can read contents from an Azure Blob Storage account:: +Example showing how you can read contents from an Azure Blob Storage account: - >>> from pyarrow import fs - >>> azure_fs = fs.AzureFileSystem(account_name='myaccount') +.. code-block:: python - # List all contents in a container, recursively - >>> azure_fs.get_file_info(fs.FileSelector('my-container', recursive=True)) + >>> azure_fs = fs.AzureFileSystem(account_name='myaccount') # doctest: +SKIP + >>> # List all contents in a container, recursively + >>> azure_fs.get_file_info(fs.FileSelector('my-container', recursive=True)) # doctest: +SKIP [, , , ] - - # Open a file for reading and download its contents - >>> f = azure_fs.open_input_stream('my-container/File1') - >>> f.readall() + >>> # Open a file for reading and download its contents + >>> f = azure_fs.open_input_stream('my-container/File1') # doctest: +SKIP + >>> f.readall() # doctest: +SKIP b'some data' For more details on the parameters and usage, refer to the :class:`AzureFileSystem` class documentation. @@ -346,46 +352,49 @@ The Python ecosystem, however, also has several filesystem packages. Those packages following the `fsspec`_ interface can be used in PyArrow as well. 
Functions accepting a filesystem object will also accept an fsspec subclass. -For example:: +For example: - # creating an fsspec-based filesystem object for Google Cloud Storage - import gcsfs - fs = gcsfs.GCSFileSystem(project='my-google-project') +.. code-block:: python - # using this to read a partitioned dataset - import pyarrow.dataset as ds - ds.dataset("data/", filesystem=fs) + >>> # creating an fsspec-based filesystem object for Google Cloud Storage + >>> import gcsfs # doctest: +SKIP + >>> fs_gcs = gcsfs.GCSFileSystem(project='my-google-project') # doctest: +SKIP + >>> # using this to read a partitioned dataset + >>> import pyarrow.dataset as ds # doctest: +SKIP + >>> ds.dataset("data/", filesystem=fs_gcs) # doctest: +SKIP -Similarly for Azure Blob Storage:: +Similarly for Azure Blob Storage: - import adlfs - # ... load your credentials and configure the filesystem - fs = adlfs.AzureBlobFileSystem(account_name=account_name, account_key=account_key) +.. code-block:: python - import pyarrow.dataset as ds - ds.dataset("mycontainer/data/", filesystem=fs) + >>> import adlfs # doctest: +SKIP + >>> # ... load your credentials and configure the filesystem + >>> fs_azure = adlfs.AzureBlobFileSystem(account_name=account_name, account_key=account_key) # doctest: +SKIP + >>> ds.dataset("mycontainer/data/", filesystem=fs_azure) # doctest: +SKIP Under the hood, the fsspec filesystem object is wrapped into a python-based PyArrow filesystem (:class:`PyFileSystem`) using :class:`FSSpecHandler`. You can also manually do this to get an object with the PyArrow FileSystem -interface:: +interface: - from pyarrow.fs import PyFileSystem, FSSpecHandler - pa_fs = PyFileSystem(FSSpecHandler(fs)) +.. 
code-block:: python -Then all the functionalities of :class:`FileSystem` are accessible:: + >>> from pyarrow.fs import PyFileSystem, FSSpecHandler # doctest: +SKIP + >>> pa_fs = PyFileSystem(FSSpecHandler(fs_azure)) # doctest: +SKIP - # write data - with pa_fs.open_output_stream('mycontainer/pyarrowtest.dat') as stream: - stream.write(b'data') +Then all the functionalities of :class:`FileSystem` are accessible: - # read data - with pa_fs.open_input_stream('mycontainer/pyarrowtest.dat') as stream: - print(stream.readall()) - #b'data' +.. code-block:: python - # read a partitioned dataset - ds.dataset("data/", filesystem=pa_fs) + >>> # write data + >>> with pa_fs.open_output_stream('mycontainer/pyarrowtest.dat') as stream: # doctest: +SKIP + ... stream.write(b'data') + >>> # read data + >>> with pa_fs.open_input_stream('mycontainer/pyarrowtest.dat') as stream: # doctest: +SKIP + ... print(stream.readall()) + b'data' + >>> # read a partitioned dataset + >>> ds.dataset("data/", filesystem=pa_fs) # doctest: +SKIP Using fsspec-compatible filesystem URIs @@ -395,23 +404,26 @@ PyArrow can automatically instantiate fsspec filesystems by prefixing the URI scheme with ``fsspec+``. This allows you to use the fsspec-compatible filesystems directly with PyArrow's IO functions without needing to manually create a filesystem object. Example writing and reading a Parquet file -using an in-memory filesystem provided by `fsspec`_:: +using an in-memory filesystem provided by `fsspec`_: + +.. 
code-block:: python - import pyarrow as pa - import pyarrow.parquet as pq + >>> table = pa.table({'a': [1, 2, 3]}) + >>> pq.write_table(table, "fsspec+memory://path/to/my_table.parquet") # doctest: +SKIP + >>> pq.read_table("fsspec+memory://path/to/my_table.parquet") # doctest: +SKIP - table = pa.table({'a': [1, 2, 3]}) - pq.write_table(table, "fsspec+memory://path/to/my_table.parquet") - pq.read_table("fsspec+memory://path/to/my_table.parquet") +Example reading parquet file from GitHub directly: -Example reading parquet file from GitHub directly:: +.. code-block:: python - pq.read_table("fsspec+github://apache:arrow-testing@/data/parquet/alltypes-java.parquet") + >>> pq.read_table("fsspec+github://apache:arrow-testing@/data/parquet/alltypes-java.parquet") # doctest: +SKIP Hugging Face URIs are explicitly allowed as a shortcut without needing to prefix -with ``fsspec+``. This is useful for reading datasets hosted on Hugging Face:: +with ``fsspec+``. This is useful for reading datasets hosted on Hugging Face: + +.. code-block:: python - pq.read_table("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet") + >>> pq.read_table("hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet") # doctest: +SKIP Using Arrow filesystems with fsspec @@ -425,20 +437,23 @@ need to interact with a package that expects fsspec-compatible filesystem objects, you can wrap an Arrow FileSystem object with fsspec. Starting with ``fsspec`` version 2021.09, the ``ArrowFSWrapper`` can be used -for this:: +for this: + +.. code-block:: python - >>> from pyarrow import fs >>> local = fs.LocalFileSystem() - >>> from fsspec.implementations.arrow import ArrowFSWrapper - >>> local_fsspec = ArrowFSWrapper(local) + >>> from fsspec.implementations.arrow import ArrowFSWrapper # doctest: +SKIP + >>> local_fsspec = ArrowFSWrapper(local) # doctest: +SKIP The resulting object now has an fsspec-compatible interface, while being backed by the Arrow FileSystem under the hood. 
-Example usage to create a directory and file, and list the content:: +Example usage to create a directory and file, and list the content: + +.. code-block:: python - >>> local_fsspec.mkdir("./test") - >>> local_fsspec.touch("./test/file.txt") - >>> local_fsspec.ls("./test/") + >>> local_fsspec.mkdir("./test") # doctest: +SKIP + >>> local_fsspec.touch("./test/file.txt") # doctest: +SKIP + >>> local_fsspec.ls("./test/") # doctest: +SKIP ['./test/file.txt'] For more information, see the `fsspec`_ documentation. diff --git a/docs/source/python/getstarted.rst b/docs/source/python/getstarted.rst index ef8fe690ab57..1573f65037d3 100644 --- a/docs/source/python/getstarted.rst +++ b/docs/source/python/getstarted.rst @@ -15,17 +15,6 @@ .. specific language governing permissions and limitations .. under the License. -.. ipython:: python - :suppress: - - # set custom tmp working directory for files that create data - import os - import tempfile - - orig_working_dir = os.getcwd() - temp_working_dir = tempfile.mkdtemp(prefix="pyarrow-") - os.chdir(temp_working_dir) - .. _getstarted: Getting Started @@ -47,24 +36,29 @@ Arrow to use the best performing implementation to store the data and perform computations on it. So each array is meant to have data and a type -.. ipython:: python +.. code-block:: python - import pyarrow as pa - - days = pa.array([1, 12, 17, 23, 28], type=pa.int8()) + >>> import pyarrow as pa + >>> days = pa.array([1, 12, 17, 23, 28], type=pa.int8()) Multiple arrays can be combined in tables to form the columns in tabular data when attached to a column name -.. ipython:: python - - months = pa.array([1, 3, 5, 7, 1], type=pa.int8()) - years = pa.array([1990, 2000, 1995, 2000, 1995], type=pa.int16()) - - birthdays_table = pa.table([days, months, years], - names=["days", "months", "years"]) - - birthdays_table +.. 
code-block:: python + + >>> months = pa.array([1, 3, 5, 7, 1], type=pa.int8()) + >>> years = pa.array([1990, 2000, 1995, 2000, 1995], type=pa.int16()) + >>> birthdays_table = pa.table([days, months, years], + ... names=["days", "months", "years"]) + >>> birthdays_table + pyarrow.Table + days: int8 + months: int8 + years: int16 + ---- + days: [[1,12,17,23,28]] + months: [[1,3,5,7,1]] + years: [[1990,2000,1995,2000,1995]] See :ref:`data` for more details. @@ -75,21 +69,27 @@ Once you have tabular data, Arrow provides out of the box the features to save and restore that data for common formats like Parquet: -.. ipython:: python - - import pyarrow.parquet as pq +.. code-block:: python - pq.write_table(birthdays_table, 'birthdays.parquet') + >>> import pyarrow.parquet as pq + >>> pq.write_table(birthdays_table, 'birthdays.parquet') Once you have your data on disk, loading it back is a single function call, and Arrow is heavily optimized for memory and speed so loading data will be as quick as possible -.. ipython:: python +.. code-block:: python - reloaded_birthdays = pq.read_table('birthdays.parquet') - - reloaded_birthdays + >>> reloaded_birthdays = pq.read_table('birthdays.parquet') + >>> reloaded_birthdays + pyarrow.Table + days: int8 + months: int8 + years: int16 + ---- + days: [[1,12,17,23,28]] + months: [[1,3,5,7,1]] + years: [[1990,2000,1995,2000,1995]] Saving and loading back data in arrow is usually done through :ref:`Parquet `, :ref:`IPC format ` (:ref:`feather`), @@ -102,11 +102,24 @@ Arrow ships with a bunch of compute functions that can be applied to its arrays and tables, so through the compute functions it's possible to apply transformations to the data -.. ipython:: python - - import pyarrow.compute as pc - - pc.value_counts(birthdays_table["years"]) +.. 
code-block:: python + + >>> import pyarrow.compute as pc + >>> pc.value_counts(birthdays_table["years"]) + + -- is_valid: all not null + -- child 0 type: int16 + [ + 1990, + 2000, + 1995 + ] + -- child 1 type: int64 + [ + 1, + 2, + 2 + ] See :ref:`compute` for a list of available compute functions and how to use them. @@ -118,33 +131,40 @@ Arrow also provides the :class:`pyarrow.dataset` API to work with large data, which will handle for you partitioning of your data in smaller chunks -.. ipython:: python - - import pyarrow.dataset as ds +.. code-block:: python - ds.write_dataset(birthdays_table, "savedir", format="parquet", - partitioning=ds.partitioning( - pa.schema([birthdays_table.schema.field("years")]) - )) + >>> import pyarrow.dataset as ds + >>> ds.write_dataset(birthdays_table, "savedir", format="parquet", + ... partitioning=ds.partitioning( + ... pa.schema([birthdays_table.schema.field("years")]) + ... )) Loading back the partitioned dataset will detect the chunks -.. ipython:: python - - birthdays_dataset = ds.dataset("savedir", format="parquet", partitioning=["years"]) +.. code-block:: python - birthdays_dataset.files + >>> birthdays_dataset = ds.dataset("savedir", format="parquet", partitioning=["years"]) + >>> birthdays_dataset.files + ['savedir/1990/part-0.parquet', 'savedir/1995/part-0.parquet', 'savedir/2000/part-0.parquet'] and will lazily load chunks of data only when iterating over them -.. ipython:: python - :okexcept: - - import datetime - - current_year = datetime.datetime.now(datetime.UTC).year - for table_chunk in birthdays_dataset.to_batches(): - print("AGES", pc.subtract(current_year, table_chunk["years"])) +.. code-block:: python + + >>> current_year = 2025 + >>> for table_chunk in birthdays_dataset.to_batches(): + ... 
print("AGES", pc.subtract(current_year, table_chunk["years"])) + AGES [ + 35 + ] + AGES [ + 30, + 30 + ] + AGES [ + 25, + 25 + ] For further details on how to work with big datasets, how to filter them, how to project them, etc., refer to :ref:`dataset` documentation. @@ -155,14 +175,3 @@ Continuing from here For digging further into Arrow, you might want to read the :doc:`PyArrow Documentation <./index>` itself or the `Arrow Python Cookbook `_ - - -.. ipython:: python - :suppress: - - # clean-up custom working directory - import os - import shutil - - os.chdir(orig_working_dir) - shutil.rmtree(temp_working_dir, ignore_errors=True) diff --git a/docs/source/python/install.rst b/docs/source/python/install.rst index c6f098ee20a2..31f7c9c14eaa 100644 --- a/docs/source/python/install.rst +++ b/docs/source/python/install.rst @@ -54,7 +54,7 @@ and macOS): .. code-block:: bash - pip install pyarrow + pip install pyarrow If you encounter any importing issues of the pip wheels on Windows, you may need to install the `latest Visual C++ Redistributable for Visual Studio @@ -96,7 +96,7 @@ a custom path to the database from Python: .. code-block:: python >>> import pyarrow as pa - >>> pa.set_timezone_db_path("custom_path") + >>> pa.set_timezone_db_path("custom_path") # doctest: +SKIP You may encounter problems writing datetime data to an ORC file if you install pyarrow with pip. One possible solution to fix this problem: @@ -109,8 +109,8 @@ command: .. 
code-block:: python - >>> import tzdata - >>> print(tzdata.__file__) + >>> import tzdata # doctest: +SKIP + >>> print(tzdata.__file__) # doctest: +SKIP path\to\.venv\Lib\site-packages\tzdata\__init__.py diff --git a/docs/source/python/integration/cuda.rst b/docs/source/python/integration/cuda.rst index b0150c1c5c8a..2c84ecb39577 100644 --- a/docs/source/python/integration/cuda.rst +++ b/docs/source/python/integration/cuda.rst @@ -33,48 +33,55 @@ CUDA Contexts ------------- A CUDA context represents access to a particular CUDA-capable device. -For example, this is creating a CUDA context accessing CUDA device number 0:: +For example, this is creating a CUDA context accessing CUDA device number 0: - >>> from pyarrow import cuda - >>> ctx = cuda.Context(0) - >>> +.. code-block:: python + + >>> from pyarrow import cuda # doctest: +SKIP + >>> ctx = cuda.Context(0) # doctest: +SKIP CUDA Buffers ------------ A CUDA buffer can be created by copying data from host memory to the memory of a CUDA device, using the :meth:`Context.buffer_from_data` method. -The source data can be any Python buffer-like object, including Arrow buffers:: +The source data can be any Python buffer-like object, including Arrow buffers: + +.. code-block:: python >>> import numpy as np >>> arr = np.arange(4, dtype=np.int32) >>> arr.nbytes 16 - >>> cuda_buf = ctx.buffer_from_data(arr) - >>> type(cuda_buf) + >>> cuda_buf = ctx.buffer_from_data(arr) # doctest: +SKIP + >>> type(cuda_buf) # doctest: +SKIP pyarrow._cuda.CudaBuffer - >>> cuda_buf.size # The buffer's size in bytes + >>> cuda_buf.size # doctest: +SKIP 16 - >>> cuda_buf.address # The buffer's address in device memory + >>> cuda_buf.address # doctest: +SKIP 30088364544 - >>> cuda_buf.context.device_number + >>> cuda_buf.context.device_number # doctest: +SKIP 0 Conversely, you can copy back a CUDA buffer to device memory, getting a regular -CPU buffer:: +CPU buffer: + +.. 
code-block:: python - >>> buf = cuda_buf.copy_to_host() - >>> type(buf) + >>> buf = cuda_buf.copy_to_host() # doctest: +SKIP + >>> type(buf) # doctest: +SKIP pyarrow.lib.Buffer - >>> np.frombuffer(buf, dtype=np.int32) + >>> np.frombuffer(buf, dtype=np.int32) # doctest: +SKIP array([0, 1, 2, 3], dtype=int32) .. warning:: Many Arrow functions expect a CPU buffer but will not check the buffer's actual type. You will get a crash if you pass a CUDA buffer to such a - function:: + function: - >>> pa.py_buffer(b"x" * 16).equals(cuda_buf) + .. code-block:: python + + >>> pa.py_buffer(b"x" * 16).equals(cuda_buf) # doctest: +SKIP Segmentation fault Numba Integration @@ -88,15 +95,16 @@ Arrow to Numba ~~~~~~~~~~~~~~ First let's define a Numba CUDA kernel operating on an ``int32`` array. Here, -we will simply increment each array element (assuming the array is writable):: +we will simply increment each array element (assuming the array is writable): - import numba.cuda +.. code-block:: python - @numba.cuda.jit - def increment_by_one(an_array): - pos = numba.cuda.grid(1) - if pos < an_array.size: - an_array[pos] += 1 + >>> import numba.cuda # doctest: +SKIP + >>> @numba.cuda.jit # doctest: +SKIP + ... def increment_by_one(an_array): + ... pos = numba.cuda.grid(1) + ... if pos < an_array.size: + ... an_array[pos] += 1 Then we need to wrap our CUDA buffer into a Numba "device array" with the right array metadata (shape, strides and datatype). This is necessary so that Numba @@ -104,23 +112,29 @@ can identify the array's characteristics and compile the kernel with the appropriate type declarations. In this case the metadata can simply be got from the original Numpy array. -Note the GPU data isn't copied, just pointed to:: +Note the GPU data isn't copied, just pointed to: + +.. 
code-block:: python - >>> from numba.cuda.cudadrv.devicearray import DeviceNDArray - >>> device_arr = DeviceNDArray(arr.shape, arr.strides, arr.dtype, gpu_data=cuda_buf.to_numba()) + >>> from numba.cuda.cudadrv.devicearray import DeviceNDArray # doctest: +SKIP + >>> device_arr = DeviceNDArray(arr.shape, arr.strides, arr.dtype, gpu_data=cuda_buf.to_numba()) # doctest: +SKIP (ideally we could have defined an Arrow array in CPU memory, copied it to CUDA memory without losing type information, and then invoked the Numba kernel on it without constructing the DeviceNDArray by hand; this is not yet possible) Finally we can run the Numba CUDA kernel on the Numba device array (here -with a 16x16 grid size):: +with a 16x16 grid size): - >>> increment_by_one[16, 16](device_arr) +.. code-block:: python -And the results can be checked by copying back the CUDA buffer to CPU memory:: + >>> increment_by_one[16, 16](device_arr) # doctest: +SKIP - >>> np.frombuffer(cuda_buf.copy_to_host(), dtype=np.int32) +And the results can be checked by copying back the CUDA buffer to CPU memory: + +.. code-block:: python + + >>> np.frombuffer(cuda_buf.copy_to_host(), dtype=np.int32) # doctest: +SKIP array([1, 2, 3, 4], dtype=int32) Numba to Arrow @@ -129,30 +143,34 @@ Numba to Arrow Conversely, a Numba-created device array can be viewed as an Arrow CUDA buffer, using the :meth:`CudaBuffer.from_numba` factory method. -For the sake of example, let's first create a Numba device array:: +For the sake of example, let's first create a Numba device array: + +.. code-block:: python >>> arr = np.arange(10, 14, dtype=np.int32) >>> arr array([10, 11, 12, 13], dtype=int32) - >>> device_arr = numba.cuda.to_device(arr) + >>> device_arr = numba.cuda.to_device(arr) # doctest: +SKIP Then we can create a CUDA buffer pointing the device array's memory. We don't need to pass a CUDA context explicitly this time: the appropriate CUDA context is automatically retrieved and adapted from the Numba object. -:: +.. 
code-block:: python - >>> cuda_buf = cuda.CudaBuffer.from_numba(device_arr.gpu_data) - >>> cuda_buf.size + >>> cuda_buf = cuda.CudaBuffer.from_numba(device_arr.gpu_data) # doctest: +SKIP + >>> cuda_buf.size # doctest: +SKIP 16 - >>> cuda_buf.address + >>> cuda_buf.address # doctest: +SKIP 30088364032 - >>> cuda_buf.context.device_number + >>> cuda_buf.context.device_number # doctest: +SKIP 0 -Of course, we can copy the CUDA buffer back to host memory:: +Of course, we can copy the CUDA buffer back to host memory: + +.. code-block:: python - >>> np.frombuffer(cuda_buf.copy_to_host(), dtype=np.int32) + >>> np.frombuffer(cuda_buf.copy_to_host(), dtype=np.int32) # doctest: +SKIP array([10, 11, 12, 13], dtype=int32) .. seealso:: diff --git a/docs/source/python/integration/substrait.rst b/docs/source/python/integration/substrait.rst index f7a8f20761da..ebc730614d8f 100644 --- a/docs/source/python/integration/substrait.rst +++ b/docs/source/python/integration/substrait.rst @@ -35,36 +35,41 @@ Arrow schemas can be encoded and decoded using the :meth:`pyarrow.substrait.seri .. code-block:: python - import pyarrow as pa - import pyarrow.substrait as pa_substrait - - arrow_schema = pa.schema([ - pa.field("x", pa.int32()), - pa.field("y", pa.string()) - ]) - substrait_schema = pa_substrait.serialize_schema(arrow_schema) + >>> import pyarrow as pa + >>> import pyarrow.substrait as pa_substrait + >>> arrow_schema = pa.schema([ + ... pa.field("x", pa.int32()), + ... pa.field("y", pa.string()) + ... ]) + >>> substrait_schema = pa_substrait.serialize_schema(arrow_schema) The schema marshalled as a Substrait ``NamedStruct`` is directly -available as ``substrait_schema.schema``:: +available as ``substrait_schema.schema``: + +.. 
code-block:: python - >>> print(substrait_schema.schema) + >>> print(bytes(substrait_schema.schema)) b'\n\x01x\n\x01y\x12\x0c\n\x04*\x02\x10\x01\n\x04b\x02\x10\x01' In case arrow custom types were used, the schema will require extensions for those types to be actually usable, for this reason the schema is also available as an `Extended Expression`_ including -all the extensions types:: +all the extension types: - >>> print(substrait_schema.expression) - b'"\x14\n\x01x\n\x01y\x12\x0c\n\x04*\x02\x10\x01\n\x04b\x02\x10\x01:\x19\x10,*\x15Acero 17.0.0' +.. code-block:: python + + >>> print(bytes(substrait_schema.expression)) + b'"\x14\n\x01x\n\x01y\x12\x0c\n\x04*\x02\x10\x01\n\x04b\x02\x10\x01:\x19\x10,*\x15Acero ...' If ``Substrait Python`` is installed, the schema can also be converted to -a ``substrait-python`` object:: +a ``substrait-python`` object: + +.. code-block:: python - >>> print(substrait_schema.to_pysubstrait()) + >>> print(substrait_schema.to_pysubstrait()) # doctest: +SKIP version { minor_number: 44 - producer: "Acero 17.0.0" + producer: "Acero ..." } base_schema { names: "x" @@ -92,33 +97,33 @@ Arrow compute expressions can be encoded and decoded using the .. code-block:: python - import pyarrow as pa - import pyarrow.compute as pa - import pyarrow.substrait as pa_substrait - - arrow_schema = pa.schema([ - pa.field("x", pa.int32()), - pa.field("y", pa.int32()) - ]) - - substrait_expr = pa_substrait.serialize_expressions( - exprs=[pc.field("x") + pc.field("y")], - names=["total"], - schema=arrow_schema - ) + >>> import pyarrow.compute as pc + >>> arrow_schema = pa.schema([ + ... pa.field("x", pa.int32()), + ... pa.field("y", pa.int32()) + ... ]) + >>> substrait_expr = pa_substrait.serialize_expressions( + ... exprs=[pc.field("x") + pc.field("y")], + ... names=["total"], + ... schema=arrow_schema + ... 
) The result of encoding to substrait an expression will be the -protobuf ``ExtendedExpression`` message data itself:: +protobuf ``ExtendedExpression`` message data itself: + +.. code-block:: python >>> print(bytes(substrait_expr)) - b'\nZ\x12Xhttps://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml\x12\x07\x1a\x05\x1a\x03add\x1a>\n5\x1a3\x1a\x04*\x02\x10\x01"\n\x1a\x08\x12\x06\n\x02\x12\x00"\x00"\x0c\x1a\n\x12\x08\n\x04\x12\x02\x08\x01"\x00*\x11\n\x08overflow\x12\x05ERROR\x1a\x05total"\x14\n\x01x\n\x01y\x12\x0c\n\x04*\x02\x10\x01\n\x04*\x02\x10\x01:\x19\x10,*\x15Acero 17.0.0' + b'\nZ\x12Xhttps://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml\x12\x07\x1a\x05\x1a\x03add\x1a>\n5\x1a3\x1a\x04*\x02\x10\x01"\n\x1a\x08\x12\x06\n\x02\x12\x00"\x00"\x0c\x1a\n\x12\x08\n\x04\x12\x02\x08\x01"\x00*\x11\n\x08overflow\x12\x05ERROR\x1a\x05total"\x14\n\x01x\n\x01y\x12\x0c\n\x04*\x02\x10\x01\n\x04*\x02\x10\x01:\x19\x10,*\x15Acero ...' So in case a ``Substrait Python`` object is required, the expression -has to be decoded from ``substrait-python`` itself:: +has to be decoded from ``substrait-python`` itself: + +.. code-block:: python - >>> import substrait - >>> pysubstrait_expr = substrait.proto.ExtendedExpression.FromString(substrait_expr) - >>> print(pysubstrait_expr) + >>> import substrait # doctest: +SKIP + >>> pysubstrait_expr = substrait.proto.ExtendedExpression.FromString(substrait_expr) # doctest: +SKIP + >>> print(pysubstrait_expr) # doctest: +SKIP version { minor_number: 44 producer: "Acero 17.0.0" @@ -198,39 +203,33 @@ the expressions can be passed to the dataset scanner in the form of .. 
code-block:: python - import pyarrow.dataset as ds - import pyarrow.substrait as pa_substrait - - # Use substrait-python to create the queries - from substrait import proto - - dataset = ds.dataset("./data/index-0.parquet") - substrait_schema = pa_substrait.serialize_schema(dataset.schema).to_pysubstrait() - - # SELECT project_name FROM dataset WHERE project_name = 'pyarrow' - - projection = proto.ExtendedExpression(referred_expr=[ - {"expression": {"selection": {"direct_reference": {"struct_field": {"field": 0}}}}, - "output_names": ["project_name"]} - ]) - projection.MergeFrom(substrait_schema) - - filtering = proto.ExtendedExpression( - extension_uris=[{"extension_uri_anchor": 99, "uri": "/functions_comparison.yaml"}], - extensions=[{"extension_function": {"extension_uri_reference": 99, "function_anchor": 199, "name": "equal:any1_any1"}}], - referred_expr=[ - {"expression": {"scalar_function": {"function_reference": 199, "arguments": [ - {"value": {"selection": {"direct_reference": {"struct_field": {"field": 0}}}}}, - {"value": {"literal": {"string": "pyarrow"}}} - ], "output_type": {"bool": {"nullability": False}}}}} - ] - ) - filtering.MergeFrom(substrait_schema) - - results = dataset.scanner( - columns=pa.substrait.BoundExpressions.from_substrait(projection), - filter=pa.substrait.BoundExpressions.from_substrait(filtering) - ).head(5) + >>> import pyarrow.dataset as ds # doctest: +SKIP + >>> import pyarrow.substrait as pa_substrait # doctest: +SKIP + >>> # Use substrait-python to create the queries + >>> from substrait import proto # doctest: +SKIP + >>> dataset = ds.dataset("./data/index-0.parquet") # doctest: +SKIP + >>> substrait_schema = pa_substrait.serialize_schema(dataset.schema).to_pysubstrait() # doctest: +SKIP + >>> # SELECT project_name FROM dataset WHERE project_name = 'pyarrow' + >>> projection = proto.ExtendedExpression(referred_expr=[ # doctest: +SKIP + ... 
{"expression": {"selection": {"direct_reference": {"struct_field": {"field": 0}}}}, + ... "output_names": ["project_name"]} + ... ]) + >>> projection.MergeFrom(substrait_schema) # doctest: +SKIP + >>> filtering = proto.ExtendedExpression( # doctest: +SKIP + ... extension_uris=[{"extension_uri_anchor": 99, "uri": "/functions_comparison.yaml"}], + ... extensions=[{"extension_function": {"extension_uri_reference": 99, "function_anchor": 199, "name": "equal:any1_any1"}}], + ... referred_expr=[ + ... {"expression": {"scalar_function": {"function_reference": 199, "arguments": [ + ... {"value": {"selection": {"direct_reference": {"struct_field": {"field": 0}}}}}, + ... {"value": {"literal": {"string": "pyarrow"}}} + ... ], "output_type": {"bool": {"nullability": False}}}}} + ... ] + ... ) + >>> filtering.MergeFrom(substrait_schema) # doctest: +SKIP + >>> results = dataset.scanner( # doctest: +SKIP + ... columns=pa.substrait.BoundExpressions.from_substrait(projection), + ... filter=pa.substrait.BoundExpressions.from_substrait(filtering) + ... ).head(5) .. code-block:: text diff --git a/docs/source/python/interchange_protocol.rst b/docs/source/python/interchange_protocol.rst index 2a5ec8afede7..efb10d2ab52e 100644 --- a/docs/source/python/interchange_protocol.rst +++ b/docs/source/python/interchange_protocol.rst @@ -29,7 +29,6 @@ data type. The protocol also has missing data support and it supports chunking, meaning accessing the data in “batches” of rows. - The Python dataframe interchange protocol is designed by the `Consortium for Python Data API Standards `_ in order to enable data interchange between dataframe @@ -43,7 +42,7 @@ From PyArrow to other libraries: ``__dataframe__()`` method The ``__dataframe__()`` method creates a new exchange object that the consumer library can take and construct an object of it's own. -.. code-block:: +.. code-block:: python >>> import pyarrow as pa >>> table = pa.table({"n_attendees": [100, 10, 1]}) @@ -65,7 +64,7 @@ protocol. 
We can for example take a pandas dataframe and construct a PyArrow table with the use of the interchange protocol: -.. code-block:: +.. code-block:: python >>> import pyarrow >>> from pyarrow.interchange import from_dataframe @@ -90,18 +89,18 @@ PyArrow table with the use of the interchange protocol: We can do the same with a polars dataframe: -.. code-block:: +.. code-block:: python - >>> import polars as pl - >>> from datetime import datetime - >>> arr = [datetime(2023, 5, 20, 10, 0), + >>> import polars as pl # doctest: +SKIP + >>> from datetime import datetime # doctest: +SKIP + >>> arr = [datetime(2023, 5, 20, 10, 0), # doctest: +SKIP ... datetime(2023, 5, 20, 11, 0), ... datetime(2023, 5, 20, 13, 30)] - >>> df = pl.DataFrame({ + >>> df = pl.DataFrame({ # doctest: +SKIP ... 'Talk': ['About Polars','Intro into PyArrow','Coding in Rust'], ... 'Time': arr, ... }) - >>> df + >>> df # doctest: +SKIP shape: (3, 2) ┌────────────────────┬─────────────────────┐ │ Talk ┆ Time │ @@ -112,7 +111,7 @@ We can do the same with a polars dataframe: │ Intro into PyArrow ┆ 2023-05-20 11:00:00 │ │ Coding in Rust ┆ 2023-05-20 13:30:00 │ └────────────────────┴─────────────────────┘ - >>> from_dataframe(df) + >>> from_dataframe(df) # doctest: +SKIP pyarrow.Table Talk: large_string Time: timestamp[us] diff --git a/docs/source/python/ipc.rst b/docs/source/python/ipc.rst index f55e8f8bc5dc..8f963639689f 100644 --- a/docs/source/python/ipc.rst +++ b/docs/source/python/ipc.rst @@ -15,16 +15,6 @@ .. specific language governing permissions and limitations .. under the License. -.. ipython:: python - :suppress: - - # set custom tmp working directory for files that create data - import os - import tempfile - - orig_working_dir = os.getcwd() - temp_working_dir = tempfile.mkdtemp(prefix="pyarrow-") - os.chdir(temp_working_dir) .. currentmodule:: pyarrow @@ -54,32 +44,31 @@ Using streams First, let's create a small record batch: -.. ipython:: python - - import pyarrow as pa +.. 
code-block:: python - data = [ - pa.array([1, 2, 3, 4]), - pa.array(['foo', 'bar', 'baz', None]), - pa.array([True, None, False, True]) - ] - - batch = pa.record_batch(data, names=['f0', 'f1', 'f2']) - batch.num_rows - batch.num_columns + >>> import pyarrow as pa + >>> data = [ + ... pa.array([1, 2, 3, 4]), + ... pa.array(['foo', 'bar', 'baz', None]), + ... pa.array([True, None, False, True]) + ... ] + >>> batch = pa.record_batch(data, names=['f0', 'f1', 'f2']) + >>> batch.num_rows + 4 + >>> batch.num_columns + 3 Now, we can begin writing a stream containing some number of these batches. For this we use :class:`~pyarrow.RecordBatchStreamWriter`, which can write to a writeable ``NativeFile`` object or a writeable Python object. For convenience, this one can be created with :func:`~pyarrow.ipc.new_stream`: -.. ipython:: python - - sink = pa.BufferOutputStream() +.. code-block:: python - with pa.ipc.new_stream(sink, batch.schema) as writer: - for i in range(5): - writer.write_batch(batch) + >>> sink = pa.BufferOutputStream() + >>> with pa.ipc.new_stream(sink, batch.schema) as writer: + ... for i in range(5): + ... writer.write_batch(batch) Here we used an in-memory Arrow buffer stream (``sink``), but this could have been a socket or some other IO sink. @@ -88,29 +77,34 @@ When creating the ``StreamWriter``, we pass the schema, since the schema (column names and types) must be the same for all of the batches sent in this particular stream. Now we can do: -.. ipython:: python +.. code-block:: python - buf = sink.getvalue() - buf.size + >>> buf = sink.getvalue() + >>> buf.size + 1984 Now ``buf`` contains the complete stream as an in-memory byte buffer. We can read such a stream with :class:`~pyarrow.RecordBatchStreamReader` or the convenience function ``pyarrow.ipc.open_stream``: -.. ipython:: python - - with pa.ipc.open_stream(buf) as reader: - schema = reader.schema - batches = [b for b in reader] +.. 
code-block:: python - schema - len(batches) + >>> with pa.ipc.open_stream(buf) as reader: + ... schema = reader.schema + ... batches = [b for b in reader] + >>> schema + f0: int64 + f1: string + f2: bool + >>> len(batches) + 5 We can check the returned batches are the same as the original input: -.. ipython:: python +.. code-block:: python - batches[0].equals(batch) + >>> batches[0].equals(batch) + True An important point is that if the input source supports zero-copy reads (e.g. like a memory map, or ``pyarrow.BufferReader``), then the returned @@ -123,35 +117,36 @@ The :class:`~pyarrow.RecordBatchFileWriter` has the same API as :class:`~pyarrow.RecordBatchStreamWriter`. You can create one with :func:`~pyarrow.ipc.new_file`: -.. ipython:: python +.. code-block:: python - sink = pa.BufferOutputStream() - - with pa.ipc.new_file(sink, batch.schema) as writer: - for i in range(10): - writer.write_batch(batch) - - buf = sink.getvalue() - buf.size + >>> sink = pa.BufferOutputStream() + >>> with pa.ipc.new_file(sink, batch.schema) as writer: + ... for i in range(10): + ... writer.write_batch(batch) + >>> buf = sink.getvalue() + >>> buf.size + 4226 The difference between :class:`~pyarrow.RecordBatchFileReader` and :class:`~pyarrow.RecordBatchStreamReader` is that the input source must have a ``seek`` method for random access. The stream reader only requires read operations. We can also use the :func:`~pyarrow.ipc.open_file` method to open a file: -.. ipython:: python +.. code-block:: python - with pa.ipc.open_file(buf) as reader: - num_record_batches = reader.num_record_batches - b = reader.get_batch(3) + >>> with pa.ipc.open_file(buf) as reader: + ... num_record_batches = reader.num_record_batches + ... b = reader.get_batch(3) Because we have access to the entire payload, we know the number of record batches in the file, and can read any at random. -.. ipython:: python +.. 
code-block:: python - num_record_batches - b.equals(batch) + >>> num_record_batches + 10 + >>> b.equals(batch) + True Reading from Stream and File Format for pandas ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -160,12 +155,17 @@ The stream and file reader classes have a special ``read_pandas`` method to simplify reading multiple record batches and converting them to a single DataFrame output: -.. ipython:: python - - with pa.ipc.open_file(buf) as reader: - df = reader.read_pandas() +.. code-block:: python - df[:5] + >>> with pa.ipc.open_file(buf) as reader: + ... df = reader.read_pandas() + >>> df[:5] + f0 f1 f2 + 0 1 foo True + 1 2 bar None + 2 3 baz False + 3 4 NaN True + 4 1 foo True Efficiently Writing and Reading Arrow Data ------------------------------------------ @@ -183,18 +183,16 @@ that can be used to write batches of data to that file. For example to write an array of 10M integers, we could write it in 1000 chunks of 10000 entries: -.. ipython:: python +.. code-block:: python - BATCH_SIZE = 10000 - NUM_BATCHES = 1000 - - schema = pa.schema([pa.field('nums', pa.int32())]) - - with pa.OSFile('bigfile.arrow', 'wb') as sink: - with pa.ipc.new_file(sink, schema) as writer: - for row in range(NUM_BATCHES): - batch = pa.record_batch([pa.array(range(BATCH_SIZE), type=pa.int32())], schema) - writer.write(batch) + >>> BATCH_SIZE = 10000 + >>> NUM_BATCHES = 1000 + >>> schema = pa.schema([pa.field('nums', pa.int32())]) + >>> with pa.OSFile('bigfile.arrow', 'wb') as sink: + ... with pa.ipc.new_file(sink, schema) as writer: + ... for row in range(NUM_BATCHES): + ... batch = pa.record_batch([pa.array(range(BATCH_SIZE), type=pa.int32())], schema) + ... writer.write(batch) record batches support multiple columns, so in practice we always write the equivalent of a :class:`~pyarrow.Table`. 
@@ -206,13 +204,14 @@ by directly mapping the data from disk and avoid allocating any new memory on re Under normal conditions, reading back our file will consume a few hundred megabytes of memory: -.. ipython:: python - - with pa.OSFile('bigfile.arrow', 'rb') as source: - loaded_array = pa.ipc.open_file(source).read_all() +.. code-block:: python - print("LEN:", len(loaded_array)) - print("RSS: {}MB".format(pa.total_allocated_bytes() >> 20)) + >>> with pa.OSFile('bigfile.arrow', 'rb') as source: + ... loaded_array = pa.ipc.open_file(source).read_all() + >>> print("LEN:", len(loaded_array)) + LEN: 10000000 + >>> print("RSS: {}MB".format(pa.total_allocated_bytes() >> 20)) + RSS: 38MB To more efficiently read big data from disk, we can memory map the file, so that Arrow can directly reference the data mapped from disk and avoid having to @@ -221,12 +220,14 @@ In such case the operating system will be able to page in the mapped memory lazily and page it out without any write back cost when under pressure, allowing to more easily read arrays bigger than the total memory. -.. ipython:: python +.. code-block:: python - with pa.memory_map('bigfile.arrow', 'rb') as source: - loaded_array = pa.ipc.open_file(source).read_all() - print("LEN:", len(loaded_array)) - print("RSS: {}MB".format(pa.total_allocated_bytes() >> 20)) + >>> with pa.memory_map('bigfile.arrow', 'rb') as source: + ... loaded_array = pa.ipc.open_file(source).read_all() + >>> print("LEN:", len(loaded_array)) + LEN: 10000000 + >>> print("RSS: {}MB".format(pa.total_allocated_bytes() >> 20)) + RSS: 0MB .. note:: diff --git a/docs/source/python/json.rst b/docs/source/python/json.rst index 277b8e134947..53aa3536a362 100644 --- a/docs/source/python/json.rst +++ b/docs/source/python/json.rst @@ -47,18 +47,20 @@ Usage JSON reading functionality is available through the :mod:`pyarrow.json` module. 
In many cases, you will simply call the :func:`read_json` function -with the file path you want to read from:: +with the file path you want to read from: + +.. code-block:: python >>> from pyarrow import json - >>> fn = 'my_data.json' - >>> table = json.read_json(fn) - >>> table + >>> fn = 'my_data.json' # doctest: +SKIP + >>> table = json.read_json(fn) # doctest: +SKIP + >>> table # doctest: +SKIP pyarrow.Table a: int64 b: double c: string d: bool - >>> table.to_pandas() + >>> table.to_pandas() # doctest: +SKIP a b c d 0 1 2.0 foo False 1 4 -5.5 None True @@ -89,17 +91,19 @@ Thus, reading this JSON file: {"a": [1, 2], "b": {"c": true, "d": "1991-02-03"}} {"a": [3, 4, 5], "b": {"c": false, "d": "2019-04-01"}} -returns the following data:: +returns the following data: + +.. code-block:: python - >>> table = json.read_json("my_data.json") - >>> table + >>> table = json.read_json("my_data.json") # doctest: +SKIP + >>> table # doctest: +SKIP pyarrow.Table a: list child 0, item: int64 b: struct child 0, c: bool child 1, d: timestamp[s] - >>> table.to_pandas() + >>> table.to_pandas() # doctest: +SKIP a b 0 [1, 2] {'c': True, 'd': 1991-02-03 00:00:00} 1 [3, 4, 5] {'c': False, 'd': 2019-04-01 00:00:00} diff --git a/docs/source/python/memory.rst b/docs/source/python/memory.rst index 029d30cc1b69..3bb7c57f3a9c 100644 --- a/docs/source/python/memory.rst +++ b/docs/source/python/memory.rst @@ -52,14 +52,15 @@ A :class:`Buffer` can be created from any Python object implementing the buffer protocol by calling the :func:`py_buffer` function. Let's consider a bytes object: -.. ipython:: python +.. 
code-block:: python - import pyarrow as pa - - data = b'abcdefghijklmnopqrstuvwxyz' - buf = pa.py_buffer(data) - buf - buf.size + >>> import pyarrow as pa + >>> data = b'abcdefghijklmnopqrstuvwxyz' + >>> buf = pa.py_buffer(data) + >>> buf + + >>> buf.size + 26 Creating a Buffer in this way does not allocate any memory; it is a zero-copy view on the memory exported from the ``data`` bytes object. @@ -70,16 +71,18 @@ referenced using the :func:`foreign_buffer` function. Buffers can be used in circumstances where a Python buffer or memoryview is required, and such conversions are zero-copy: -.. ipython:: python +.. code-block:: python - memoryview(buf) + >>> memoryview(buf) + The Buffer's :meth:`~Buffer.to_pybytes` method converts the Buffer's data to a Python bytestring (thus making a copy of the data): -.. ipython:: python +.. code-block:: python - buf.to_pybytes() + >>> buf.to_pybytes() + b'abcdefghijklmnopqrstuvwxyz' Memory Pools ------------ @@ -88,26 +91,30 @@ All memory allocations and deallocations (like ``malloc`` and ``free`` in C) are tracked in an instance of :class:`MemoryPool`. This means that we can then precisely track amount of memory that has been allocated: -.. ipython:: python +.. code-block:: python - pa.total_allocated_bytes() + >>> pa.total_allocated_bytes() + 0 Let's allocate a resizable :class:`Buffer` from the default pool: -.. ipython:: python +.. code-block:: python - buf = pa.allocate_buffer(1024, resizable=True) - pa.total_allocated_bytes() - buf.resize(2048) - pa.total_allocated_bytes() + >>> buf = pa.allocate_buffer(1024, resizable=True) + >>> pa.total_allocated_bytes() + 1024 + >>> buf.resize(2048) + >>> pa.total_allocated_bytes() + 2048 The default allocator requests memory in a minimum increment of 64 bytes. If the buffer is garbage-collected, all of the memory is freed: -.. ipython:: python +.. 
code-block:: python - buf = None - pa.total_allocated_bytes() + >>> buf = None + >>> pa.total_allocated_bytes() + 0 Besides the default built-in memory pool, there may be additional memory pools to choose from (such as `jemalloc `_) @@ -182,11 +189,12 @@ The :func:`~pyarrow.input_stream` function allows creating a readable * If passed a :class:`~pyarrow.Buffer` or a ``memoryview`` object, a :class:`~pyarrow.BufferReader` will be returned: - .. ipython:: python + .. code-block:: python - buf = memoryview(b"some data") - stream = pa.input_stream(buf) - stream.read(4) + >>> buf = memoryview(b"some data") + >>> stream = pa.input_stream(buf) + >>> stream.read(4) + b'some' * If passed a string or file path, it will open the given file on disk for reading, creating a :class:`~pyarrow.OSFile`. Optionally, the file @@ -194,14 +202,15 @@ The :func:`~pyarrow.input_stream` function allows creating a readable such as ``.gz``, its contents will automatically be decompressed on reading. - .. ipython:: python - - import gzip - with gzip.open('example.gz', 'wb') as f: - f.write(b'some data\n' * 3) + .. code-block:: python - stream = pa.input_stream('example.gz') - stream.read() + >>> import gzip + >>> with gzip.open('example.gz', 'wb') as f: + ... f.write(b'some data\n' * 3) + 30 + >>> stream = pa.input_stream('example.gz') + >>> stream.read() + b'some data\nsome data\nsome data\n' * If passed a Python file object, it will wrapped in a :class:`PythonFile` such that the Arrow C++ libraries can read data from it (at the expense @@ -215,13 +224,14 @@ and allows creating a writable :class:`~pyarrow.NativeFile`. It has the same features as explained above for :func:`~pyarrow.input_stream`, such as being able to write to buffers or do on-the-fly compression. -.. ipython:: python +.. code-block:: python - with pa.output_stream('example1.dat') as stream: - stream.write(b'some data') - - f = open('example1.dat', 'rb') - f.read() + >>> with pa.output_stream('example1.dat') as stream: + ... 
stream.write(b'some data') + 9 + >>> with open('example1.dat', 'rb') as f: + ... f.read() + b'some data' On-Disk and Memory Mapped Files @@ -231,17 +241,19 @@ PyArrow includes two ways to interact with data on disk: standard operating system-level file APIs, and memory-mapped files. In regular Python we can write: -.. ipython:: python +.. code-block:: python - with open('example2.dat', 'wb') as f: - f.write(b'some example data') + >>> with open('example2.dat', 'wb') as f: + ... f.write(b'some example data') + 17 Using pyarrow's :class:`~pyarrow.OSFile` class, you can write: -.. ipython:: python +.. code-block:: python - with pa.OSFile('example3.dat', 'wb') as f: - f.write(b'some example data') + >>> with pa.OSFile('example3.dat', 'wb') as f: + ... f.write(b'some example data') + 17 For reading files, you can use :class:`~pyarrow.OSFile` or :class:`~pyarrow.MemoryMappedFile`. The difference between these is that @@ -249,50 +261,52 @@ For reading files, you can use :class:`~pyarrow.OSFile` or objects. In reads from memory maps, the library constructs a buffer referencing the mapped memory without any memory allocation or copying: -.. ipython:: python +.. code-block:: python - file_obj = pa.OSFile('example2.dat') - mmap = pa.memory_map('example3.dat') - file_obj.read(4) - mmap.read(4) + >>> file_obj = pa.OSFile('example2.dat') + >>> mmap = pa.memory_map('example3.dat') + >>> file_obj.read(4) + b'some' + >>> mmap.read(4) + b'some' The ``read`` method implements the standard Python file ``read`` API. To read into Arrow Buffer objects, use ``read_buffer``: -.. ipython:: python +.. code-block:: python - mmap.seek(0) - buf = mmap.read_buffer(4) - print(buf) - buf.to_pybytes() + >>> mmap.seek(0) + 0 + >>> buf = mmap.read_buffer(4) + >>> buf + + >>> buf.to_pybytes() + b'some' Many tools in PyArrow, particular the Apache Parquet interface and the file and stream messaging tools, are more efficient when used with these ``NativeFile`` types than with normal Python file objects. 
-.. ipython:: python - :suppress: - - buf = mmap = file_obj = None - !rm example.dat - !rm example2.dat - In-Memory Reading and Writing ----------------------------- To assist with serialization and deserialization of in-memory data, we have file interfaces that can read and write to Arrow Buffers. -.. ipython:: python - - writer = pa.BufferOutputStream() - writer.write(b'hello, friends') - - buf = writer.getvalue() - buf - buf.size - reader = pa.BufferReader(buf) - reader.seek(7) - reader.read(7) +.. code-block:: python + + >>> writer = pa.BufferOutputStream() + >>> writer.write(b'hello, friends') + 14 + >>> buf = writer.getvalue() + >>> buf + + >>> buf.size + 14 + >>> reader = pa.BufferReader(buf) + >>> reader.seek(7) + 7 + >>> reader.read(7) + b'friends' These have similar semantics to Python's built-in ``io.BytesIO``. diff --git a/docs/source/python/numpy.rst b/docs/source/python/numpy.rst index 870f9cb73479..01fb1982d598 100644 --- a/docs/source/python/numpy.rst +++ b/docs/source/python/numpy.rst @@ -29,14 +29,14 @@ NumPy to Arrow To convert a NumPy array to Arrow, one can simply call the :func:`pyarrow.array` factory function. -.. code-block:: pycon +.. code-block:: python >>> import numpy as np >>> import pyarrow as pa >>> data = np.arange(10, dtype='int16') >>> arr = pa.array(data) >>> arr - + [ 0, 1, @@ -61,7 +61,7 @@ for use with NumPy using the :meth:`~pyarrow.Array.to_numpy` method. This is limited to primitive types for which NumPy has the same physical representation as Arrow, and assuming the Arrow data has no nulls. -.. code-block:: pycon +.. 
code-block:: python >>> import numpy as np >>> import pyarrow as pa diff --git a/docs/source/python/orc.rst b/docs/source/python/orc.rst index 7c16a94673a8..8f3ae75e1013 100644 --- a/docs/source/python/orc.rst +++ b/docs/source/python/orc.rst @@ -37,7 +37,9 @@ Obtaining pyarrow with ORC Support -------------------------------------- If you installed ``pyarrow`` with pip or conda, it should be built with ORC -support bundled:: +support bundled: + +.. code-block:: python >>> from pyarrow import orc @@ -52,7 +54,9 @@ Reading and Writing Single Files The functions :func:`~.orc.read_table` and :func:`~.orc.write_table` read and write the :ref:`pyarrow.Table ` object, respectively. -Let's look at a simple table:: +Let's look at a simple table: + +.. code-block:: python >>> import numpy as np >>> import pyarrow as pa @@ -65,19 +69,25 @@ Let's look at a simple table:: ... } ... ) -We write this to ORC format with ``write_table``:: +We write this to ORC format with ``write_table``: + +.. code-block:: python >>> from pyarrow import orc >>> orc.write_table(table, 'example.orc') This creates a single ORC file. In practice, an ORC dataset may consist of many files in many directories. We can read a single file back with -``read_table``:: +``read_table``: + +.. code-block:: python >>> table2 = orc.read_table('example.orc') You can pass a subset of columns to read, which can be much faster than reading -the whole file (due to the columnar layout):: +the whole file (due to the columnar layout): + +.. code-block:: python >>> orc.read_table('example.orc', columns=['one', 'three']) pyarrow.Table @@ -120,11 +130,13 @@ See the :func:`~pyarrow.orc.write_table()` docstring for more details. Finer-grained Reading and Writing --------------------------------- -``read_table`` uses the :class:`~.ORCFile` class, which has other features:: +``read_table`` uses the :class:`~.ORCFile` class, which has other features: + +.. 
code-block:: python >>> orc_file = orc.ORCFile('example.orc') >>> orc_file.metadata - + -- metadata -- >>> orc_file.schema one: double @@ -139,7 +151,9 @@ As you can learn more in the `Apache ORC format `_, an ORC file consists of multiple stripes. ``read_table`` will read all of the stripes and concatenate them into a single table. You can read individual stripes with -``read_stripe``:: +``read_stripe``: + +.. code-block:: python >>> orc_file.nstripes 1 @@ -148,8 +162,14 @@ concatenate them into a single table. You can read individual stripes with one: double two: string three: bool + ---- + one: [-1,nan,2.5] + two: ["foo","bar","baz"] + three: [true,false,true] -We can write an ORC file using ``ORCWriter``:: +We can write an ORC file using ``ORCWriter``: + +.. code-block:: python >>> with orc.ORCWriter('example2.orc') as writer: ... writer.write(table) @@ -159,12 +179,14 @@ Compression The data pages within a column in a row group can be compressed after the encoding passes (dictionary, RLE encoding). In PyArrow we don't use compression -by default, but Snappy, ZSTD, Zlib, and LZ4 are also supported:: +by default, but Snappy, ZSTD, Zlib, and LZ4 are also supported: + +.. code-block:: python - >>> orc.write_table(table, where, compression='uncompressed') - >>> orc.write_table(table, where, compression='zlib') - >>> orc.write_table(table, where, compression='zstd') - >>> orc.write_table(table, where, compression='snappy') + >>> orc.write_table(table, 'example.orc', compression='uncompressed') + >>> orc.write_table(table, 'example.orc', compression='zlib') + >>> orc.write_table(table, 'example.orc', compression='zstd') + >>> orc.write_table(table, 'example.orc', compression='snappy') Snappy generally results in better performance, while Zlib may yield smaller files. 
@@ -173,12 +195,14 @@ Reading from cloud storage -------------------------- In addition to local files, pyarrow supports other filesystems, such as cloud -filesystems, through the ``filesystem`` keyword:: +filesystems, through the ``filesystem`` keyword: + +.. code-block:: python >>> from pyarrow import fs - >>> s3 = fs.S3FileSystem(region="us-east-2") - >>> table = orc.read_table("bucket/object/key/prefix", filesystem=s3) + >>> s3 = fs.S3FileSystem(region="us-east-2") # doctest: +SKIP + >>> table = orc.read_table("bucket/object/key/prefix", filesystem=s3) # doctest: +SKIP .. seealso:: :ref:`Documentation for filesystems `. diff --git a/docs/source/python/pandas.rst b/docs/source/python/pandas.rst index 23a4b73bd096..7aacaaff60cd 100644 --- a/docs/source/python/pandas.rst +++ b/docs/source/python/pandas.rst @@ -31,10 +31,10 @@ to them. To follow examples in this document, make sure to run: -.. ipython:: python +.. code-block:: python - import pandas as pd - import pyarrow as pa + >>> import pandas as pd + >>> import pyarrow as pa DataFrames ---------- @@ -50,17 +50,14 @@ Conversion from a Table to a DataFrame is done by calling .. code-block:: python - import pyarrow as pa - import pandas as pd - - df = pd.DataFrame({"a": [1, 2, 3]}) - # Convert from pandas to Arrow - table = pa.Table.from_pandas(df) - # Convert back to pandas - df_new = table.to_pandas() + >>> df = pd.DataFrame({"a": [1, 2, 3]}) + >>> # Convert from pandas to Arrow + >>> table = pa.Table.from_pandas(df) + >>> # Convert back to pandas + >>> df_new = table.to_pandas() - # Infer Arrow schema from pandas - schema = pa.Schema.from_pandas(df) + >>> # Infer Arrow schema from pandas + >>> schema = pa.Schema.from_pandas(df) By default ``pyarrow`` tries to preserve and restore the ``.index`` data as accurately as possible. 
See the section below for more about @@ -169,24 +166,52 @@ columns are converted to :ref:`Arrow dictionary arrays `, a special array type optimized to handle repeated and limited number of possible values. -.. ipython:: python - - df = pd.DataFrame({"cat": pd.Categorical(["a", "b", "c", "a", "b", "c"])}) - df.cat.dtype.categories - df +.. code-block:: python - table = pa.Table.from_pandas(df) - table + >>> df = pd.DataFrame({"cat": pd.Categorical(["a", "b", "c", "a", "b", "c"])}) + >>> df.cat.dtype.categories + Index(['a', 'b', 'c'], dtype='str') + >>> df + cat + 0 a + 1 b + 2 c + 3 a + 4 b + 5 c + >>> table = pa.Table.from_pandas(df) + >>> table + pyarrow.Table + cat: dictionary + ---- + cat: [ -- dictionary: + ["a","b","c"] -- indices: + [0,1,2,0,1,2]] We can inspect the :class:`~.ChunkedArray` of the created table and see the same categories of the Pandas DataFrame. -.. ipython:: python +.. code-block:: python - column = table[0] - chunk = column.chunk(0) - chunk.dictionary - chunk.indices + >>> column = table[0] + >>> chunk = column.chunk(0) + >>> chunk.dictionary + + [ + "a", + "b", + "c" + ] + >>> chunk.indices + + [ + 0, + 1, + 2, + 0, + 1, + 2 + ] Datetime (Timestamp) types ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -195,14 +220,23 @@ Datetime (Timestamp) types use the ``datetime64[ns]`` type in Pandas and are converted to an Arrow :class:`~.TimestampArray`. -.. ipython:: python - - df = pd.DataFrame({"datetime": pd.date_range("2020-01-01T00:00:00Z", freq="h", periods=3)}) - df.dtypes - df +.. 
code-block:: python - table = pa.Table.from_pandas(df) - table + >>> df = pd.DataFrame({"datetime": pd.date_range("2020-01-01T00:00:00Z", freq="h", periods=3)}) + >>> df.dtypes + datetime datetime64[us, UTC] + dtype: object + >>> df + datetime + 0 2020-01-01 00:00:00+00:00 + 1 2020-01-01 01:00:00+00:00 + 2 2020-01-01 02:00:00+00:00 + >>> table = pa.Table.from_pandas(df) + >>> table + pyarrow.Table + datetime: timestamp[us, tz=UTC] + ---- + datetime: [[2020-01-01 00:00:00.000000Z,2020-01-01 01:00:00.000000Z,2020-01-01 02:00:00.000000Z]] In this example the Pandas Timestamp is time zone aware (``UTC`` on this case), and this information is used to create the Arrow @@ -215,42 +249,54 @@ While dates can be handled using the ``datetime64[ns]`` type in pandas, some systems work with object arrays of Python's built-in ``datetime.date`` object: -.. ipython:: python +.. code-block:: python - from datetime import date - s = pd.Series([date(2018, 12, 31), None, date(2000, 1, 1)]) - s + >>> from datetime import date + >>> s = pd.Series([date(2018, 12, 31), None, date(2000, 1, 1)]) + >>> s + 0 2018-12-31 + 1 None + 2 2000-01-01 + dtype: object When converting to an Arrow array, the ``date32`` type will be used by default: -.. ipython:: python +.. code-block:: python - arr = pa.array(s) - arr.type - arr[0] + >>> arr = pa.array(s) + >>> arr.type + DataType(date32[day]) + >>> arr[0] + To use the 64-bit ``date64``, specify this explicitly: -.. ipython:: python +.. code-block:: python - arr = pa.array(s, type='date64') - arr.type + >>> arr = pa.array(s, type='date64') + >>> arr.type + DataType(date64[ms]) When converting back with ``to_pandas``, object arrays of ``datetime.date`` objects are returned: -.. ipython:: python +.. code-block:: python - arr.to_pandas() + >>> arr.to_pandas() + 0 2018-12-31 + 1 None + 2 2000-01-01 + dtype: object If you want to use NumPy's ``datetime64`` dtype instead, pass ``date_as_object=False``: -.. ipython:: python +.. 
code-block:: python - s2 = pd.Series(arr.to_pandas(date_as_object=False)) - s2.dtype + >>> s2 = pd.Series(arr.to_pandas(date_as_object=False)) + >>> s2.dtype + dtype('>> from datetime import time + >>> s = pd.Series([time(1, 1, 1), time(2, 2, 2)]) + >>> s + 0 01:01:01 + 1 02:02:02 + dtype: object + >>> arr = pa.array(s) + >>> arr.type + Time64Type(time64[us]) + >>> arr + + [ + 01:01:01.000000, + 02:02:02.000000 + ] When converting to pandas, arrays of ``datetime.time`` objects are returned: -.. ipython:: python +.. code-block:: python - arr.to_pandas() + >>> arr.to_pandas() + 0 01:01:01 + 1 02:02:02 + dtype: object Nullable types -------------- @@ -294,7 +351,7 @@ missing values are present: >>> arr = pa.array([1, 2, None]) >>> arr - + [ 1, 2, @@ -321,7 +378,6 @@ round trip conversion for those: >>> table = pa.table(df) >>> table - Out[32]: pyarrow.Table a: int64 ---- @@ -371,22 +427,21 @@ dictionary becomes: .. code-block:: python - dtype_mapping = { - pa.int8(): pd.Int8Dtype(), - pa.int16(): pd.Int16Dtype(), - pa.int32(): pd.Int32Dtype(), - pa.int64(): pd.Int64Dtype(), - pa.uint8(): pd.UInt8Dtype(), - pa.uint16(): pd.UInt16Dtype(), - pa.uint32(): pd.UInt32Dtype(), - pa.uint64(): pd.UInt64Dtype(), - pa.bool_(): pd.BooleanDtype(), - pa.float32(): pd.Float32Dtype(), - pa.float64(): pd.Float64Dtype(), - pa.string(): pd.StringDtype(), - } - - df = table.to_pandas(types_mapper=dtype_mapping.get) + >>> dtype_mapping = { + ... pa.int8(): pd.Int8Dtype(), + ... pa.int16(): pd.Int16Dtype(), + ... pa.int32(): pd.Int32Dtype(), + ... pa.int64(): pd.Int64Dtype(), + ... pa.uint8(): pd.UInt8Dtype(), + ... pa.uint16(): pd.UInt16Dtype(), + ... pa.uint32(): pd.UInt32Dtype(), + ... pa.uint64(): pd.UInt64Dtype(), + ... pa.bool_(): pd.BooleanDtype(), + ... pa.float32(): pd.Float32Dtype(), + ... pa.float64(): pd.Float64Dtype(), + ... pa.string(): pd.StringDtype(), + ... 
} + >>> df = table.to_pandas(types_mapper=dtype_mapping.get) When using the pandas API for reading Parquet files (``pd.read_parquet(..)``), @@ -394,7 +449,7 @@ this can also be achieved by passing ``use_nullable_dtypes``: .. code-block:: python - df = pd.read_parquet(path, use_nullable_dtypes=True) + >>> df = pd.read_parquet(path, use_nullable_dtypes=True) # doctest: +SKIP Memory Usage and Zero Copy @@ -463,8 +518,8 @@ Used together, the call .. code-block:: python - df = table.to_pandas(split_blocks=True, self_destruct=True) - del table # not necessary, but a good practice + >>> df = table.to_pandas(split_blocks=True, self_destruct=True) + >>> del table # not necessary, but a good practice will yield significantly lower memory usage in some scenarios. Without these options, ``to_pandas`` will always double memory. diff --git a/docs/source/python/parquet.rst b/docs/source/python/parquet.rst index ebc67e7e7493..2c42d97f9895 100644 --- a/docs/source/python/parquet.rst +++ b/docs/source/python/parquet.rst @@ -44,9 +44,9 @@ Obtaining pyarrow with Parquet Support If you installed ``pyarrow`` with pip or conda, it should be built with Parquet support bundled: -.. ipython:: python +.. code-block:: python - import pyarrow.parquet as pq + >>> import pyarrow.parquet as pq If you are building ``pyarrow`` from source, you must use ``-DARROW_PARQUET=ON`` when compiling the C++ libraries and enable the Parquet extensions when @@ -62,47 +62,60 @@ read and write the :ref:`pyarrow.Table ` object, respectively. Let's look at a simple table: -.. ipython:: python - - import numpy as np - import pandas as pd - import pyarrow as pa +.. code-block:: python - df = pd.DataFrame({'one': [-1, np.nan, 2.5], - 'two': ['foo', 'bar', 'baz'], - 'three': [True, False, True]}, - index=list('abc')) - table = pa.Table.from_pandas(df) + >>> import numpy as np + >>> import pandas as pd + >>> import pyarrow as pa + >>> df = pd.DataFrame({'one': [-1, np.nan, 2.5], + ... 
'two': ['foo', 'bar', 'baz'], + ... 'three': [True, False, True]}, + ... index=list('abc')) + >>> table = pa.Table.from_pandas(df) We write this to Parquet format with ``write_table``: -.. ipython:: python +.. code-block:: python - import pyarrow.parquet as pq - pq.write_table(table, 'example.parquet') + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, 'example.parquet') This creates a single Parquet file. In practice, a Parquet dataset may consist of many files in many directories. We can read a single file back with ``read_table``: -.. ipython:: python +.. code-block:: python - table2 = pq.read_table('example.parquet') - table2.to_pandas() + >>> table2 = pq.read_table('example.parquet') + >>> table2.to_pandas() + one two three + a -1.0 foo True + b NaN bar False + c 2.5 baz True You can pass a subset of columns to read, which can be much faster than reading the whole file (due to the columnar layout): -.. ipython:: python +.. code-block:: python - pq.read_table('example.parquet', columns=['one', 'three']) + >>> pq.read_table('example.parquet', columns=['one', 'three']) + pyarrow.Table + one: double + three: bool + ---- + one: [[-1,null,2.5]] + three: [[true,false,true]] When reading a subset of columns from a file that used a Pandas dataframe as the source, we use ``read_pandas`` to maintain any additional index column data: -.. ipython:: python +.. code-block:: python - pq.read_pandas('example.parquet', columns=['two']).to_pandas() + >>> pq.read_pandas('example.parquet', columns=['two']).to_pandas() + two + a foo + b bar + c baz We do not need to use a string to specify the origin of the file. It can be any of: @@ -126,13 +139,13 @@ but won't help much with resident memory consumption. .. 
code-block:: python - >>> pq_array = pa.parquet.read_table("area1.parquet", memory_map=True) - >>> print("RSS: {}MB".format(pa.total_allocated_bytes() >> 20)) - RSS: 4299MB + >>> pq_array = pa.parquet.read_table(path, memory_map=True) # doctest: +SKIP + >>> print("RSS: {}MB".format(pa.total_allocated_bytes() >> 20)) # doctest: +SKIP + RSS: 4299MB - >>> pq_array = pa.parquet.read_table("area1.parquet", memory_map=False) - >>> print("RSS: {}MB".format(pa.total_allocated_bytes() >> 20)) - RSS: 4299MB + >>> pq_array = pa.parquet.read_table(path, memory_map=False) # doctest: +SKIP + >>> print("RSS: {}MB".format(pa.total_allocated_bytes() >> 20)) # doctest: +SKIP + RSS: 4299MB If you need to deal with Parquet data bigger than memory, the :ref:`dataset` and partitioning is probably what you are looking for. @@ -164,22 +177,25 @@ one or more special columns are added to keep track of the index (row labels). Storing the index takes extra space, so if your index is not valuable, you may choose to omit it by passing ``preserve_index=False`` -.. ipython:: python +.. code-block:: python - df = pd.DataFrame({'one': [-1, np.nan, 2.5], - 'two': ['foo', 'bar', 'baz'], - 'three': [True, False, True]}, - index=list('abc')) - df - table = pa.Table.from_pandas(df, preserve_index=False) + >>> df = pd.DataFrame({'one': [-1, np.nan, 2.5], + ... 'two': ['foo', 'bar', 'baz'], + ... 'three': [True, False, True]}, + ... index=list('abc')) + >>> table = pa.Table.from_pandas(df, preserve_index=False) Then we have: -.. ipython:: python +.. code-block:: python - pq.write_table(table, 'example_noindex.parquet') - t = pq.read_table('example_noindex.parquet') - t.to_pandas() + >>> pq.write_table(table, 'example_noindex.parquet') + >>> t = pq.read_table('example_noindex.parquet') + >>> t.to_pandas() + one two three + 0 -1.0 foo True + 1 NaN bar False + 2 2.5 baz True Here you see the index did not survive the round trip. 
@@ -188,11 +204,26 @@ Finer-grained Reading and Writing ``read_table`` uses the :class:`~.ParquetFile` class, which has other features: -.. ipython:: python +.. code-block:: python - parquet_file = pq.ParquetFile('example.parquet') - parquet_file.metadata - parquet_file.schema + >>> parquet_file = pq.ParquetFile('example.parquet') + >>> parquet_file.metadata + + created_by: parquet-cpp-arrow version ... + num_columns: 4 + num_rows: 3 + num_row_groups: 1 + format_version: 2.6 + serialized_size: ... + >>> parquet_file.schema + + required group field_id=-1 schema { + optional double field_id=-1 one; + optional binary field_id=-1 two (String); + optional boolean field_id=-1 three; + optional binary field_id=-1 __index_level_0__ (String); + } + As you can learn more in the `Apache Parquet format `_, a Parquet file consists of @@ -200,22 +231,33 @@ multiple row groups. ``read_table`` will read all of the row groups and concatenate them into a single table. You can read individual row groups with ``read_row_group``: -.. ipython:: python +.. code-block:: python - parquet_file.num_row_groups - parquet_file.read_row_group(0) + >>> parquet_file.num_row_groups + 1 + >>> parquet_file.read_row_group(0) + pyarrow.Table + one: double + two: large_string + three: bool + __index_level_0__: large_string + ---- + one: [[-1,null,2.5]] + two: [["foo","bar","baz"]] + three: [[true,false,true]] + __index_level_0__: [["a","b","c"]] We can similarly write a Parquet file with multiple row groups by using ``ParquetWriter``: -.. ipython:: python - - with pq.ParquetWriter('example2.parquet', table.schema) as writer: - for i in range(3): - writer.write_table(table) +.. code-block:: python - pf2 = pq.ParquetFile('example2.parquet') - pf2.num_row_groups + >>> with pq.ParquetWriter('example2.parquet', table.schema) as writer: + ... for i in range(3): + ... 
writer.write_table(table) + >>> pf2 = pq.ParquetFile('example2.parquet') + >>> pf2.num_row_groups + 3 Inspecting the Parquet File Metadata ------------------------------------ @@ -223,34 +265,73 @@ Inspecting the Parquet File Metadata The ``FileMetaData`` of a Parquet file can be accessed through :class:`~.ParquetFile` as shown above: -.. ipython:: python +.. code-block:: python - parquet_file = pq.ParquetFile('example.parquet') - metadata = parquet_file.metadata + >>> parquet_file = pq.ParquetFile('example.parquet') + >>> metadata = parquet_file.metadata + >>> metadata + + created_by: parquet-cpp-arrow version ... + num_columns: 4 + num_rows: 3 + num_row_groups: 1 + format_version: 2.6 + serialized_size: ... or can also be read directly using :func:`~parquet.read_metadata`: -.. ipython:: python +.. code-block:: python - metadata = pq.read_metadata('example.parquet') - metadata + >>> metadata = pq.read_metadata('example.parquet') + >>> metadata + + created_by: parquet-cpp-arrow version ... + num_columns: 4 + num_rows: 3 + num_row_groups: 1 + format_version: 2.6 + serialized_size: ... The returned ``FileMetaData`` object allows to inspect the `Parquet file metadata `__, such as the row groups and column chunk metadata and statistics: -.. ipython:: python - - metadata.row_group(0) - metadata.row_group(0).column(0) - -.. ipython:: python - :suppress: +.. code-block:: python - !rm example.parquet - !rm example_noindex.parquet - !rm example2.parquet - !rm example3.parquet + >>> metadata.row_group(0) + + num_columns: 4 + num_rows: 3 + total_byte_size: 290 + sorting_columns: () + >>> metadata.row_group(0).column(0) + + file_offset: 0 + file_path:... 
+ physical_type: DOUBLE + num_values: 3 + path_in_schema: one + is_stats_set: True + statistics: + + has_min_max: True + min: -1.0 + max: 2.5 + null_count: 1 + distinct_count: None + num_values: 2 + physical_type: DOUBLE + logical_type: None + converted_type (legacy): NONE + geo_statistics: + None + compression: SNAPPY + encodings: ('PLAIN', 'RLE', 'RLE_DICTIONARY') + has_dictionary_page: True + dictionary_page_offset: 4 + data_page_offset: 36 + total_compressed_size: 106 + total_uncompressed_size: 102 Data Type Handling ------------------ @@ -266,7 +347,19 @@ and improved performance for columns with many repeated string values. .. code-block:: python - pq.read_table(table, where, read_dictionary=['binary_c0', 'stringb_c2']) + >>> pq.read_table('example.parquet', read_dictionary=['two']) + pyarrow.Table + one: double + two: dictionary + three: bool + __index_level_0__: large_string + ---- + one: [[-1,null,2.5]] + two: [ -- dictionary: + ["foo","bar","baz"] -- indices: + [0,1,2]] + three: [[true,false,true]] + __index_level_0__: [["a","b","c"]] Storing timestamps ~~~~~~~~~~~~~~~~~~ @@ -282,7 +375,7 @@ the desired resolution: .. code-block:: python - pq.write_table(table, where, coerce_timestamps='ms') + >>> pq.write_table(table, 'example.parquet', coerce_timestamps='ms') If a cast to a lower resolution value may result in a loss of data, by default an exception will be raised. This can be suppressed by passing @@ -290,15 +383,15 @@ an exception will be raised. This can be suppressed by passing .. code-block:: python - pq.write_table(table, where, coerce_timestamps='ms', - allow_truncated_timestamps=True) + >>> pq.write_table(table, 'example.parquet', coerce_timestamps='ms', + ... allow_truncated_timestamps=True) Timestamps with nanoseconds can be stored without casting when using the more recent Parquet format version 2.6: .. 
code-block:: python - pq.write_table(table, where, version='2.6') + >>> pq.write_table(table, 'example.parquet', version='2.6') However, many Parquet readers do not yet support this newer format version, and therefore the default is to write version 1.0 files. When compatibility across @@ -313,7 +406,7 @@ this format, set the ``use_deprecated_int96_timestamps`` option to .. code-block:: python - pq.write_table(table, where, use_deprecated_int96_timestamps=True) + >>> pq.write_table(table, 'example.parquet', use_deprecated_int96_timestamps=True) Compression, Encoding, and File Compatibility --------------------------------------------- @@ -325,7 +418,7 @@ plain encoding. Whether dictionary encoding is used can be toggled using the .. code-block:: python - pq.write_table(table, where, use_dictionary=False) + >>> pq.write_table(table, 'example.parquet', use_dictionary=False) The data pages within a column in a row group can be compressed after the encoding passes (dictionary, RLE encoding). In PyArrow we use Snappy @@ -334,22 +427,25 @@ also supported: .. code-block:: python - pq.write_table(table, where, compression='snappy') - pq.write_table(table, where, compression='gzip') - pq.write_table(table, where, compression='brotli') - pq.write_table(table, where, compression='zstd') - pq.write_table(table, where, compression='lz4') - pq.write_table(table, where, compression='none') + >>> pq.write_table(table, 'example.parquet', compression='snappy') + >>> pq.write_table(table, 'example.parquet', compression='gzip') + >>> pq.write_table(table, 'example.parquet', compression='brotli') + >>> pq.write_table(table, 'example.parquet', compression='zstd') + >>> pq.write_table(table, 'example.parquet', compression='lz4') + >>> pq.write_table(table, 'example.parquet', compression='none') Snappy generally results in better performance, while Gzip may yield smaller files. +``'lz4_raw'`` is also accepted as an alias for ``'lz4'``. 
Both use the +LZ4_RAW codec as defined in the Parquet specification. + These settings can also be set on a per-column basis: .. code-block:: python - pq.write_table(table, where, compression={'foo': 'snappy', 'bar': 'gzip'}, - use_dictionary=['foo', 'bar']) + >>> pq.write_table(table, 'example.parquet', compression={'one': 'snappy', 'two': 'gzip'}, + ... use_dictionary=['one', 'two']) Partitioned Datasets (Multiple Files) ------------------------------------------------ @@ -390,9 +486,9 @@ added is to use the local filesystem. .. code-block:: python - # Local dataset write - pq.write_to_dataset(table, root_path='dataset_name', - partition_cols=['one', 'two']) + >>> # Local dataset write + >>> pq.write_to_dataset(table, root_path='dataset_name', + ... partition_cols=['one', 'two']) The root path in this case specifies the parent directory to which data will be saved. The partition columns are the column names by which to partition the @@ -405,11 +501,11 @@ individual table writes are wrapped using ``with`` statements so the .. code-block:: python - # Remote file-system example - from pyarrow.fs import HadoopFileSystem - fs = HadoopFileSystem(host, port, user=user, kerb_ticket=ticket_cache_path) - pq.write_to_dataset(table, root_path='dataset_name', - partition_cols=['one', 'two'], filesystem=fs) + >>> # Remote file-system example + >>> from pyarrow.fs import HadoopFileSystem # doctest: +SKIP + >>> fs = HadoopFileSystem(host, port, user=user, kerb_ticket=ticket_cache_path) # doctest: +SKIP + >>> pq.write_to_dataset(table, root_path='dataset_name', # doctest: +SKIP + ... partition_cols=['one', 'two'], filesystem=fs) Compatibility Note: if using ``pq.write_to_dataset`` to create a table that will then be used by HIVE then partition column values must be compatible with @@ -439,18 +535,19 @@ combine and write them manually: .. 
code-block:: python - # Write a dataset and collect metadata information of all written files - metadata_collector = [] - pq.write_to_dataset(table, root_path, metadata_collector=metadata_collector) + >>> # Write a dataset and collect metadata information of all written files + >>> metadata_collector = [] + >>> root_path = "dataset_name_1" + >>> pq.write_to_dataset(table, root_path, metadata_collector=metadata_collector) - # Write the ``_common_metadata`` parquet file without row groups statistics - pq.write_metadata(table.schema, root_path / '_common_metadata') + >>> # Write the ``_common_metadata`` parquet file without row groups statistics + >>> pq.write_metadata(table.schema, root_path + '/_common_metadata') - # Write the ``_metadata`` parquet file with row groups statistics of all files - pq.write_metadata( - table.schema, root_path / '_metadata', - metadata_collector=metadata_collector - ) + >>> # Write the ``_metadata`` parquet file with row groups statistics of all files + >>> pq.write_metadata( + ... table.schema, root_path + '/_metadata', + ... metadata_collector=metadata_collector + ... ) When not using the :func:`~pyarrow.parquet.write_to_dataset` function, but writing the individual files of the partitioned dataset using @@ -463,26 +560,38 @@ the same: .. 
code-block:: python - metadata_collector = [] - pq.write_table( - table1, root_path / "year=2017/data1.parquet", - metadata_collector=metadata_collector - ) - - # set the file path relative to the root of the partitioned dataset - metadata_collector[-1].set_file_path("year=2017/data1.parquet") - - # combine and write the metadata - metadata = metadata_collector[0] - for _meta in metadata_collector[1:]: - metadata.append_row_groups(_meta) - metadata.write_metadata_file(root_path / "_metadata") - - # or use pq.write_metadata to combine and write in a single step - pq.write_metadata( - table1.schema, root_path / "_metadata", - metadata_collector=metadata_collector - ) + >>> import os + >>> os.mkdir("year=2017") + + >>> metadata_collector = [] + >>> pq.write_table( + ... table, "year=2017/data1.parquet", + ... metadata_collector=metadata_collector + ... ) + + >>> # set the file path relative to the root of the partitioned dataset + >>> metadata_collector[-1].set_file_path("year=2017/data1.parquet") + + >>> # combine and write the metadata + >>> metadata = metadata_collector[0] + >>> for _meta in metadata_collector[1:]: + ... metadata.append_row_groups(_meta) + >>> metadata.write_metadata_file("_metadata") + + >>> # or use pq.write_metadata to combine and write in a single step + >>> pq.write_metadata( + ... table.schema, "_metadata", + ... metadata_collector=metadata_collector + ... ) + + >>> pq.read_metadata("_metadata") + + created_by: parquet-cpp-arrow version ... + num_columns: 3 + num_rows: 3 + num_row_groups: 1 + format_version: 2.6 + serialized_size: ... Reading from Partitioned Datasets ------------------------------------------------ @@ -493,8 +602,29 @@ such as those produced by Hive: .. 
code-block:: python - dataset = pq.ParquetDataset('dataset_name/') - table = dataset.read() + >>> dataset = pq.ParquetDataset('dataset_name/') + >>> table = dataset.read() + >>> table + pyarrow.Table + three: bool + one: dictionary + two: dictionary + ---- + three: [[true],[true],[false]] + one: [ -- dictionary: + ["-1","2.5"] -- indices: + [0], -- dictionary: + ["-1","2.5"] -- indices: + [1], -- dictionary: + [null] -- indices: + [0]] + two: [ -- dictionary: + ["foo","baz","bar"] -- indices: + [0], -- dictionary: + ["foo","baz","bar"] -- indices: + [1], -- dictionary: + ["foo","baz","bar"] -- indices: + [2]] You can also use the convenience function ``read_table`` exposed by ``pyarrow.parquet`` that avoids the need for an additional Dataset object @@ -502,7 +632,7 @@ creation step. .. code-block:: python - table = pq.read_table('dataset_name') + >>> table = pq.read_table('dataset_name') Note: the partition columns in the original table will have their types converted to Arrow dictionary types (pandas categorical) on load. Ordering of @@ -557,10 +687,10 @@ filesystems, through the ``filesystem`` keyword: .. code-block:: python - from pyarrow import fs + >>> from pyarrow import fs - s3 = fs.S3FileSystem(region="us-east-2") - table = pq.read_table("bucket/object/key/prefix", filesystem=s3) + >>> s3 = fs.S3FileSystem(region="us-east-2") # doctest: +SKIP + >>> table = pq.read_table("bucket/object/key/prefix", filesystem=s3) # doctest: +SKIP Currently, :class:`HDFS ` and :class:`Amazon S3-compatible storage ` are @@ -570,7 +700,7 @@ if specified as a URI: .. code-block:: python - table = pq.read_table("s3://bucket/object/key/prefix") + >>> table = pq.read_table("s3://bucket/object/key/prefix") # doctest: +SKIP Other filesystems can still be supported if there is an `fsspec `__-compatible @@ -580,10 +710,9 @@ One example is Azure Blob storage, which can be interfaced through the .. 
code-block:: python - from adlfs import AzureBlobFileSystem - - abfs = AzureBlobFileSystem(account_name="XXXX", account_key="XXXX", container_name="XXXX") - table = pq.read_table("file.parquet", filesystem=abfs) + >>> from adlfs import AzureBlobFileSystem # doctest: +SKIP + >>> abfs = AzureBlobFileSystem(account_name="XXXX", account_key="XXXX", container_name="XXXX") # doctest: +SKIP + >>> table = pq.read_table("file.parquet", filesystem=abfs) # doctest: +SKIP Parquet Modular Encryption (Columnar Encryption) ------------------------------------------------ @@ -605,20 +734,20 @@ Writing an encrypted Parquet file: .. code-block:: python - encryption_properties = crypto_factory.file_encryption_properties( - kms_connection_config, encryption_config) - with pq.ParquetWriter(filename, schema, - encryption_properties=encryption_properties) as writer: - writer.write_table(table) + >>> encryption_properties = crypto_factory.file_encryption_properties( # doctest: +SKIP + ... kms_connection_config, encryption_config) + >>> with pq.ParquetWriter(filename, schema, # doctest: +SKIP + ... encryption_properties=encryption_properties) as writer: + ... writer.write_table(table) Reading an encrypted Parquet file: .. code-block:: python - decryption_properties = crypto_factory.file_decryption_properties( - kms_connection_config) - parquet_file = pq.ParquetFile(filename, - decryption_properties=decryption_properties) + >>> decryption_properties = crypto_factory.file_decryption_properties( # doctest: +SKIP + ... kms_connection_config) + >>> parquet_file = pq.ParquetFile(filename, # doctest: +SKIP + ... decryption_properties=decryption_properties) In order to create the encryption and decryption properties, a @@ -637,25 +766,24 @@ defined by :class:`pyarrow.parquet.encryption.KmsClient` as following: .. 
code-block:: python - import pyarrow.parquet.encryption as pe - - class MyKmsClient(pe.KmsClient): - - """An example KmsClient implementation skeleton""" - def __init__(self, kms_connection_configuration): - pe.KmsClient.__init__(self) - # Any KMS-specific initialization based on - # kms_connection_configuration comes here - - def wrap_key(self, key_bytes, master_key_identifier): - wrapped_key = ... # call KMS to wrap key_bytes with key specified by - # master_key_identifier - return wrapped_key - - def unwrap_key(self, wrapped_key, master_key_identifier): - key_bytes = ... # call KMS to unwrap wrapped_key with key specified by - # master_key_identifier - return key_bytes + >>> import pyarrow.parquet.encryption as pe + >>> class MyKmsClient(pe.KmsClient): + ... + ... """An example KmsClient implementation skeleton""" + ... def __init__(self, kms_connection_configuration): + ... pe.KmsClient.__init__(self) + ... # Any KMS-specific initialization based on + ... # kms_connection_configuration comes here + ... + ... def wrap_key(self, key_bytes, master_key_identifier): + ... wrapped_key = ... # call KMS to wrap key_bytes with key specified by + ... # master_key_identifier + ... return wrapped_key + ... + ... def unwrap_key(self, wrapped_key, master_key_identifier): + ... key_bytes = ... # call KMS to unwrap wrapped_key with key specified by + ... # master_key_identifier + ... return key_bytes The concrete implementation will be loaded at runtime by a factory function provided by the user. This factory function will be used to initialize the @@ -666,10 +794,10 @@ For example, in order to use the ``MyKmsClient`` defined above: .. code-block:: python - def kms_client_factory(kms_connection_configuration): - return MyKmsClient(kms_connection_configuration) + >>> def kms_client_factory(kms_connection_configuration): + ... 
return MyKmsClient(kms_connection_configuration) - crypto_factory = CryptoFactory(kms_client_factory) + >>> crypto_factory = pe.CryptoFactory(kms_client_factory) An :download:`example <../../../python/examples/parquet_encryption/sample_vault_kms_client.py>` of such a class for an open source @@ -732,12 +860,12 @@ An example encryption configuration: .. code-block:: python - encryption_config = pq.EncryptionConfiguration( - footer_key="footer_key_name", - column_keys={ - "column_key_name": ["Column1", "Column2"], - }, - ) + >>> encryption_config = pe.EncryptionConfiguration( + ... footer_key="footer_key_name", + ... column_keys={ + ... "column_key_name": ["Column1", "Column2"], + ... }, + ... ) .. note:: @@ -757,7 +885,7 @@ all columns are encrypted with the same key identified by ``column_key_id``: .. code-block:: python - import pyarrow.parquet.encryption as pe + >>> import pyarrow.parquet.encryption as pe schema = pa.schema([ ("MapColumn", pa.map_(pa.string(), pa.int32())), @@ -776,19 +904,19 @@ some inner fields are encrypted with the same key identified by ``column_key_id` .. code-block:: python - import pyarrow.parquet.encryption as pe + >>> import pyarrow.parquet.encryption as pe - schema = pa.schema([ - ("MapColumn", pa.map_(pa.string(), pa.int32())), - ("StructColumn", pa.struct([("f1", pa.int32()), ("f2", pa.string())])), - ]) + >>> schema = pa.schema([ + ... ("MapColumn", pa.map_(pa.string(), pa.int32())), + ... ("StructColumn", pa.struct([("f1", pa.int32()), ("f2", pa.string())])), + ... ]) - encryption_config = pe.EncryptionConfiguration( - footer_key="footer_key_name", - column_keys={ - "column_key_id": [ "MapColumn.key_value.value", "StructColumn.f1" ], - }, - ) + >>> encryption_config = pe.EncryptionConfiguration( + ... footer_key="footer_key_name", + ... column_keys={ + ... "column_key_id": [ "MapColumn.key_value.value", "StructColumn.f1" ], + ... }, + ... ) Decryption configuration ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -842,20 +970,17 @@ compression used. .. 
code-block:: python - import pyarrow as pa - import pyarrow.parquet as p + >>> table = pa.Table.from_pandas(df) - table = pa.Table.from_pandas(df) + >>> # Enable content-defined chunking with default settings + >>> pq.write_table(table, 'example.parquet', use_content_defined_chunking=True) - # Enable content-defined chunking with default settings - pq.write_table(table, 'example.parquet', use_content_defined_chunking=True) - - # Enable content-defined chunking with custom settings - pq.write_table( - table, - 'example_custom.parquet', - use_content_defined_chunking={ - 'min_chunk_size': 128 * 1024, # 128 KiB - 'max_chunk_size': 512 * 1024, # 512 KiB - } - ) + >>> # Enable content-defined chunking with custom settings + >>> pq.write_table( + ... table, + ... 'example_custom.parquet', + ... use_content_defined_chunking={ + ... 'min_chunk_size': 128 * 1024, # 128 KiB + ... 'max_chunk_size': 512 * 1024, # 512 KiB + ... } + ... ) diff --git a/docs/source/python/timestamps.rst b/docs/source/python/timestamps.rst index 80a1b7280cbf..6b144bd79daa 100644 --- a/docs/source/python/timestamps.rst +++ b/docs/source/python/timestamps.rst @@ -60,18 +60,23 @@ Spark to Pandas (through Apache Arrow) The following cases assume the Spark configuration ``spark.sql.execution.arrow.enabled`` is set to ``"true"``. -:: +.. code-block:: python + >>> import pandas as pd + >>> from datetime import datetime, timedelta, timezone >>> pdf = pd.DataFrame({'naive': [datetime(2019, 1, 1, 0)], - ... 'aware': [Timestamp(year=2019, month=1, day=1, - ... nanosecond=500, tz=timezone(timedelta(hours=-8)))]}) + ... 'aware': [pd.Timestamp(year=2019, month=1, day=1, + ... nanosecond=500, + ... 
tz=timezone(timedelta(hours=-8)))]}) >>> pdf naive aware - 0 2019-01-01 2019-01-01 00:00:00.000000500-08:00 + 0 2019-01-01 2019-01-01 00:00:00.000000500-08:00 - >>> spark.conf.set("spark.sql.session.timeZone", "UTC") - >>> utc_df = sqlContext.createDataFrame(pdf) - >>> utf_df.show() + >>> from pyspark.sql import SparkSession # doctest: +SKIP + >>> spark = SparkSession.builder.appName("MyApp").getOrCreate() # doctest: +SKIP + >>> spark.conf.set("spark.sql.session.timeZone", "UTC") # doctest: +SKIP + >>> utc_df = spark.createDataFrame(pdf) # doctest: +SKIP + >>> utc_df.show() # doctest: +SKIP +-------------------+-------------------+ | naive| aware| +-------------------+-------------------+ @@ -89,11 +94,11 @@ Now if the session time zone is set to US Pacific Time (PST) we don't see any shift in the display of the aware time zone (it still represents the same instant in time): -:: +.. code-block:: python - >>> spark.conf.set("spark.sql.session.timeZone", "US/Pacific") - >>> pst_df = sqlContext.createDataFrame(pdf) - >>> pst_df.show() + >>> spark.conf.set("spark.sql.session.timeZone", "US/Pacific") # doctest: +SKIP + >>> pst_df = spark.createDataFrame(pdf) # doctest: +SKIP + >>> pst_df.show() # doctest: +SKIP +-------------------+-------------------+ | naive| aware| +-------------------+-------------------+ @@ -105,9 +110,9 @@ zone. The naive timestamp was initially converted assuming UTC, the instant it reflects is actually earlier than the naive time zone from the PST converted data frame: -:: +.. code-block:: python - >>> utc_df.show() + >>> utc_df.show() # doctest: +SKIP +-------------------+-------------------+ | naive| aware| +-------------------+-------------------+ @@ -120,27 +125,28 @@ Spark to Pandas We can observe what happens when converting back to Arrow/Pandas. Assuming the session time zone is still PST: -:: +.. 
code-block:: python - >>> pst_df.show() + >>> pst_df.show() # doctest: +SKIP +-------------------+-------------------+ | naive| aware| +-------------------+-------------------+ |2019-01-01 00:00:00|2019-01-01 00:00:00| +-------------------+-------------------+ - - >>> pst_df.toPandas() - naive aware + >>> pst_df.toPandas() # doctest: +SKIP + naive aware 0 2019-01-01 2019-01-01 - >>> pst_df.toPandas().info() + >>> pst_df.toPandas().info() # doctest: +SKIP RangeIndex: 1 entries, 0 to 0 Data columns (total 2 columns): - naive 1 non-null datetime64[ns] - aware 1 non-null datetime64[ns] + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 naive 1 non-null datetime64[ns] + 1 aware 1 non-null datetime64[ns] dtypes: datetime64[ns](2) - memory usage: 96.0 bytes + memory usage: ... bytes Notice that, in addition to being a "time zone naive" timestamp, the 'aware' value will now differ when converting to an epoch offset. Spark does the conversion @@ -149,13 +155,13 @@ session time zones isn't set) and then localizes to remove the time zone information. This results in the timestamp being 8 hours before the original time: -:: +.. code-block:: python - >>> pst_df.toPandas()['aware'][0] + >>> pst_df.toPandas()['aware'][0] # doctest: +SKIP Timestamp('2019-01-01 00:00:00') - >>> pdf['aware'][0] + >>> pdf['aware'][0] # doctest: +SKIP Timestamp('2019-01-01 00:00:00.000000500-0800', tz='UTC-08:00') - >>> (pst_df.toPandas()['aware'][0].timestamp()-pdf['aware'][0].timestamp())/3600 + >>> (pst_df.toPandas()['aware'][0].timestamp()-pdf['aware'][0].timestamp())/3600 # doctest: +SKIP -8.0 The same type of conversion happens with the data frame converted while @@ -163,36 +169,36 @@ the session time zone was UTC. In this case both naive and aware represent different instants in time (the naive instant is due to the change in session time zone between creating data frames): -:: +.. 
code-block:: python - >>> utc_df.show() + >>> utc_df.show() # doctest: +SKIP +-------------------+-------------------+ | naive| aware| +-------------------+-------------------+ |2018-12-31 16:00:00|2019-01-01 00:00:00| +-------------------+-------------------+ - >>> utc_df.toPandas() - naive aware + >>> utc_df.toPandas() # doctest: +SKIP + naive aware 0 2018-12-31 16:00:00 2019-01-01 Note that the surprising shift for aware doesn't happen when the session time zone is UTC (but the timestamps still become "time zone naive"): -:: +.. code-block:: python - >>> spark.conf.set("spark.sql.session.timeZone", "UTC") - >>> pst_df.show() + >>> spark.conf.set("spark.sql.session.timeZone", "UTC") # doctest: +SKIP + >>> pst_df.show() # doctest: +SKIP +-------------------+-------------------+ | naive| aware| +-------------------+-------------------+ |2019-01-01 08:00:00|2019-01-01 08:00:00| +-------------------+-------------------+ - >>> pst_df.toPandas()['aware'][0] + >>> pst_df.toPandas()['aware'][0] # doctest: +SKIP Timestamp('2019-01-01 08:00:00') - >>> pdf['aware'][0] + >>> pdf['aware'][0] # doctest: +SKIP Timestamp('2019-01-01 00:00:00.000000500-0800', tz='UTC-08:00') - >>> (pst_df.toPandas()['aware'][0].timestamp()-pdf['aware'][0].timestamp())/3600 + >>> (pst_df.toPandas()['aware'][0].timestamp()-pdf['aware'][0].timestamp())/3600 # doctest: +SKIP 0.0 diff --git a/orc-predicate-pushdown.allium b/orc-predicate-pushdown.allium new file mode 100644 index 000000000000..cae54800ffab --- /dev/null +++ b/orc-predicate-pushdown.allium @@ -0,0 +1,1943 @@ +-- allium: 1 +-- orc-predicate-pushdown.allium +-- +-- Apache Arrow Dataset Scanning with ORC Predicate Pushdown +-- +-- Scope: Predicate pushdown flow from Scanner through Fragment to ORC stripes +-- Includes: OrcFile, Stripe, StripeStatistics, Predicate, FilterStripes, Dataset integration +-- Excludes: +-- - ORC file writing (separate feature) +-- - Row-level filtering (handled by scanner after stripe selection) +-- - ORC 
encoding/decoding internals +-- - Schema evolution details (covered separately) + + +-- ============================================================================= +-- ENTITY RELATIONSHIPS +-- ============================================================================= +-- +-- Dataset (contains)──────────────────────────────→ Fragment* (1:N) +-- │ │ +-- │ FileSystemDataset │ FileFragment +-- │ └── partitioning: Partitioning │ └── source: FileSource +-- │ └── fragments: List │ └── format: FileFormat +-- │ │ +-- │ │ OrcFileFragment +-- │ │ └── stripes: List? +-- │ │ └── metadata: OrcFileMetadata? +-- │ │ └── manifest: OrcSchemaManifest? +-- │ │ └── statistics_cache: StripeStatisticsCache? +-- │ │ +-- └── schema: Schema └── physical_schema: Schema +-- └── partition_expression: Expression └── partition_expression: Expression +-- +-- +-- Scanner (uses)──────────────────────────────────→ Dataset (1:1) +-- │ +-- └── options: ScanOptions +-- └── filter: Expression +-- └── projection: Expression +-- └── dataset_schema: Schema +-- └── projected_schema: Schema +-- +-- +-- OrcFileMetadata (contains)────────────────────────→ StripeMetadata* (1:N) +-- │ +-- └── ColumnStatistics* +-- └── min, max, has_null +-- +-- +-- OrcSchemaManifest (contains)──────────────────────→ OrcSchemaField* (1:N) +-- │ │ +-- └── num_columns: Int └── field: Field +-- └── column_index: Int? +-- └── is_leaf: Boolean +-- └── children: List + + +-- ============================================================================= +-- Domain Overview +-- ============================================================================= +-- +-- ORC predicate pushdown enables skipping entire stripes (horizontal file +-- partitions) when their statistics prove no rows can match the filter. +-- +-- Key insight: ORC files store per-stripe column statistics (min/max/nulls). +-- By comparing the filter predicate against these statistics, we can determine +-- if a stripe could possibly contain matching rows. 
If not, skip it entirely. +-- +-- This matches Parquet behavior: conservative when statistics are missing. +-- +-- Terminology mapping (ORC vs Parquet): +-- ORC Stripe ≡ Parquet Row Group +-- ORC Footer ≡ Parquet FileMetaData +-- ORC Statistics ≡ Parquet ColumnStatistics +-- ============================================================================= + + +-- ============================================================================= +-- CORE DOMAIN TYPES +-- ============================================================================= + +type FieldPath = List +-- Nested path into schema, e.g. [1, 0] means field 1's child 0 + +type FieldRef = String | FieldPath | List +-- Reference to a field by name, path, or nested name sequence + +function is_nested(ref: FieldRef) -> Boolean { + -- Returns true if the field reference points to a nested field within a struct/list/map. + -- Nested fields have complex column mapping in ORC and may not support statistics pushdown. + -- + -- Examples: + -- "x" -> false (top-level field) + -- [0] -> false (top-level field by index) + -- ["a", "b"] -> true (field "b" inside struct "a") + -- [0, 1] -> true (child 1 of field 0) + + match ref: + case String: return false -- Simple name is never nested + case FieldPath: + return ref.length > 1 -- Path with multiple segments is nested + case List: + return ref.length > 1 -- Multiple names indicates nested path +} + +type Timestamp = Instant + +-- Column types that support statistics-based predicate pushdown. +-- Other types are handled conservatively (all stripes included). +-- +-- NOT YET IMPLEMENTED: float32, float64, string, binary +-- These types will be added in future iterations. +type SupportedStatisticsType = int32 | int64 + +-- Comparison predicates that can eliminate stripes using min/max statistics. +type ComparisonOp = equal | not_equal | less | less_equal | greater | greater_equal + +-- Logical combinators for compound predicates. 
+type LogicalOp = and | or | not + +-- Null-testing predicates. +type NullOp = is_null | is_valid + + +-- ============================================================================= +-- FLOATING-POINT EDGE CASES (NOT YET IMPLEMENTED) +-- ============================================================================= +-- +-- NOTE: Float support is not yet implemented. This section documents the +-- intended behavior for future implementation. +-- +-- IEEE 754 floating-point values include special values that require careful +-- handling in statistics-based predicate pushdown: +-- +-- NaN (Not a Number): +-- - NaN compares as UNORDERED with all values, including itself +-- - NaN != NaN is TRUE, NaN == NaN is FALSE +-- - NaN > x, NaN < x, NaN >= x, NaN <= x are all FALSE +-- - ORC statistics: NaN values are EXCLUDED from min/max computation +-- - If a stripe contains only NaN values, min/max will be absent +-- - Predicate pushdown: If min/max are absent, include the stripe (conservative) +-- +-- Positive/Negative Infinity: +-- - +Inf is greater than all finite values +-- - -Inf is less than all finite values +-- - ORC statistics: Infinities ARE included in min/max +-- - min = -Inf means the stripe may contain -Inf +-- - max = +Inf means the stripe may contain +Inf +-- +-- Negative Zero (-0.0): +-- - IEEE 754: -0.0 == +0.0 is TRUE +-- - ORC treats -0.0 and +0.0 as equal for statistics purposes +-- - No special handling needed for predicate pushdown +-- +-- Examples: +-- Predicate: x > 10.0 +-- Statistics: min=NaN, max=NaN (no valid min/max) +-- Result: INCLUDE stripe (may contain non-NaN values we don't know about) +-- +-- Predicate: x > 10.0 +-- Statistics: min=5.0, max=+Inf +-- Result: INCLUDE stripe (max=+Inf means values above 10 may exist) +-- +-- Predicate: x < -1000.0 +-- Statistics: min=-Inf, max=20.0 +-- Result: INCLUDE stripe (min=-Inf means values below -1000 may exist) +-- +-- Predicate: x == NaN +-- Statistics: min=5.0, max=20.0 +-- Result: INCLUDE stripe 
(NaN not in statistics, but may exist in data) +-- Note: Use is_nan(x) function for NaN checks, not equality +-- +-- Implementation note: +-- When comparing predicate values against statistics bounds: +-- - If either value is NaN, comparison returns FALSE (unordered) +-- - This means predicates involving NaN literals cannot skip stripes +-- - This is correct conservative behavior + +-- Float helper functions (NOT YET IMPLEMENTED) +-- These will be needed when float support is added. +-- function is_nan(x: float32 | float64) -> Boolean +-- function is_finite(x: float32 | float64) -> Boolean +-- function is_infinite(x: float32 | float64) -> Boolean + + +-- ============================================================================= +-- NULL SEMANTICS IN PREDICATE EVALUATION +-- ============================================================================= +-- +-- Arrow/ORC uses SQL-style three-valued logic (TRUE, FALSE, UNKNOWN/NULL). +-- This affects how predicates interact with NULL values and statistics. 
+-- +-- Comparison with NULL: +-- x > 10 where x is NULL -> UNKNOWN (not TRUE, not FALSE) +-- x = NULL -> UNKNOWN (use is_null() instead) +-- NULL > NULL -> UNKNOWN +-- +-- Logical operators with UNKNOWN: +-- TRUE AND UNKNOWN -> UNKNOWN +-- FALSE AND UNKNOWN -> FALSE +-- TRUE OR UNKNOWN -> TRUE +-- FALSE OR UNKNOWN -> UNKNOWN +-- NOT UNKNOWN -> UNKNOWN +-- +-- For predicate pushdown, UNKNOWN is treated conservatively: +-- - A stripe is INCLUDED if the predicate could be TRUE for any row +-- - UNKNOWN means "might be TRUE" so the stripe is included +-- - Only definite FALSE excludes a stripe +-- +-- Statistics interaction with NULLs: +-- - min/max statistics exclude NULL values +-- - A guarantee like "x >= 10 AND x <= 20" only constrains non-NULL values +-- - To fully describe a column: (x >= min AND x <= max) OR is_null(x) +-- - The has_null statistic tells us if NULLs exist in the stripe +-- +-- Example: +-- Predicate: x > 5 +-- Statistics: min=10, max=20, has_null=true +-- Guarantee: (x >= 10 AND x <= 20) OR is_null(x) +-- +-- For non-NULL values: x > 5 is TRUE (since min=10 > 5) +-- For NULL values: x > 5 is UNKNOWN +-- Result: Stripe INCLUDED (some rows definitely match, NULLs are UNKNOWN) +-- +-- Predicate: x < 5 +-- Statistics: min=10, max=20, has_null=true +-- +-- For non-NULL values: x < 5 is FALSE (since min=10 >= 5) +-- For NULL values: x < 5 is UNKNOWN +-- Result: Stripe INCLUDED (NULL rows have UNKNOWN result, might match) +-- +-- Predicate: x < 5 AND x IS NOT NULL +-- Statistics: min=10, max=20, has_null=true +-- +-- For non-NULL values: x < 5 is FALSE +-- For NULL values: filtered out by IS NOT NULL +-- Result: Stripe EXCLUDED (no rows can match) + +enum TernaryLogic { + true_, + false_, + unknown +} + +function evaluate_with_nulls(predicate: Expression, guarantee: Expression) -> TernaryLogic { + -- Evaluate a predicate against a guarantee using three-valued logic. 
+ -- Returns: + -- true_ : predicate is definitely TRUE for all rows matching the guarantee + -- false_ : predicate is definitely FALSE for all rows matching the guarantee + -- unknown : predicate may be TRUE, FALSE, or UNKNOWN for different rows + -- + -- This is more precise than is_satisfiable which only distinguishes + -- "definitely false" from "possibly true or unknown". +} + + +-- ============================================================================= +-- EXPRESSION SYSTEM +-- ============================================================================= + +entity Expression { + -- A compute expression that can be evaluated against record batches + -- Used for filters, projections, and statistics guarantees + + kind: literal | field_ref | call + + -- For literals + value: Scalar? + + -- For field references + field_ref: FieldRef? + + -- For function calls (e.g., and, or, equal, greater) + function: String? + arguments: List? + + -- Computed properties + is_satisfiable: Boolean + -- true unless the expression is provably always false (e.g., literal(false)) + + is_bound: Boolean + -- true if all field references are resolved to schema paths with known types + + fields_referenced: Set + -- All fields that appear in this expression +} + +-- Standard expression constructors +function literal(value: Scalar) -> Expression +function field_ref(ref: FieldRef) -> Expression +function and_(left: Expression, right: Expression) -> Expression +function or_(left: Expression, right: Expression) -> Expression +function not_(expr: Expression) -> Expression +function equal(left: Expression, right: Expression) -> Expression +function not_equal(left: Expression, right: Expression) -> Expression +function greater(left: Expression, right: Expression) -> Expression +function greater_equal(left: Expression, right: Expression) -> Expression +function less(left: Expression, right: Expression) -> Expression +function less_equal(left: Expression, right: Expression) -> Expression 
+function is_null(expr: Expression) -> Expression +function is_valid(expr: Expression) -> Expression +function in_(field: Expression, values: List) -> Expression + +-- The canonical "no filter" expression +constant TRUE_EXPRESSION: Expression = literal(true) + +-- Expression binding +function bind(expr: Expression, schema: Schema) -> Expression { + -- Bind an expression to a schema, resolving field references to types + -- An unbound expression has field_ref by name + -- A bound expression has field_ref resolved to path with known type +} + + +-- ============================================================================= +-- SCHEMA +-- ============================================================================= + +entity Schema { + fields: List +} + +entity Scalar { + -- A typed scalar value (single element) + type: DataType + -- value is type-dependent +} + +entity Field { + name: String + type: DataType + nullable: Boolean + metadata: Map? +} + +entity DataType { + -- Abstract representation of Arrow data types + id: TypeId + children: List? -- For nested types like struct, list +} + +enum TypeId { + int32, int64, float32, float64, + string, binary, boolean, + struct_, list, map, + timestamp, date, time + -- ... other Arrow types +} + + +-- ============================================================================= +-- External Entities (owned by ORC file format, not this system) +-- ============================================================================= + +-- An ORC file containing tabular data organized into stripes +external entity OrcFile { + stripes: List + schema: Schema + footer: OrcFileFooter +} + +-- ORC file footer containing file-level metadata +external entity OrcFileFooter { + num_rows: Int + num_stripes: Int + writer_version: String? 
+ -- Writer version affects statistics reliability (see statistics validation) +} + +-- A horizontal partition of rows within an ORC file +external entity Stripe { + index: Integer -- 0-based position in file + num_rows: Integer -- number of rows in stripe + first_row_id: Integer -- offset of first row + column_statistics: Map +} + +-- Statistics for a single column within a single stripe +external entity ColumnStatistics { + has_null: Boolean -- may contain null values + num_values: Integer -- count of non-null values + + -- Min/max availability + has_minimum: Boolean -- minimum statistic available + has_maximum: Boolean -- maximum statistic available + + -- Typed min/max values (type depends on column type) + minimum: Scalar? -- minimum value + maximum: Scalar? -- maximum value + + -- Distinct count (NOT YET IMPLEMENTED) + -- Could enable more aggressive IN predicate optimization by detecting + -- when all distinct values are known. For example, if distinct_count equals + -- the number of values in the IN list and all match, the stripe fully satisfies. + -- + -- Implementation note: liborc doesn't expose a stable distinct-count API + -- in all versions we support, so this is left unimplemented. + distinct_count: Integer? -- number of distinct non-null values + + -- For string/binary columns: truncation metadata (NOT YET IMPLEMENTED) + -- ORC may truncate long strings in statistics for space efficiency + -- These fields will be used when string/binary support is added. 
+ is_minimum_truncated: Boolean -- true if minimum was truncated + is_maximum_truncated: Boolean -- true if maximum was truncated (and incremented) + + -- Statistics reliability flag + -- Older ORC writers had bugs in statistics computation + is_statistics_deprecated: Boolean -- true if statistics should not be trusted +} + + +-- ============================================================================= +-- ORC SCHEMA MANIFEST - Mapping Between Arrow and ORC Schemas +-- ============================================================================= +-- +-- The SchemaManifest maps Arrow schema field paths to ORC physical column indices. +-- This is required for nested type support (struct, list, map) where a single Arrow +-- field may span multiple ORC leaf columns. +-- +-- This mirrors Parquet's SchemaManifest design (file_parquet.cc lines 727-753) +-- and enables consistent handling of nested types across formats. +-- +-- ============================================================================= +-- ORC COLUMN INDEXING SCHEME +-- ============================================================================= +-- +-- ORC uses a depth-first pre-order traversal to assign column IDs in the type tree: +-- +-- Column 0: Root struct (always present, represents the entire row) +-- Column 1+: User columns in depth-first order +-- +-- Example for schema: struct<a:int, b:struct<c:string, d:float>, e:int> +-- +-- Column 0: root struct +-- Column 1: a (int) <- leaf, has statistics +-- Column 2: b (struct) <- non-leaf, NO statistics +-- Column 3: c (string) <- leaf, has statistics +-- Column 4: d (float) <- leaf, has statistics +-- Column 5: e (int) <- leaf, has statistics +-- +-- KEY DIFFERENCES FROM PARQUET: +-- - ORC: Column 0 is ALWAYS the root struct; user columns start at 1 +-- - Parquet: Columns indexed by schema order; no dedicated root column +-- - ORC: Non-leaf columns (struct, list, map) have column IDs but NO statistics +-- - Parquet: Similar - only leaf columns have statistics +-- +-- STATISTICS
AVAILABILITY: +-- - Only LEAF columns have min/max statistics in ORC stripe footers +-- - Container columns (struct, list, map) have column IDs but their statistics +-- are aggregates that don't support predicate pushdown +-- - The column_index in OrcSchemaField is the ORC type tree index, used to +-- look up statistics in stripe.column_statistics[column_index] +-- +-- SOURCE OF COLUMN INDICES: +-- Column indices are read from the ORC file's type tree in the file footer, +-- NOT computed by an arbitrary incrementing counter. The ORC writer assigns +-- indices during file creation; the reader extracts them from metadata. + +entity OrcSchemaManifest { + -- Mapping between Arrow schema and ORC schema + -- Required for nested type column index resolution + + schema_fields: List + -- Top-level fields with recursive structure for nested types + + num_columns: Int + -- Total number of ORC leaf columns +} + +entity OrcSchemaField { + -- Represents a field in the Arrow schema with its ORC column mapping + + field: Field + -- The Arrow field definition + + column_index: Int? + -- ORC column index (only for leaf fields) + -- null for non-leaf fields (struct, list, map containers) + + is_leaf: Boolean + -- true if this field maps directly to an ORC column + -- false for container types (struct, list, map) + + children: List + -- Nested fields for struct/list/map types + -- Empty for leaf fields +} + +function BuildOrcSchemaManifest(metadata: OrcFileMetadata) -> OrcSchemaManifest { + -- Builds mapping from Arrow schema to ORC physical column indices. + -- The column indices are extracted from the ORC file's type tree in the footer, + -- which assigns indices via depth-first pre-order traversal (column 0 = root struct). + -- + -- Algorithm: + -- 1. Read the ORC type tree from file metadata (already indexed by ORC writer) + -- 2. Walk the Arrow schema and ORC type tree in parallel + -- 3. For each Arrow field, find the corresponding ORC type node + -- 4. 
Extract the column_index from the ORC type node: + -- - Leaf fields: column_index is used for statistics lookup + -- - Container fields: column_index is null (no usable statistics) + -- 5. Recurse into children for nested types (struct, list, map) + -- + -- IMPORTANT: Column indices are NOT assigned by this function. They are + -- read from the ORC file metadata where they were assigned by the ORC writer + -- during file creation. This ensures consistency with stripe statistics + -- which are also indexed by the same ORC column IDs. + -- + -- Example: For Arrow field "b.c" (nested), this function traverses: + -- Arrow schema -> field "b" -> child "c" + -- ORC type tree -> column 2 (struct "b") -> column 3 (leaf "c") + -- Returns column_index = 3 for statistics lookup +} + +function GetOrcColumnIndex(manifest: OrcSchemaManifest, field_ref: FieldRef) -> Int? { + -- Resolve a field reference to its ORC column index using the manifest + -- + -- For top-level fields: Returns the column_index directly + -- For nested fields: Traverses the manifest tree to find the leaf column + -- + -- Map-path compatibility: + -- Canonical Arrow map leaf paths include the synthetic entries struct: + -- map/0/0 -> key, map/0/1 -> value + -- For compatibility with existing call sites, implementations may also + -- accept shorthand map/{0,1} and resolve them to the same key/value + -- leaves. + -- In shorthand mode, map/0 is treated as key (leaf), not as a distinct + -- entries container node. + -- + -- Returns null if: + -- - Field not found in manifest + -- - Field is a container type (not a leaf) +} + + +-- ============================================================================= +-- ORC FILE FRAGMENT - ORC-specific Fragment +-- ============================================================================= + +entity OrcFileFragment extends FileFragment { + -- A FileFragment with ORC-specific predicate pushdown capabilities + -- + -- KEY INSIGHT: This entity persists across scans. 
The statistics_cache + -- is populated incrementally and reused, making predicate pushdown + -- more efficient for repeated queries on the same file. + + stripes: List? + -- Indices of stripes selected by this fragment + -- null means all stripes are selected + + metadata: OrcFileMetadata? + -- Cached file metadata (lazy-loaded, immutable once set) + + manifest: OrcSchemaManifest? + -- Cached schema manifest for Arrow-to-ORC column mapping + -- Built from metadata when first needed (lazy-loaded) + + statistics_cache: StripeStatisticsCache? + -- Cached stripe statistics, lazily populated + -- Each stripe has a combined guarantee expression from all processed fields + + cache_status: uncached | cached | invalidated + -- Tracks the state of the statistics cache + + partition_expression: Expression + -- An expression that evaluates to true for ALL data in this fragment + -- E.g., for a partition "year=2024", this would be: year == 2024 + -- Inherited from directory structure or Hive-style partitioning + + physical_schema: Schema? + -- The actual schema of data in this fragment (lazy-loaded) + + -- Derived properties + effective_stripes: List + -- stripes if set, otherwise [0..metadata.num_stripes) +} + +entity OrcFileMetadata { + -- ORC file metadata containing stripe information + + num_rows: Int + num_stripes: Int + schema: Schema + writer_version: String? 
+ -- Writer version affects statistics reliability +} + +entity StripeMetadata { + -- Metadata for a single ORC stripe + + num_rows: Int + data_length: Int + index_length: Int + footer_length: Int + columns: List +} + + +-- ============================================================================= +-- Core Entities +-- ============================================================================= + +-- A field referenced in a predicate, resolved to a physical schema column +entity PredicateField { + field_ref: FieldRef + arrow_field_index: Integer -- 0-based index in Arrow schema + orc_column_index: Integer -- Index in the ORC type tree (via manifest) + data_type: DataType + supports_statistics: Boolean -- true if this field's type supports min/max statistics +} + +-- NOTE ON COLUMN INDICES: +-- arrow_field_index: Position in the Arrow schema's top-level field list +-- orc_column_index: Position in the ORC type tree (column 0 is always the +-- root struct; only leaf columns carry usable statistics) +-- +-- For flat schemas these differ only by the root-struct offset, but for nested +-- types the mapping is non-trivial: +-- Arrow schema: [struct<a:int, b:string>, int] -> arrow indices: [0, 1] +-- ORC columns: [root, struct, int, string, int] -> orc indices: [0, 1, 2, 3, 4] +-- (root is index 0, struct=1, a=2, b=3, outer int=4) +-- +-- The orc_column_index is used for: +-- 1. Looking up column statistics in stripes +-- 2. Tracking which columns have been processed in statistics_complete[] +-- +-- This mirrors Parquet's PredicateField which has both arrow_field_index and +-- parquet_column_index (file_parquet.cc lines 708-725).
+ +-- Cached stripe-level statistics expressions for a file fragment +entity StripeStatisticsCache { + -- Per-stripe combined guarantee expressions + -- One expression per stripe, initialized to literal(true) + -- Tightened (AND'd) as more field statistics are processed + stripe_guarantees: List + + -- Tracks which fields have been processed + -- Prevents redundant statistics loading + fields_processed: Set + + -- Per-column completion status (indexed by column index) + -- true if that column's statistics have been processed for all stripes + statistics_complete: List +} + +-- Result of filtering stripes against a predicate +entity StripeFilterResult { + selected_indices: List -- 0-based stripe indices to read + skipped_count: Integer -- number of stripes eliminated +} + +-- Extracted bounds for a field from a guarantee expression. +-- Used by compound predicate rules (OR, IN, NOT) to access statistics. +entity FieldBounds { + min: Scalar? + max: Scalar? + has_nulls: Boolean + + -- Truncation flags for STRING/BINARY columns (NOT YET IMPLEMENTED) + -- These will be used when string/binary support is added. + min_truncated: Boolean + -- True if min is a truncated prefix (cannot prove lower bound definitively) + + max_truncated: Boolean + -- True if max is an incremented truncated prefix (still valid upper bound) +} + + +-- ============================================================================= +-- Rules +-- ============================================================================= + +-- Rule 1: Resolve which predicate fields can participate in pushdown +rule ResolvePredicateFields { + -- Resolves field references in a predicate to PredicateField entities + -- using the schema manifest for proper ORC column index resolution. + -- + -- With manifest support, nested fields CAN participate in pushdown + -- if they map to leaf columns with statistics support. 
+ + when: ResolveFields(predicate, schema, manifest) + + ensures: + for field_ref in predicate.fields_referenced: + let field = schema.resolve(field_ref) + + if field = null: + skip + + -- Use manifest to resolve ORC column index + let orc_col_index = GetOrcColumnIndex(manifest, field_ref) + + -- Skip fields that don't map to a single leaf column + -- (e.g., struct containers without direct statistics) + if orc_col_index = null: + skip + + -- Check if type supports statistics + let supports_stats = field.type.id in SupportedStatisticsType + + -- Emit resolved field with both Arrow and ORC indices + yield PredicateField( + field_ref: field_ref, + arrow_field_index: field.index, + orc_column_index: orc_col_index, + data_type: field.type, + supports_statistics: supports_stats + ) +} + + +-- Rule 2: Derive a guarantee expression from stripe column statistics +-- +-- The guarantee captures what we know to be true about a field within a stripe, +-- based on its statistics. This is used to simplify the predicate. 
+rule DeriveFieldGuarantee { + when: DeriveGuarantee(stripe, predicate_field) + + -- Use orc_column_index for statistics lookup (not arrow_field_index) + -- This correctly handles nested types where indices differ + let stats = stripe.column_statistics.get(predicate_field.orc_column_index) + + -- Conservative: if statistics unavailable, no guarantee (include stripe) + if stats = null: + ensures: null.returned() + + -- Statistics marked as deprecated should not be trusted + if stats.is_statistics_deprecated: + ensures: null.returned() + + -- Empty stripe check (handled separately by FilterStripesEmptyCheck) + if stripe.num_rows = 0: + ensures: null.returned() + + -- Sanity check: num_values should not exceed stripe rows + if stats.num_values > stripe.num_rows: + -- Corrupted statistics, be conservative + ensures: null.returned() + + let field_expr = field_ref(predicate_field.field_ref) + + ensures: + -- ================================================================= + -- Case 1: All values are null (num_values counts non-null values) + -- ================================================================= + -- + -- ORC Statistics Semantics: + -- num_values: count of NON-NULL values in the stripe + -- has_null: true if ANY null values exist + -- + -- All-null detection: num_values = 0 means zero non-null values exist, + -- therefore all values must be null. The stripe.num_rows > 0 guard + -- prevents false positives on empty stripes. + -- + -- This differs from Parquet's null_count = num_values check but + -- produces identical filtering behavior: + -- ORC: num_values = 0 AND num_rows > 0 -> all null + -- Parquet: null_count = num_values -> all null + -- + -- Both correctly identify when a column has no non-null values. 
+ -- ================================================================= + if stats.num_values = 0 and stripe.num_rows > 0: + -- The only thing we know is that all values are NULL + -- This allows is_null(x) to be TRUE and is_valid(x) to be FALSE + yield is_null(field_expr) + + -- Case 2: Statistics available with min/max + else if stats.has_minimum and stats.has_maximum: + let min_val = stats.minimum + let max_val = stats.maximum + + -- Handle floating-point edge cases (NOT YET IMPLEMENTED) + -- When float support is added: + -- if predicate_field.data_type.id in {float32, float64}: + -- -- If min or max is NaN, statistics are unusable + -- if is_nan(min_val) or is_nan(max_val): + -- yield null -- Cannot derive useful guarantee + + -- Validate statistics are sensible + if min_val > max_val: + -- Corrupted statistics, be conservative + yield null + + -- Single value case: equality guarantee + if min_val = max_val: + let guarantee = equal(field_expr, literal(min_val)) + if stats.has_null: + yield or_(guarantee, is_null(field_expr)) + else: + yield guarantee + + -- Range case: bounded guarantee + else: + let guarantee = and_( + greater_equal(field_expr, literal(min_val)), + less_equal(field_expr, literal(max_val)) + ) + if stats.has_null: + yield or_(guarantee, is_null(field_expr)) + else: + yield guarantee + + -- Case 3: Statistics incomplete - no guarantee + else: + yield null -- conservative: cannot derive guarantee +} + + +-- ============================================================================= +-- EDGE CASE: EMPTY STRIPES +-- ============================================================================= +-- +-- A stripe with num_rows = 0 is technically valid but unusual. 
+-- Handling: +-- - FilterStripes EXCLUDES empty stripes regardless of predicate +-- - Empty stripes cannot satisfy any predicate (no rows to match) +-- - This is an optimization, not a correctness requirement + +rule FilterStripesEmptyCheck { + -- Empty stripes are always excluded from scan. + -- This check happens before statistics evaluation. + + when: CheckStripeEmpty(stripe) + + if stripe.num_rows = 0: + ensures: false.returned() -- Exclude: no rows to match + else: + ensures: true.returned() -- Proceed with statistics-based filtering +} + + +-- ============================================================================= +-- STRING/BINARY TRUNCATION HANDLING (NOT YET IMPLEMENTED) +-- ============================================================================= +-- +-- NOTE: String/binary support is not yet implemented. This section documents +-- the intended behavior for future implementation. +-- +-- ORC allows truncating min/max statistics for STRING/BINARY columns +-- to save space in the footer. 
The truncation is conservative: +-- +-- Truncated MIN: +-- - The stored min is a PREFIX of the actual minimum value +-- - stored_min <= actual_min (lexicographically) +-- - For predicate x >= 'abc', if stored_min = 'ab' (truncated from 'abc...'), +-- we cannot skip even if 'ab' < 'abc' because actual_min might be >= 'abc' +-- - CONSERVATIVE: Truncated min means we CANNOT prove x >= value is FALSE +-- +-- Truncated MAX: +-- - The stored max is the prefix with last byte incremented (if possible) +-- - If prefix is 'ab', truncated max might be 'ac' (increment 'b' to 'c') +-- - stored_max >= actual_max +-- - For predicate x <= 'abc', truncated max is still useful +-- - CONSERVATIVE: Truncated max is an upper bound, still valid for <= checks +-- +-- Detection: +-- - is_minimum_truncated flag indicates min truncation occurred +-- - is_maximum_truncated flag indicates max truncation occurred + +rule SimplifyWithTruncatedStatistics { + -- Handle STRING/BINARY predicates when statistics may be truncated. + -- NOTE: NOT YET IMPLEMENTED - string/binary support pending. 
+ + when: SimplifyWithGuarantee(comparison(field_ref(ref), literal(value)), guarantee) + where: value.type.id in {string, binary} -- Currently not matched (types unsupported) + + let field_bounds = ExtractFieldBounds(ref, guarantee) + + if field_bounds = null: + ensures: comparison(field_ref(ref), literal(value)).returned() + + -- Check for truncation + if field_bounds.min_truncated: + -- Cannot use min for lower-bound comparisons + if comparison in {greater, greater_equal}: + -- Cannot prove FALSE; return original predicate + ensures: comparison(field_ref(ref), literal(value)).returned() + + if field_bounds.max_truncated: + -- Max is an upper bound, still useful for upper-bound comparisons + -- But cannot prove TRUE from truncated max + pass -- Normal handling applies + + -- Proceed with normal simplification using possibly-truncated bounds +} + + +-- ============================================================================= +-- ROW INDEX STATISTICS (NOT YET IMPLEMENTED - FUTURE OPPORTUNITY) +-- ============================================================================= +-- +-- NOTE: Row index statistics are not currently used for predicate pushdown. +-- This section documents the opportunity for future implementation. +-- +-- ORC supports finer-grained statistics at the row index level, typically +-- every 10,000 rows (configurable via orc.row.index.stride). These statistics +-- are stored in the stripe's index streams and provide sub-stripe filtering. 
+-- +-- Current implementation: +-- - GetRowIndexStride() exists in the ORC adapter but is unused for pushdown +-- - Stripe-level statistics provide coarse-grained filtering only +-- +-- Future opportunity: +-- - Read row index statistics for even more aggressive filtering +-- - Skip row groups within a stripe when statistics prove no matches +-- - Particularly beneficial for large stripes with heterogeneous data +-- +-- Trade-offs: +-- - Additional I/O to read index streams (though typically small) +-- - More complex filtering logic with two-level statistics +-- - Diminishing returns if stripe-level filtering is already effective +-- +-- This mirrors Parquet's page-level statistics (column index) which provides +-- similar sub-row-group filtering capabilities. + + +-- ============================================================================= +-- COMPOUND PREDICATE HANDLING (OR, IN, NOT, !=) +-- ============================================================================= + +rule SimplifyOrPredicate { + -- Simplify OR predicates against statistics guarantees. + -- + -- Key insight: OR is satisfiable if ANY branch is satisfiable. + -- We can only skip a stripe if ALL branches are unsatisfiable. + -- + -- Partial statistics handling: + -- If some fields in the OR have statistics and others don't, + -- the branches without statistics are treated as "possibly true". 
+ + when: SimplifyWithGuarantee(or_(left, right), guarantee) + + let left_simplified = SimplifyWithGuarantee(left, guarantee) + let right_simplified = SimplifyWithGuarantee(right, guarantee) + + -- If either branch is definitely TRUE, the OR is TRUE + if left_simplified = literal(true) or right_simplified = literal(true): + ensures: literal(true).returned() + + -- If both branches are definitely FALSE, the OR is FALSE + if left_simplified = literal(false) and right_simplified = literal(false): + ensures: literal(false).returned() + + -- If one branch is FALSE, return the other + if left_simplified = literal(false): + ensures: right_simplified.returned() + if right_simplified = literal(false): + ensures: left_simplified.returned() + + -- Otherwise, return simplified OR + ensures: or_(left_simplified, right_simplified).returned() +} + +rule SimplifyAndPredicate { + -- Simplify AND predicates against statistics guarantees. + -- + -- Key insight: AND is satisfiable only if BOTH branches are satisfiable. + -- We can skip a stripe if EITHER branch is unsatisfiable. 
+ + when: SimplifyWithGuarantee(and_(left, right), guarantee) + + let left_simplified = SimplifyWithGuarantee(left, guarantee) + let right_simplified = SimplifyWithGuarantee(right, guarantee) + + -- If either branch is definitely FALSE, the AND is FALSE + if left_simplified = literal(false) or right_simplified = literal(false): + ensures: literal(false).returned() + + -- If both branches are definitely TRUE, the AND is TRUE + if left_simplified = literal(true) and right_simplified = literal(true): + ensures: literal(true).returned() + + -- If one branch is TRUE, return the other + if left_simplified = literal(true): + ensures: right_simplified.returned() + if right_simplified = literal(true): + ensures: left_simplified.returned() + + -- Otherwise, return simplified AND + ensures: and_(left_simplified, right_simplified).returned() +} + +rule SimplifyNotPredicate { + -- Simplify NOT predicates against statistics guarantees. + -- + -- In SQL three-valued logic, NOT(x > 10) is equivalent to x <= 10 for + -- non-NULL x; when x is NULL both forms evaluate to UNKNOWN + -- (NOT UNKNOWN -> UNKNOWN), so NULL rows satisfy neither form. + -- For pushdown purposes, we use: NOT(definitely_true) = definitely_false + -- + -- NOT inverts the satisfiability: + -- NOT(literal(true)) -> literal(false) + -- NOT(literal(false)) -> literal(true) + -- NOT(unknown) -> unknown (conservative) + + when: SimplifyWithGuarantee(not_(expr), guarantee) + + let inner_simplified = SimplifyWithGuarantee(expr, guarantee) + + if inner_simplified = literal(true): + ensures: literal(false).returned() + if inner_simplified = literal(false): + ensures: literal(true).returned() + + -- Cannot simplify further; return NOT of simplified inner + ensures: not_(inner_simplified).returned() +} + +rule SimplifyNotEqualPredicate { + -- Simplify NOT EQUAL (!=) predicates against statistics guarantees. + -- + -- x != value can skip a stripe when: + -- - min = max = value (all non-null values equal the excluded value) + -- + -- If NULLs are present, x != value evaluates to UNKNOWN for NULL rows, + -- never TRUE.
Therefore NULLs do not prevent exclusion in this case. + -- + -- Example: + -- Predicate: x != 10 + -- Statistics: min=10, max=10, has_null=false + -- Result: literal(false) -- all values are exactly 10, none satisfy != 10 + -- + -- Predicate: x != 10 + -- Statistics: min=10, max=10, has_null=true + -- Result: literal(false) -- rows are either 10 (FALSE) or NULL (UNKNOWN) + + when: SimplifyWithGuarantee(not_equal(field_ref(ref), literal(value)), guarantee) + + -- Extract statistics bounds from guarantee if available + let field_bounds = ExtractFieldBounds(ref, guarantee) + + if field_bounds = null: + -- No statistics for this field; cannot simplify + ensures: not_equal(field_ref(ref), literal(value)).returned() + + if field_bounds.min = field_bounds.max = value: + -- All non-null values equal the excluded value. + -- NULL rows evaluate to UNKNOWN for !=, so no row can be TRUE. + ensures: literal(false).returned() + + if value < field_bounds.min or value > field_bounds.max: + if field_bounds.has_nulls = false: + -- Value is outside the range, so != is always TRUE + ensures: literal(true).returned() + + -- Cannot prove true or false + ensures: not_equal(field_ref(ref), literal(value)).returned() +} + +rule SimplifyInPredicate { + -- Simplify IN predicates against statistics guarantees. + -- + -- x IN (v1, v2, v3) is equivalent to x = v1 OR x = v2 OR x = v3 + -- + -- Optimization: Use min/max range intersection. + -- If no values in the IN list fall within [min, max], skip the stripe. 
+ -- + -- Example: + -- Predicate: x IN (1, 2, 3) + -- Statistics: min=10, max=20 + -- Result: literal(false) -- no IN values are in [10, 20] + -- + -- Predicate: x IN (1, 15, 100) + -- Statistics: min=10, max=20 + -- Result: x IN (1, 15, 100) -- only 15 is in range, but keep full predicate + -- (post-scan filtering will handle correctness) + + when: SimplifyWithGuarantee(in_(field_ref(ref), values), guarantee) + + let field_bounds = ExtractFieldBounds(ref, guarantee) + + if field_bounds = null: + -- No statistics; cannot simplify + ensures: in_(field_ref(ref), values).returned() + + -- Filter values to those within [min, max] + let values_in_range = [ + v for v in values + if v >= field_bounds.min and v <= field_bounds.max + ] + + if values_in_range.empty: + if field_bounds.has_nulls = false: + -- No IN values overlap with statistics range, no NULLs + ensures: literal(false).returned() + else: + -- NULLs exist; IN with NULL handling is complex, be conservative + ensures: in_(field_ref(ref), values).returned() + + -- Some values are in range; return original predicate + -- Post-scan filtering will handle correctness + ensures: in_(field_ref(ref), values).returned() +} + + +-- ============================================================================= +-- Rule 3: Test a single stripe against a predicate +-- +-- Uses the stripe's statistics to simplify the predicate. If simplified +-- predicate is unsatisfiable, the stripe can be skipped. 
+-- ============================================================================= + +rule TestStripe { + when: TestStripe(stripe, predicate, predicate_fields, cache) + + -- Skip empty stripes immediately + if not CheckStripeEmpty(stripe): + ensures: literal(false).returned() + + -- Gather all field guarantees for this stripe + let stripe_guarantee = + for field in predicate_fields: + if field.supports_statistics: + let field_guarantee = DeriveFieldGuarantee(stripe, field) + if field_guarantee != null: + yield field_guarantee + combine with: and_ + + -- If no guarantees could be derived, return original predicate (conservative) + if stripe_guarantee = null: + ensures: predicate.returned() + + -- Simplify predicate using stripe guarantee + let simplified = SimplifyWithGuarantee(predicate, stripe_guarantee) + + ensures: simplified.returned() +} + + +-- ============================================================================= +-- EXPRESSION BINDING CONTRACT +-- ============================================================================= +-- +-- Predicates passed to FilterStripes/TestStripes may be unbound. +-- TestStripes handles binding defensively to ensure correctness. +-- +-- Preferred approach: Scanner binds expressions before passing to fragments. +-- Defensive approach: TestStripes checks and binds if needed. +-- +-- This matches the defensive pattern used in the actual implementation +-- (cpp/src/arrow/dataset/file_orc.cc line 514). +-- +-- Invariant: Binding is idempotent - re-binding a bound expression is safe. +-- +-- The binding process: +-- 1. Resolve field names to schema paths +-- 2. Infer types for field references +-- 3. 
Validate type compatibility for operations +-- +-- Example: +-- Unbound: field_ref("x") > 10 +-- Bound: field_ref([0], type=int64) > 10 (where "x" is at index 0) +-- ============================================================================= + + +-- ============================================================================= +-- Rule 4a: Test stripes - returns per-stripe simplified expressions +-- +-- This is the core statistics evaluation logic, separated from filtering +-- to enable reuse by TryCountRows and other optimization paths. +-- +-- Returns: List of simplified expressions, one per effective stripe. +-- Each expression represents the predicate after applying stripe statistics. +-- literal(false) means the stripe can be skipped. +-- literal(true) means all rows in stripe match. +-- Other expressions mean the stripe needs post-scan filtering. +-- +-- This mirrors Parquet's TestRowGroups (file_parquet.cc lines 918-1004). +-- ============================================================================= + +rule TestStripes { + when: TestStripes(orc_fragment, predicate) + + requires: orc_fragment.metadata != null + requires: orc_fragment.statistics_cache != null + requires: orc_fragment.manifest != null + + let cache = orc_fragment.statistics_cache + + -- Bind predicate if needed (defensive binding for edge cases) + let bound_predicate = + if predicate.is_bound: + predicate + else: + bind(predicate, orc_fragment.physical_schema) + + -- Simplify with partition guarantees first + let simplified_predicate = SimplifyWithGuarantee(bound_predicate, orc_fragment.partition_expression) + + -- Early exit: predicate unsatisfiable at partition level + if not simplified_predicate.is_satisfiable: + ensures: EmptyList.returned() + + -- Resolve predicate fields using manifest for proper column mapping + let predicate_fields = ResolvePredicateFields( + simplified_predicate, + orc_fragment.physical_schema, + orc_fragment.manifest + ) + + -- Find fields not yet in cache 
(incremental population) + let uncached_fields = [ + pf for pf in predicate_fields + if pf.field_ref not in cache.fields_processed + if pf.supports_statistics + ] + + -- Load statistics for uncached fields into cache + for pf in uncached_fields: + for (i, stripe_idx) in orc_fragment.effective_stripes.enumerate(): + let stripe = orc_fragment.metadata.stripes[stripe_idx] + let guarantee = DeriveFieldGuarantee(stripe, pf) + + if guarantee != null: + cache.stripe_guarantees[i] = FoldingAnd( + cache.stripe_guarantees[i], + guarantee + ) + + cache.fields_processed.add(pf.field_ref) + -- Use orc_column_index for statistics_complete (not arrow_field_index) + cache.statistics_complete[pf.orc_column_index] = true + + -- Return per-stripe simplified expressions + ensures: List.returned( + for (i, _) in orc_fragment.effective_stripes.enumerate(): + SimplifyWithGuarantee(simplified_predicate, cache.stripe_guarantees[i]) + ) +} + + +-- ============================================================================= +-- Rule 4b: Filter stripes - the main entry point +-- +-- Given a predicate, determine which stripes may contain matching rows. +-- Uses TestStripes for the core statistics evaluation. 
+-- ============================================================================= + +rule FilterStripes { + when: FilterStripes(orc_fragment, predicate) + + -- Ensure metadata, manifest, and statistics cache are loaded + EnsureFileMetadataCached(orc_fragment) + EnsureManifestCached(orc_fragment) + EnsureStatisticsCached(orc_fragment) + + -- Get per-stripe simplified expressions + let stripe_expressions = TestStripes(orc_fragment, predicate) + + -- Early exit: if empty (partition-level unsatisfiable) + if stripe_expressions.empty: + ensures: StripeFilterResult( + selected_indices: [], + skipped_count: orc_fragment.effective_stripes.count + ).returned() + + -- Select stripes where predicate is satisfiable + let selected = [] + for (i, stripe_idx) in orc_fragment.effective_stripes.enumerate(): + let stripe = orc_fragment.metadata.stripes[stripe_idx] + + -- Skip empty stripes + if stripe.num_rows = 0: + continue + + -- Include stripe if predicate is satisfiable + if stripe_expressions[i].is_satisfiable: + selected.append(stripe_idx) + + ensures: StripeFilterResult( + selected_indices: selected, + skipped_count: orc_fragment.effective_stripes.count - selected.count + ).returned() +} + + +-- ============================================================================= +-- Rule 4c: TryCountRows - count rows from metadata when possible +-- +-- Optimization: If the predicate can be fully evaluated using stripe statistics +-- (all stripes simplify to literal(true) or literal(false)), we can count +-- rows directly from metadata without reading any actual data. +-- +-- This mirrors Parquet's ParquetTryCountRows (file_parquet.cc lines 1299-1330). 
+-- ============================================================================= + +rule OrcTryCountRows { + when: TryCountRows(orc_fragment, predicate) + + -- Ensure metadata and statistics are cached + EnsureFileMetadataCached(orc_fragment) + EnsureManifestCached(orc_fragment) + EnsureStatisticsCached(orc_fragment) + + -- Fast path: no field references means count all rows + if not ExpressionHasFieldRefs(predicate): + -- Predicate is just literal(true) or similar + if predicate.is_satisfiable: + ensures: orc_fragment.metadata.num_rows.returned() + else: + ensures: 0.returned() + + -- Get per-stripe simplified expressions + let stripe_expressions = TestStripes(orc_fragment, predicate) + + -- If partition-level unsatisfiable, count is 0 + if stripe_expressions.empty: + ensures: 0.returned() + + -- Try to count from metadata alone + var total_rows = 0 + for (i, stripe_idx) in orc_fragment.effective_stripes.enumerate(): + let stripe = orc_fragment.metadata.stripes[stripe_idx] + let expr = stripe_expressions[i] + + -- Skip stripes that are provably empty + if not expr.is_satisfiable: + continue + + -- If expression is not literal(true), we can't count from metadata + -- because some rows in the stripe might not match + if expr != literal(true): + -- Cannot count from metadata alone; caller must do full scan + ensures: null.returned() + + -- This stripe fully matches; add its row count + total_rows += stripe.num_rows + + -- All stripes were either fully matched or fully excluded + ensures: total_rows.returned() +} + +function ExpressionHasFieldRefs(expr: Expression) -> Boolean { + -- Returns true if the expression references any fields + -- (as opposed to being purely literal-based) + return expr.fields_referenced.count > 0 +} + + +-- ============================================================================= +-- Invariants +-- ============================================================================= + +-- Invariant: Predicate pushdown is conservative (never 
loses valid rows) +-- +-- Any row that matches the predicate MUST be in a stripe that is selected. +-- Equivalently: we may include extra stripes, but never exclude required ones. +invariant ConservativeFiltering { + for orc_fragment, predicate: + let result = FilterStripes(orc_fragment, predicate) + for row in orc_fragment.all_rows: + if predicate.matches(row): + row.stripe.index in result.selected_indices +} + +-- Invariant: Missing statistics always include the stripe +invariant MissingStatisticsInclude { + for orc_fragment, predicate: + let result = FilterStripes(orc_fragment, predicate) + for stripe in orc_fragment.effective_stripes: + -- If any field in predicate lacks statistics for this stripe + let has_all_stats = all( + for field in predicate.fields_referenced: + stripe.column_statistics.get(field.index) != null + and stripe.column_statistics.get(field.index).has_minimum + and stripe.column_statistics.get(field.index).has_maximum + ) + -- If statistics are incomplete, stripe must be included + -- (unless excluded by other fully-statistical fields) + if not has_all_stats: + -- The stripe MIGHT be included; we cannot prove exclusion + -- without complete statistics for all predicate fields + true -- This is a necessary condition, not sufficient +} + +-- Invariant: Unsupported field types do not cause exclusion +-- +-- If a predicate references a field whose type does not support statistics +-- (e.g., boolean, nested types), that field cannot contribute to stripe exclusion. +-- The stripe must be included unless other supported fields prove exclusion. 
+invariant UnsupportedTypesConservative { + for orc_fragment, predicate: + let result = FilterStripes(orc_fragment, predicate) + let unsupported_fields = [ + f for f in predicate.fields_referenced + if f.type.id not in SupportedStatisticsType + ] + -- If predicate ONLY references unsupported fields, all non-empty stripes included + if unsupported_fields.count = predicate.fields_referenced.count: + result.selected_indices = [idx for idx in orc_fragment.effective_stripes if orc_fragment.metadata.stripes[idx].num_rows > 0] +} + +invariant EmptyStripesExcluded { + for orc_fragment, predicate: + let result = FilterStripes(orc_fragment, predicate) + for stripe_idx in result.selected_indices: + orc_fragment.metadata.stripes[stripe_idx].num_rows > 0 +} + +-- Invariant: Deprecated statistics are not trusted +invariant DeprecatedStatisticsIgnored { + for orc_fragment, predicate: + let result = FilterStripes(orc_fragment, predicate) + for stripe in orc_fragment.effective_stripes: + for field in predicate.fields_referenced: + let stats = stripe.column_statistics.get(field.index) + -- If statistics are deprecated, they cannot cause exclusion + if stats != null and stats.is_statistics_deprecated: + -- This field's statistics do not contribute to exclusion + true +} + +-- Invariant: Metadata and manifest are immutable once populated +-- +-- Once OrcFileMetadata or OrcSchemaManifest is set on a fragment, it never changes. +-- This allows safe sharing across subset fragments without defensive copies. +-- The statistics_cache is mutable (accumulates field statistics) but metadata/manifest are not. +invariant MetadataCacheImmutability { + -- Once metadata is populated, it never changes. + -- This allows safe sharing across subset fragments. 
+ + if fragment.metadata != null: + fragment.metadata = constant + + if fragment.manifest != null: + fragment.manifest = constant +} + +-- Invariant: Statistics guarantees monotonically tighten +-- +-- Each FoldingAnd operation can only narrow the possible value space, never widen it. +-- This ensures that adding more field statistics never causes a stripe to be +-- incorrectly excluded that was previously included. +invariant StatisticsMonotonicity { + -- Statistics guarantees can only become more restrictive. + -- Each FoldingAnd operation narrows the possible value space. + + for stripe_idx in fragment.effective_stripes: + after_field_added(cache.stripe_guarantees[stripe_idx]) implies + before_field_added(cache.stripe_guarantees[stripe_idx]) +} + + +-- ============================================================================= +-- CONCURRENT CACHE UPDATE SYNCHRONIZATION +-- ============================================================================= +-- +-- When multiple threads scan the same OrcFileFragment concurrently, +-- they may attempt to update the statistics_cache simultaneously. +-- +-- Synchronization Mechanism: +-- The statistics_cache is protected by physical_schema_mutex_. +-- All reads and writes to the cache must acquire this lock. +-- +-- Critical Sections: +-- 1. EnsureFileMetadataCached: Lock while checking/setting metadata +-- 2. EnsureManifestCached: Lock while checking/setting manifest +-- 3. EnsureStatisticsCached: Lock while initializing cache +-- 4. TestStripes: Lock while reading/updating fields_processed and stripe_guarantees +-- +-- Concurrent Update Protocol: +-- Thread A Thread B +-- --------- --------- +-- acquire(lock) +-- check: field in fields_processed? +-- (no) -> compute statistics +-- update stripe_guarantees +-- add field to fields_processed +-- release(lock) +-- acquire(lock) +-- check: field in fields_processed? 
+-- (yes) -> skip, use cached result +-- release(lock) +-- +-- Idempotency Guarantee: +-- If both threads compute statistics for the same field before either +-- checks fields_processed, they will compute identical guarantees. +-- Double-computation is wasteful but not incorrect. +-- +-- Invariant: All statistics cache mutations are serialized via physical_schema_mutex_. +-- +-- Note: The actual implementation (cpp/src/arrow/dataset/file_orc.cc) uses +-- physical_schema_mutex_ at lines 485, 534, 618 for this synchronization. +-- This matches Parquet's thread safety model (file_parquet.cc lines 2139-2199). + +entity CacheLock { + -- Mutex protecting the statistics cache of an OrcFileFragment + fragment: OrcFileFragment +} + +function acquire(lock: CacheLock) -> void { + -- Acquire exclusive access to the fragment's cache +} + +function release(lock: CacheLock) -> void { + -- Release exclusive access to the fragment's cache +} + + +-- ============================================================================= +-- Cache Management Rules +-- ============================================================================= + +-- Statistics are lazily computed and cached per-field per-fragment. +-- Once a field's statistics are loaded across all stripes, it is marked complete. +-- Subsequent predicates using that field reuse cached guarantees. + +rule EnsureFileMetadataCached { + -- Load file metadata if not already cached. + -- This is required before accessing stripe statistics. + + when: EnsureFileMetadataCached(orc_fragment) + + requires: orc_fragment.cache_status != invalidated + + if orc_fragment.metadata = null: + let metadata = ReadOrcFileMetadata(orc_fragment.source) + ensures: orc_fragment.metadata = metadata + ensures: orc_fragment.physical_schema = metadata.schema +} + +rule EnsureStatisticsCached { + -- Initialize the statistics cache if not already done. + -- Safe to call multiple times (idempotent). 
+ + when: EnsureStatisticsCached(orc_fragment) + + requires: orc_fragment.cache_status != invalidated + requires: orc_fragment.metadata != null + + if orc_fragment.statistics_cache = null: + let num_stripes = orc_fragment.effective_stripes.length + let num_columns = orc_fragment.physical_schema.fields.length -- NOTE(review): statistics_complete is indexed by orc_column_index (see TestStripes); for nested types the ORC column count can exceed the Arrow top-level field count -- confirm sizing + + ensures: orc_fragment.statistics_cache = StripeStatisticsCache( + stripe_guarantees: [literal(true) for _ in 0..num_stripes], + fields_processed: {}, + statistics_complete: [false for _ in 0..num_columns] + ) + ensures: orc_fragment.cache_status = cached +} + +rule EnsureManifestCached { + -- Build and cache the schema manifest if not already done. + -- The manifest maps Arrow schema fields to ORC column indices. + + when: EnsureManifestCached(orc_fragment) + + requires: orc_fragment.metadata != null + + if orc_fragment.manifest = null: + let manifest = BuildOrcSchemaManifest(orc_fragment.metadata) + ensures: orc_fragment.manifest = manifest +} + +rule ClearCachedMetadata { + -- Invalidate all cached metadata. + -- Next FilterStripes call will rebuild from scratch. + + when: ClearCachedMetadata(orc_fragment) + + ensures: orc_fragment.statistics_cache = null + ensures: orc_fragment.manifest = null + ensures: orc_fragment.metadata = null + ensures: orc_fragment.cache_status = uncached +} + + +-- ============================================================================= +-- Expression Simplification +-- ============================================================================= + +function SimplifyWithGuarantee(predicate: Expression, guarantee: Expression) -> Expression { + -- Simplify a predicate given a guarantee expression + -- + -- The guarantee represents facts known to be true about the data. + -- If the predicate can be proven true/false given the guarantee, + -- it is replaced with literal(true) or literal(false). 
+ -- + -- Example: + -- predicate: x > 10 + -- guarantee: x >= 15 AND x <= 20 + -- result: literal(true) -- because min(x) > 10 + -- + -- predicate: x < 5 + -- guarantee: x >= 15 AND x <= 20 + -- result: literal(false) -- because min(x) >= 5, so no value can be < 5 + + -- Implementation uses algebraic simplification rules + -- Returns simplified expression +} + +function FoldingAnd(left: Expression, right: Expression) -> Expression { + -- Combine two expressions with AND, optimizing for literal(true) + + if left = literal(true): + return right + else: + return and_(left, right) +} + +function ExtractFieldBounds(ref: FieldRef, guarantee: Expression) -> FieldBounds? { + -- Extract min/max bounds for a field from a guarantee expression. + -- Returns null if the field is not constrained by the guarantee. + -- + -- Parses guarantee expressions of the form: + -- (field >= min AND field <= max) OR is_null(field) + -- + -- This is the inverse of DeriveFieldGuarantee. +} + + +-- ============================================================================= +-- ORC Format Scan - Integration with Dataset API +-- ============================================================================= + +rule OrcFileFormatScanBatchesAsync { + -- Scan an ORC fragment, applying predicate pushdown + -- This is the main entry point showing the complete flow + + when: ScanBatchesAsync(format: OrcFileFormat, scan_options, orc_fragment) + + -- OPTIMIZATION: Pre-filter stripes if metadata is already cached + -- This avoids opening the file at all if all stripes are excluded + var stripes: List + var pre_filtered = false + + if orc_fragment.metadata != null: + let filter_result = FilterStripes(orc_fragment, scan_options.filter) + stripes = filter_result.selected_indices + pre_filtered = true + + -- Early exit: if no stripes match, return empty generator + if stripes.empty: + ensures: EmptyRecordBatchGenerator.returned() + + -- Open the ORC file reader (the actual I/O cost) + let reader = OpenOrcReader(orc_fragment.source, 
scan_options) + + -- Ensure metadata and statistics cache are loaded + EnsureFileMetadataCached(orc_fragment) + EnsureStatisticsCached(orc_fragment) + + -- Filter stripes (if not already done) + if not pre_filtered: + let filter_result = FilterStripes(orc_fragment, scan_options.filter) + stripes = filter_result.selected_indices + + -- Early exit: if no stripes match, return empty generator + if stripes.empty: + ensures: EmptyRecordBatchGenerator.returned() + + -- Compute column projection from materialized fields + let column_projection = InferColumnProjection(reader, scan_options) + + -- Create the record batch generator + let generator = reader.GetRecordBatchGenerator( + stripes: stripes, + column_projection: column_projection, + batch_size: scan_options.batch_size + ) + + -- SLICING: Enforce maximum batch size via SlicingGenerator + -- This ensures batches respect batch_size even if the ORC reader + -- produces larger batches (e.g., when a stripe has more rows than batch_size). + -- This mirrors Parquet's use of SlicingGenerator (file_parquet.cc line 1442). + let sliced_generator = SlicingGenerator(generator, scan_options.batch_size) + + -- Apply readahead if enabled + if scan_options.batch_readahead > 0: + ensures: ReadaheadGenerator.returned(sliced_generator, scan_options.batch_readahead) + else: + ensures: sliced_generator.returned() +} + +rule OrcSubset { + -- Create a new fragment selecting a subset of stripes. + -- + -- CACHE SHARING POLICY: + -- The subset fragment shares the IMMUTABLE metadata and manifest but gets + -- a FRESH statistics_cache. 
+ -- + -- Rationale: + -- - metadata (OrcFileMetadata) is immutable and safe to share + -- - manifest (OrcSchemaManifest) is immutable and safe to share + -- - statistics_cache contains per-stripe guarantees indexed by position + -- - Subset fragments have different stripes lists, so cache indices don't align + -- - A fresh cache is simpler and avoids index translation complexity + + when: Subset(orc_fragment, predicate) + + let filter_result = FilterStripes(orc_fragment, predicate) + + ensures: OrcFileFragment.created( + source: orc_fragment.source, + format: orc_fragment.format, + partition_expression: orc_fragment.partition_expression, + physical_schema: orc_fragment.physical_schema, + stripes: filter_result.selected_indices, + metadata: orc_fragment.metadata, -- SHARED: immutable file metadata + manifest: orc_fragment.manifest, -- SHARED: immutable schema manifest + statistics_cache: null, -- FRESH: new cache for subset indices + cache_status: uncached -- Will be initialized on first scan + ) +} + +rule OrcSplitByStripe { + -- Split a fragment into multiple fragments, one per stripe + + when: SplitByStripe(orc_fragment, predicate) + + let filter_result = FilterStripes(orc_fragment, predicate) + + ensures: List.returned( + for stripe_idx in filter_result.selected_indices: + OrcFileFragment.created( + source: orc_fragment.source, + format: orc_fragment.format, + partition_expression: orc_fragment.partition_expression, + physical_schema: orc_fragment.physical_schema, + stripes: [stripe_idx], + metadata: orc_fragment.metadata, + manifest: orc_fragment.manifest -- SHARED: immutable schema manifest + ) + ) +} + + +-- ============================================================================= +-- Column Projection Inference +-- ============================================================================= + +rule InferColumnProjection { + -- Compute the ORC column indices needed for the scan + -- Based on the fields referenced in filter and projection + + when: 
InferColumnProjection(reader, scan_options) + + let field_refs = MaterializedFields(scan_options) + + var column_indices: List = [] + + for field_ref in field_refs: + -- Look up the field in the schema + let field = reader.schema.resolve(field_ref) + + if field != null: + -- Add the column index + column_indices.append(field.index) + -- else: Virtual column (not in file), skip + + ensures: column_indices.returned() +} + +function MaterializedFields(options: ScanOptions) -> Set { + -- Compute the union of fields referenced in filter and projection + return options.filter.fields_referenced + .union(options.projection.fields_referenced) +} + + +-- ============================================================================= +-- DATA FLOW: Full predicate pushdown sequence +-- ============================================================================= + +-- The complete flow from user code to data filtering: +-- +-- +-----------------------------------------------------------------------------+ +-- | USER CODE | +-- +-----------------------------------------------------------------------------+ +-- | 1. Create ScannerBuilder with dataset | +-- | 2. builder.Filter(predicate) - stores unbound filter | +-- | 3. builder.Project(columns) - determines materialized fields | +-- | 4. builder.Finish() -> Scanner | +-- | 5. scanner.ScanBatches() or scanner.ToTable() | +-- +-----------------------------------------------------------------------------+ +-- | +-- v +-- +-----------------------------------------------------------------------------+ +-- | DATASET LEVEL: Partition Pruning | +-- +-----------------------------------------------------------------------------+ +-- | 6. 
Scanner calls dataset.GetFragments(filter) | +-- | For each fragment in dataset: | +-- | simplified = SimplifyWithGuarantee(filter, fragment.partition_expr) | +-- | if simplified.is_satisfiable: | +-- | yield fragment | +-- | else: | +-- | SKIP (partition pruned) | +-- +-----------------------------------------------------------------------------+ +-- | +-- v +-- +-----------------------------------------------------------------------------+ +-- | FRAGMENT LEVEL: Stripe Filtering (ORC-specific) | +-- +-----------------------------------------------------------------------------+ +-- | 7. For each non-pruned Fragment: | +-- | a. EnsureFileMetadataCached() / EnsureStatisticsCached() | +-- | - Load ORC file footer (stripes, column stats) | +-- | - Cache in fragment for reuse | +-- | | +-- | b. FilterStripes(filter) called by ScanBatchesAsync: | +-- | i. Simplify filter with partition guarantee | +-- | ii. For each field in filter: | +-- | - Look up column in schema | +-- | - For each stripe: extract column statistics | +-- | - Convert to guarantee: min <= field <= max | +-- | - Fold into per-stripe statistics_expressions | +-- | iii. For each stripe: | +-- | simplified = SimplifyWithGuarantee(filter, stats_expr) | +-- | if simplified.is_satisfiable: | +-- | include stripe | +-- | else: | +-- | SKIP (statistics pruned) | +-- +-----------------------------------------------------------------------------+ +-- | +-- v +-- +-----------------------------------------------------------------------------+ +-- | COLUMN LEVEL: Projection | +-- +-----------------------------------------------------------------------------+ +-- | 8. 
InferColumnProjection() | +-- | - Compute MaterializedFields from filter + projection | +-- | - Map to ORC column indices | +-- | - Only these columns are read from disk | +-- +-----------------------------------------------------------------------------+ +-- | +-- v +-- +-----------------------------------------------------------------------------+ +-- | FILE READER: Batch Generation | +-- +-----------------------------------------------------------------------------+ +-- | 9. reader.GetRecordBatchGenerator(stripes, column_projection) | +-- | - Read only selected stripes | +-- | - Read only projected columns | +-- | - Apply readahead for parallelism | +-- +-----------------------------------------------------------------------------+ +-- | +-- v +-- +-----------------------------------------------------------------------------+ +-- | POST-SCAN: Filtering and Evolution | +-- +-----------------------------------------------------------------------------+ +-- | 10. For batches that were NOT fully filtered by statistics: | +-- | - Apply remaining filter to actual row values | +-- | - Evolve batch from fragment schema to dataset schema | +-- | - Apply projection expression | +-- | - Yield to user | +-- +-----------------------------------------------------------------------------+ + +-- Performance benefits at each level: +-- - Partition pruning: Skip entire files (major I/O savings) +-- - Stripe filtering: Skip portions of files (moderate I/O savings) +-- - Column projection: Read fewer columns (I/O and memory savings) +-- - Statistics metadata: Tiny compared to actual data (minimal overhead) + + +-- ============================================================================= +-- External Entities (defined elsewhere or implementation-specific) +-- ============================================================================= + +external entity RecordBatch { + -- A batch of columnar data with a schema + schema: Schema + num_rows: Int +} + +external entity Table { + -- 
A collection of record batches with a common schema + schema: Schema + num_rows: Int +} + +external entity Buffer { + -- Raw byte buffer for in-memory data + size: Int +} + +external entity FileSystem { + -- Abstract filesystem interface (local, S3, GCS, HDFS, etc.) +} + +external entity FileSource { + -- Identifies where a file's data comes from + kind: path | buffer | custom + path: String? + buffer: Buffer? +} + +external entity FileFragment extends Fragment { + -- A Fragment that is stored in a file with a known format + source: FileSource + format: FileFormat +} + +external entity FileFormat { + -- Base class for file format implementations + type_name: String +} + +external entity OrcFileFormat extends FileFormat { + type_name: "orc" +} + +external entity ScanOptions { + -- Scan-specific options + filter: Expression + projection: Expression + batch_size: Int + batch_readahead: Int +} + +external entity Fragment { + -- Base class for dataset fragments + partition_expression: Expression + physical_schema: Schema? +} + +function ReadOrcFileMetadata(source: FileSource) -> OrcFileMetadata +function OpenOrcReader(source: FileSource, options: ScanOptions) -> OrcReader + +external entity OrcReader { + schema: Schema + GetRecordBatchGenerator(stripes: List, column_projection: List, batch_size: Int) -> RecordBatchGenerator +} + +external entity RecordBatchGenerator { + -- Async generator yielding record batches +} + +external entity EmptyRecordBatchGenerator extends RecordBatchGenerator { + -- Generator that yields no batches +} + +external entity ReadaheadGenerator extends RecordBatchGenerator { + -- Generator with readahead buffering +} + +external entity SlicingGenerator extends RecordBatchGenerator { + -- Wraps a generator to enforce maximum batch size + -- If the source generator produces batches larger than batch_size, + -- SlicingGenerator slices them into smaller batches. 
+ -- + -- This ensures consistent batch sizes regardless of source behavior, + -- matching Parquet's SlicingGenerator (file_parquet.cc line 1442). + source: RecordBatchGenerator + batch_size: Int +} + +function SlicingGenerator(source: RecordBatchGenerator, batch_size: Int) -> SlicingGenerator { + -- Create a SlicingGenerator wrapping the source generator +} diff --git a/python/.gitignore b/python/.gitignore index dec4ffc1c9b9..ce97ba4af623 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -37,8 +37,6 @@ htmlcov # Cache .cache -# benchmark working dir -.asv pyarrow/_table_api.h # manylinux temporary files diff --git a/python/MANIFEST.in b/python/MANIFEST.in index ed7012e4b701..c37446c64fe4 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -4,6 +4,8 @@ include ../NOTICE.txt global-include CMakeLists.txt graft pyarrow +graft pyarrow-stubs +include scripts/update_stub_docstrings.py graft cmake_modules global-exclude *.so @@ -12,4 +14,3 @@ global-exclude *~ global-exclude \#* global-exclude .git* global-exclude .DS_Store -prune .asv diff --git a/python/asv-build.sh b/python/asv-build.sh deleted file mode 100755 index 2de4a2453b6d..000000000000 --- a/python/asv-build.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e - -# ASV doesn't activate its conda environment for us -if [ -z "$ASV_ENV_DIR" ]; then exit 1; fi - -if [ -z "$CONDA_HOME" ]; then - echo "Please set \$CONDA_HOME to point to your root conda installation" - exit 1; -fi - -eval "$($CONDA_HOME/bin/conda shell.bash hook)" - -conda activate $ASV_ENV_DIR -echo "== Conda Prefix for benchmarks: " $CONDA_PREFIX " ==" - -# Build Arrow C++ libraries -export ARROW_HOME=$CONDA_PREFIX -export PARQUET_HOME=$CONDA_PREFIX -export ORC_HOME=$CONDA_PREFIX -export PROTOBUF_HOME=$CONDA_PREFIX -export BOOST_ROOT=$CONDA_PREFIX - -pushd ../cpp -mkdir -p build -pushd build - -cmake -GNinja \ - -DCMAKE_BUILD_TYPE=release \ - -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ - -DARROW_CXXFLAGS=$CXXFLAGS \ - -DARROW_USE_GLOG=off \ - -DARROW_FLIGHT=on \ - -DARROW_GCS=on \ - -DARROW_ORC=on \ - -DARROW_PARQUET=on \ - -DARROW_PYTHON=on \ - -DARROW_S3=on \ - -DARROW_BUILD_TESTS=off \ - .. -cmake --build . --target install - -popd -popd - -# Build pyarrow wrappers -export SETUPTOOLS_SCM_PRETEND_VERSION=0.0.1 -export PYARROW_BUILD_TYPE=release -export PYARROW_PARALLEL=8 -export PYARROW_WITH_FLIGHT=1 -export PYARROW_WITH_GCS=1 -export PYARROW_WITH_ORC=1 -export PYARROW_WITH_PARQUET=1 - -python setup.py clean -find pyarrow -name "*.so" -delete -python setup.py develop diff --git a/python/asv.conf.json b/python/asv.conf.json deleted file mode 100644 index b975936c99a1..000000000000 --- a/python/asv.conf.json +++ /dev/null @@ -1,187 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -{ - // The version of the config file format. Do not change, unless - // you know what you are doing. - "version": 1, - - // The name of the project being benchmarked - "project": "pyarrow", - - // The project's homepage - "project_url": "https://arrow.apache.org/", - - // The URL or local path of the source code repository for the - // project being benchmarked - "repo": "..", - - // The Python project's subdirectory in your repo. If missing or - // the empty string, the project is assumed to be located at the root - // of the repository. - "repo_subdir": "python", - - // Custom build commands for Arrow. - "build_command": ["/bin/bash {build_dir}/asv-build.sh"], - "install_command": ["/bin/bash {build_dir}/asv-install.sh"], - "uninstall_command": ["/bin/bash {build_dir}/asv-uninstall.sh"], - - // List of branches to benchmark. If not provided, defaults to "master" - // (for git) or "default" (for mercurial). - // "branches": ["master"], // for git - // "branches": ["default"], // for mercurial - - // The DVCS being used. If not set, it will be automatically - // determined from "repo" by looking at the protocol in the URL - // (if remote), or by looking for special directories, such as - // ".git" (if local). - "dvcs": "git", - - // The tool to use to create environments. May be "conda", - // "virtualenv" or other value depending on the plugins in use. - // If missing or the empty string, the tool will be automatically - // determined by looking for tools on the PATH environment - // variable. 
- "environment_type": "conda", - // Avoid conda-forge to avoid C++ ABI issues - "conda_channels": ["defaults"], - - // the base URL to show a commit for the project. - "show_commit_url": "https://github.com/apache/arrow/commit/", - - // The Pythons you'd like to test against. If not provided, defaults - // to the current version of Python used to run `asv`. - "pythons": ["3.9"], - - // The matrix of dependencies to test. Each key is the name of a - // package (in PyPI) and the values are version numbers. An empty - // list or empty string indicates to just test against the default - // (latest) version. null indicates that the package is to not be - // installed. If the package to be tested is only available from - // PyPi, and the 'environment_type' is conda, then you can preface - // the package name by 'pip+', and the package will be installed via - // pip (with all the conda available packages installed first, - // followed by the pip installed packages). - // - // "matrix": { - // "numpy": ["1.6", "1.7"], - // "six": ["", null], // test with and without six installed - // "pip+emcee": [""], // emcee is only available for install with pip. - // }, - "matrix": { - // Use older boost since it works on more editions of the project - "aws-sdk-cpp": [], - "boost-cpp": ["1.68.0"], - "brotli": [], - "cmake": [], - "cython": [], - "flatbuffers": [], - "libgrpc": [], - "libprotobuf": [], - "lz4-c": [], - "ninja": [], - "numpy": [], - "pandas": ["0.25.1"], - "pip+setuptools_scm": [], - "rapidjson": [], - "re2": [], - "snappy": [], - "thrift-cpp": [], - "zstd": [], - }, - - // Combinations of libraries/python versions can be excluded/included - // from the set to test. Each entry is a dictionary containing additional - // key-value pairs to include/exclude. - // - // An exclude entry excludes entries where all values match. The - // values are regexps that should match the whole string. - // - // An include entry adds an environment. 
Only the packages listed - // are installed. The 'python' key is required. The exclude rules - // do not apply to includes. - // - // In addition to package names, the following keys are available: - // - // - python - // Python version, as in the *pythons* variable above. - // - environment_type - // Environment type, as above. - // - sys_platform - // Platform, as in sys.platform. Possible values for the common - // cases: 'linux2', 'win32', 'cygwin', 'darwin'. - // - // "exclude": [ - // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows - // {"environment_type": "conda", "six": null}, // don't run without six on conda - // ], - // - // "include": [ - // // additional env for python2.7 - // {"python": "2.7", "numpy": "1.8"}, - // // additional env if run on windows+conda - // {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""}, - // ], - - // The directory (relative to the current directory) that benchmarks are - // stored in. If not provided, defaults to "benchmarks" - "benchmark_dir": "benchmarks", - - // The directory (relative to the current directory) to cache the Python - // environments in. If not provided, defaults to "env" - "env_dir": ".asv/env", - - // The directory (relative to the current directory) that raw benchmark - // results are stored in. If not provided, defaults to "results". - "results_dir": ".asv/results", - - // The directory (relative to the current directory) that the html tree - // should be written to. If not provided, defaults to "html". - "html_dir": "build/benchmarks/html", - - // The number of characters to retain in the commit hashes. - // "hash_length": 8, - - // `asv` will cache wheels of the recent builds in each - // environment, making them faster to install next time. This is - // number of builds to keep, per environment. - // "wheel_cache_size": 0, - - // The commits after which the regression search in `asv publish` - // should start looking for regressions. 
Dictionary whose keys are - // regexps matching to benchmark names, and values corresponding to - // the commit (exclusive) after which to start looking for - // regressions. The default is to start from the first commit - // with results. If the commit is `null`, regression detection is - // skipped for the matching benchmark. - // - // "regressions_first_commits": { - // "some_benchmark": "352cdf", // Consider regressions only after this commit - // "another_benchmark": null, // Skip regression detection altogether - // } - - // The thresholds for relative change in results, after which `asv - // publish` starts reporting regressions. Dictionary of the same - // form as in ``regressions_first_commits``, with values - // indicating the thresholds. If multiple entries match, the - // maximum is taken. If no entry matches, the default is 5%. - // - // "regressions_thresholds": { - // "some_benchmark": 0.01, // Threshold of 1% - // "another_benchmark": 0.5, // Threshold of 50% - // } -} diff --git a/python/pyarrow-stubs/pyarrow/__init__.pyi b/python/pyarrow-stubs/pyarrow/__init__.pyi new file mode 100644 index 000000000000..ccec8d5abc07 --- /dev/null +++ b/python/pyarrow-stubs/pyarrow/__init__.pyi @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Type stubs for PyArrow. + +This is a placeholder stub file. +Complete type annotations will be added in subsequent PRs. +""" + +from typing import Any + +# TODO(GH-48970): remove __getattr__ before release as this +# will annotate non-existing attributes as Any. +# https://github.com/apache/arrow/issues/48970 +def __getattr__(name: str) -> Any: ... diff --git a/python/pyarrow/_flight.pyx b/python/pyarrow/_flight.pyx index b7e7af260c26..f447129cf40a 100644 --- a/python/pyarrow/_flight.pyx +++ b/python/pyarrow/_flight.pyx @@ -1666,7 +1666,7 @@ cdef class FlightClient(_Weakrefable): result = Result.__new__(Result) with nogil: check_flight_status(results.get().Next().Value(&result.result)) - if result.result == NULL: + if result.result == nullptr: break yield result return _do_action_response() @@ -1695,7 +1695,7 @@ cdef class FlightClient(_Weakrefable): result = FlightInfo.__new__(FlightInfo) with nogil: check_flight_status(listing.get().Next().Value(&result.info)) - if result.info == NULL: + if result.info == nullptr: break yield result diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index c1c20026db0b..fa89b6812eba 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1524,7 +1524,7 @@ cdef compression_name_from_enum(ParquetCompression compression_): cdef int check_compression_name(name) except -1: if name.upper() not in {'NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', - 'ZSTD'}: + 'LZ4_RAW', 'ZSTD'}: raise ArrowException("Unsupported compression: " + name) return 0 @@ -1539,7 +1539,7 @@ cdef ParquetCompression compression_from_name(name): return ParquetCompression_LZO elif name == 'BROTLI': return ParquetCompression_BROTLI - elif name == 'LZ4': + elif name == 'LZ4' or name == 'LZ4_RAW': return ParquetCompression_LZ4 elif name == 'ZSTD': return ParquetCompression_ZSTD @@ -1811,7 +1811,7 @@ 
cdef class ParquetReader(_Weakrefable): table : pyarrow.Table """ cdef: - shared_ptr[CTable] ctable + CResult[shared_ptr[CTable]] table_result vector[int] c_row_groups vector[int] c_column_indices @@ -1825,15 +1825,13 @@ cdef class ParquetReader(_Weakrefable): c_column_indices.push_back(index) with nogil: - check_status(self.reader.get() - .ReadRowGroups(c_row_groups, c_column_indices, - &ctable)) + table_result = self.reader.get().ReadRowGroups(c_row_groups, + c_column_indices) else: # Read all columns with nogil: - check_status(self.reader.get() - .ReadRowGroups(c_row_groups, &ctable)) - return pyarrow_wrap_table(ctable) + table_result = self.reader.get().ReadRowGroups(c_row_groups) + return pyarrow_wrap_table(GetResultValue(table_result)) def read_all(self, column_indices=None, bint use_threads=True): """ diff --git a/python/pyarrow/includes/libparquet.pxd b/python/pyarrow/includes/libparquet.pxd index c19977396fb1..f82ddd4197b6 100644 --- a/python/pyarrow/includes/libparquet.pxd +++ b/python/pyarrow/includes/libparquet.pxd @@ -534,15 +534,13 @@ cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil: CStatus ReadSchemaField(int i, shared_ptr[CChunkedArray]* out) int num_row_groups() - CStatus ReadRowGroup(int i, shared_ptr[CTable]* out) - CStatus ReadRowGroup(int i, const vector[int]& column_indices, - shared_ptr[CTable]* out) - - CStatus ReadRowGroups(const vector[int]& row_groups, - shared_ptr[CTable]* out) - CStatus ReadRowGroups(const vector[int]& row_groups, - const vector[int]& column_indices, - shared_ptr[CTable]* out) + CResult[shared_ptr[CTable]] ReadRowGroup(int i) + CResult[shared_ptr[CTable]] ReadRowGroup(int i, + const vector[int]& column_indices) + + CResult[shared_ptr[CTable]] ReadRowGroups(const vector[int]& row_groups) + CResult[shared_ptr[CTable]] ReadRowGroups(const vector[int]& row_groups, + const vector[int]& column_indices) CResult[unique_ptr[CRecordBatchReader]] GetRecordBatchReader(const vector[int]& row_group_indices, 
const vector[int]& column_indices) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 676bc445238e..354f18124b53 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -768,7 +768,9 @@ def _sanitize_table(table, new_schema, flavor): doesn't support dictionary encoding. compression : str or dict, default 'snappy' Specify the compression codec, either on a general basis or per-column. - Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. + Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'LZ4_RAW', 'ZSTD'}. + 'LZ4_RAW' is accepted as an alias for 'LZ4' (both use the LZ4_RAW + codec as defined in the Parquet specification). write_statistics : bool or list, default True Specify if we should write statistics in general (default is True) or only for some columns. diff --git a/python/asv-install.sh b/python/pyarrow/py.typed old mode 100755 new mode 100644 similarity index 85% rename from python/asv-install.sh rename to python/pyarrow/py.typed index beef730b7b8c..13a83393a912 --- a/python/asv-install.sh +++ b/python/pyarrow/py.typed @@ -1,5 +1,3 @@ -#!/usr/bin/env bash - # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -16,6 +14,3 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -# Deliberately empty, but exists so that we don't have to change -# asv.conf.json if we need specific commands here. diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 8e258e38afef..361ba145c8b6 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1877,10 +1877,12 @@ cdef class _Tabular(_PandasConvertible): >>> df = pd.DataFrame({'year': [None, 2022, 2019, 2021], ... 'n_legs': [2, 4, 5, 100], ... 
'animals': ["Flamingo", "Horse", None, "Centipede"]}) - >>> table = pa.Table.from_pandas(df) + >>> table = pa.Table.from_arrays( + ... [[None, 2022, 2019, 2021], [2, 4, 5, 100], ["Flamingo", "Horse", None, "Centipede"]], + ... names=['year', 'n_legs', 'animals']) >>> table.drop_null() pyarrow.Table - year: double + year: int64 n_legs: int64 animals: string ---- @@ -1909,10 +1911,9 @@ cdef class _Tabular(_PandasConvertible): Table (works similarly for RecordBatch) >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100], - ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) - >>> table = pa.Table.from_pandas(df) + >>> table = pa.Table.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... names=['n_legs', 'animals']) >>> table.field(0) pyarrow.Field >>> table.field(1) @@ -2064,10 +2065,9 @@ cdef class _Tabular(_PandasConvertible): Table (works similarly for RecordBatch) >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'n_legs': [None, 4, 5, None], - ... 'animals': ["Flamingo", "Horse", None, "Centipede"]}) - >>> table = pa.Table.from_pandas(df) + >>> table = pa.Table.from_arrays( + ... [[None, 4, 5, None], ["Flamingo", "Horse", None, "Centipede"]], + ... names=['n_legs', 'animals']) >>> for i in table.itercolumns(): ... print(i.null_count) ... @@ -2133,13 +2133,12 @@ cdef class _Tabular(_PandasConvertible): -------- Table (works similarly for RecordBatch) - >>> import pandas as pd >>> import pyarrow as pa - >>> df = pd.DataFrame({'year': [2020, 2022, 2021, 2022, 2019, 2021], - ... 'n_legs': [2, 2, 4, 4, 5, 100], - ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", - ... "Brittle stars", "Centipede"]}) - >>> table = pa.Table.from_pandas(df) + >>> table = pa.Table.from_arrays( + ... [[2020, 2022, 2021, 2022, 2019, 2021], + ... [2, 2, 4, 4, 5, 100], + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]], + ... 
names=['year', 'n_legs', 'animal']) >>> table.sort_by('animal') pyarrow.Table year: int64 @@ -2181,11 +2180,10 @@ cdef class _Tabular(_PandasConvertible): Table (works similarly for RecordBatch) >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'year': [2020, 2022, 2019, 2021], - ... 'n_legs': [2, 4, 5, 100], - ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) - >>> table = pa.Table.from_pandas(df) + >>> table = pa.Table.from_arrays( + ... [[2020, 2022, 2019, 2021], [2, 4, 5, 100], + ... ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... names=['year', 'n_legs', 'animals']) >>> table.take([1,3]) pyarrow.Table year: int64 @@ -2473,10 +2471,9 @@ cdef class _Tabular(_PandasConvertible): Examples -------- >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100], - ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) - >>> table = pa.Table.from_pandas(df) + >>> table = pa.Table.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... names=['n_legs', 'animals']) Append column at the end: @@ -2545,7 +2542,7 @@ cdef class RecordBatch(_Tabular): month: int64 day: int64 n_legs: int64 - animals: string + animals: ...string ---- year: [2020,2022,2021,2022] month: [3,5,7,9] @@ -2585,7 +2582,7 @@ cdef class RecordBatch(_Tabular): month: int64 day: int64 n_legs: int64 - animals: string + animals: ...string ---- year: [2020,2022,2021,2022] month: [3,5,7,9] @@ -2858,10 +2855,9 @@ cdef class RecordBatch(_Tabular): Examples -------- >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100], - ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) - >>> batch = pa.RecordBatch.from_pandas(df) + >>> batch = pa.RecordBatch.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... 
names=['n_legs', 'animals']) Add column: @@ -2931,10 +2927,9 @@ cdef class RecordBatch(_Tabular): Examples -------- >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100], - ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) - >>> batch = pa.RecordBatch.from_pandas(df) + >>> batch = pa.RecordBatch.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... names=['n_legs', 'animals']) >>> batch.remove_column(1) pyarrow.RecordBatch n_legs: int64 @@ -2970,10 +2965,9 @@ cdef class RecordBatch(_Tabular): Examples -------- >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100], - ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) - >>> batch = pa.RecordBatch.from_pandas(df) + >>> batch = pa.RecordBatch.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... names=['n_legs', 'animals']) Replace a column: @@ -3039,10 +3033,9 @@ cdef class RecordBatch(_Tabular): Examples -------- >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100], - ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) - >>> batch = pa.RecordBatch.from_pandas(df) + >>> batch = pa.RecordBatch.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... names=['n_legs', 'animals']) >>> new_names = ["n", "name"] >>> batch.rename_columns(new_names) pyarrow.RecordBatch @@ -3318,15 +3311,12 @@ cdef class RecordBatch(_Tabular): Examples -------- >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100], - ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) - >>> batch = pa.RecordBatch.from_pandas(df) + >>> batch = pa.RecordBatch.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... 
names=['n_legs', 'animals']) >>> batch.schema n_legs: int64 animals: string - -- schema metadata -- - pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... Define new schema and cast batch values: @@ -3416,7 +3406,7 @@ cdef class RecordBatch(_Tabular): month: int64 day: int64 n_legs: int64 - animals: string + animals: ...string ---- year: [2020,2022,2021,2022] month: [3,5,7,9] @@ -3579,11 +3569,11 @@ cdef class RecordBatch(_Tabular): -------- >>> import pyarrow as pa >>> struct = pa.array([{'n_legs': 2, 'animals': 'Parrot'}, - ... {'year': 2022, 'n_legs': 4}]) + ... {'year': 2022, 'n_legs': 4, 'animals': 'Goat'}]) >>> pa.RecordBatch.from_struct_array(struct).to_pandas() n_legs animals year 0 2 Parrot NaN - 1 4 None 2022.0 + 1 4 Goat 2022.0 """ cdef: shared_ptr[CRecordBatch] c_record_batch @@ -4156,7 +4146,7 @@ cdef class Table(_Tabular): pyarrow.Table year: int64 n_legs: int64 - animals: string + animals: ...string ---- year: [[2020,2022,2019,2021]] n_legs: [[2,4,5,100]] @@ -4282,11 +4272,10 @@ cdef class Table(_Tabular): Examples -------- >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'year': [2020, 2022, 2019, 2021], - ... 'n_legs': [2, 4, 5, 100], - ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) - >>> table = pa.Table.from_pandas(df) + >>> table = pa.Table.from_arrays( + ... [[2020, 2022, 2019, 2021], [2, 4, 5, 100], + ... ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... names=['year', 'n_legs', 'animals']) >>> table.slice(length=3) pyarrow.Table year: int64 @@ -4347,11 +4336,10 @@ cdef class Table(_Tabular): Examples -------- >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'year': [2020, 2022, 2019, 2021], - ... 'n_legs': [2, 4, 5, 100], - ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) - >>> table = pa.Table.from_pandas(df) + >>> table = pa.Table.from_arrays( + ... [[2020, 2022, 2019, 2021], [2, 4, 5, 100], + ... 
["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... names=['year', 'n_legs', 'animals']) >>> table.select([0,1]) pyarrow.Table year: int64 @@ -4687,15 +4675,12 @@ cdef class Table(_Tabular): Examples -------- >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100], - ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) - >>> table = pa.Table.from_pandas(df) + >>> table = pa.Table.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... names=['n_legs', 'animals']) >>> table.schema n_legs: int64 animals: string - -- schema metadata -- - pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... Define new schema and cast table values: @@ -4787,7 +4772,7 @@ cdef class Table(_Tabular): >>> pa.Table.from_pandas(df) pyarrow.Table n_legs: int64 - animals: string + animals: ...string ---- n_legs: [[2,4,5,100]] animals: [["Flamingo","Horse","Brittle stars","Centipede"]] @@ -4934,11 +4919,11 @@ cdef class Table(_Tabular): -------- >>> import pyarrow as pa >>> struct = pa.array([{'n_legs': 2, 'animals': 'Parrot'}, - ... {'year': 2022, 'n_legs': 4}]) + ... {'year': 2022, 'n_legs': 4, 'animals': 'Goat'}]) >>> pa.Table.from_struct_array(struct).to_pandas() n_legs animals year 0 2 Parrot NaN - 1 4 None 2022.0 + 1 4 Goat 2022.0 """ if isinstance(struct_array, Array): return Table.from_batches([RecordBatch.from_struct_array(struct_array)]) @@ -5132,10 +5117,9 @@ cdef class Table(_Tabular): Examples -------- >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100], - ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) - >>> table = pa.Table.from_pandas(df) + >>> table = pa.Table.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... 
names=['n_legs', 'animals']) Convert a Table to a RecordBatchReader: @@ -5146,8 +5130,6 @@ cdef class Table(_Tabular): >>> reader.schema n_legs: int64 animals: string - -- schema metadata -- - pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... >>> reader.read_all() pyarrow.Table n_legs: int64 @@ -5193,15 +5175,12 @@ cdef class Table(_Tabular): Examples -------- >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100], - ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) - >>> table = pa.Table.from_pandas(df) + >>> table = pa.Table.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... names=['n_legs', 'animals']) >>> table.schema n_legs: int64 animals: string - -- schema metadata -- - pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' ... """ return pyarrow_wrap_schema(self.table.schema()) @@ -5288,10 +5267,9 @@ cdef class Table(_Tabular): Examples -------- >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'n_legs': [None, 4, 5, None], - ... 'animals': ["Flamingo", "Horse", None, "Centipede"]}) - >>> table = pa.Table.from_pandas(df) + >>> table = pa.Table.from_arrays( + ... [[None, 4, 5, None], ["Flamingo", "Horse", None, "Centipede"]], + ... names=['n_legs', 'animals']) >>> table.nbytes 72 """ @@ -5318,10 +5296,9 @@ cdef class Table(_Tabular): Examples -------- >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'n_legs': [None, 4, 5, None], - ... 'animals': ["Flamingo", "Horse", None, "Centipede"]}) - >>> table = pa.Table.from_pandas(df) + >>> table = pa.Table.from_arrays( + ... [[None, 4, 5, None], ["Flamingo", "Horse", None, "Centipede"]], + ... 
names=['n_legs', 'animals']) >>> table.get_total_buffer_size() 76 """ @@ -5360,10 +5337,9 @@ cdef class Table(_Tabular): Examples -------- >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100], - ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) - >>> table = pa.Table.from_pandas(df) + >>> table = pa.Table.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... names=['n_legs', 'animals']) Add column: @@ -5426,10 +5402,9 @@ cdef class Table(_Tabular): Examples -------- >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100], - ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) - >>> table = pa.Table.from_pandas(df) + >>> table = pa.Table.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... names=['n_legs', 'animals']) >>> table.remove_column(1) pyarrow.Table n_legs: int64 @@ -5465,10 +5440,9 @@ cdef class Table(_Tabular): Examples -------- >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100], - ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) - >>> table = pa.Table.from_pandas(df) + >>> table = pa.Table.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... names=['n_legs', 'animals']) Replace a column: @@ -5527,10 +5501,9 @@ cdef class Table(_Tabular): Examples -------- >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'n_legs': [2, 4, 5, 100], - ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]}) - >>> table = pa.Table.from_pandas(df) + >>> table = pa.Table.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... 
names=['n_legs', 'animals']) >>> new_names = ["n", "name"] >>> table.rename_columns(new_names) pyarrow.Table @@ -5619,13 +5592,12 @@ cdef class Table(_Tabular): Examples -------- - >>> import pandas as pd >>> import pyarrow as pa - >>> df = pd.DataFrame({'year': [2020, 2022, 2021, 2022, 2019, 2021], - ... 'n_legs': [2, 2, 4, 4, 5, 100], - ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", - ... "Brittle stars", "Centipede"]}) - >>> table = pa.Table.from_pandas(df) + >>> table = pa.Table.from_arrays( + ... [[2020, 2022, 2021, 2022, 2019, 2021], + ... [2, 2, 4, 4, 5, 100], + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]], + ... names=['year', 'n_legs', 'animal']) >>> table.group_by('year').aggregate([('n_legs', 'sum')]) pyarrow.Table year: int64 @@ -5682,16 +5654,14 @@ cdef class Table(_Tabular): Examples -------- - >>> import pandas as pd >>> import pyarrow as pa >>> import pyarrow.compute as pc - >>> df1 = pd.DataFrame({'id': [1, 2, 3], - ... 'year': [2020, 2022, 2019]}) - >>> df2 = pd.DataFrame({'id': [3, 4], - ... 'n_legs': [5, 100], - ... 'animal': ["Brittle stars", "Centipede"]}) - >>> t1 = pa.Table.from_pandas(df1) - >>> t2 = pa.Table.from_pandas(df2) + >>> t1 = pa.Table.from_arrays( + ... [[1, 2, 3], [2020, 2022, 2019]], + ... names=['id', 'year']) + >>> t2 = pa.Table.from_arrays( + ... [[3, 4], [5, 100], ["Brittle stars", "Centipede"]], + ... 
names=['id', 'n_legs', 'animal']) Left outer join: @@ -6003,7 +5973,7 @@ def record_batch(data, names=None, schema=None, metadata=None): month: int64 day: int64 n_legs: int64 - animals: string + animals: ...string ---- year: [2020,2022,2021,2022] month: [3,5,7,9] @@ -6164,7 +6134,7 @@ def table(data, names=None, schema=None, metadata=None, nthreads=None): pyarrow.Table year: int64 n_legs: int64 - animals: string + animals: ...string ---- year: [[2020,2022,2019,2021]] n_legs: [[2,4,5,100]] @@ -6316,8 +6286,8 @@ def concat_tables(tables, MemoryPool memory_pool=None, str promote_options="none "default" if promote_options == "none" else promote_options ) + options.unify_schemas = promote_options != "none" with nogil: - options.unify_schemas = promote_options != "none" c_result_table = GetResultValue( ConcatenateTables(c_tables, options, pool)) diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi index 73715c060981..4edbb41339a6 100644 --- a/python/pyarrow/tensor.pxi +++ b/python/pyarrow/tensor.pxi @@ -345,7 +345,22 @@ ctypedef CSparseCOOIndex* _CSparseCOOIndexPtr cdef class SparseCOOTensor(_Weakrefable): """ - A sparse COO tensor. + A sparse COO (COOrdinate) tensor. + + COO format stores a sparse tensor as a collection of (indices, values) + pairs. The indices specify the coordinates of non-zero elements, and + the values contain the actual data at those coordinates. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> dense_tensor = np.array([[0, 1, 0], [2, 0, 3]], dtype=np.float32) + >>> sparse_coo = pa.SparseCOOTensor.from_dense_numpy(dense_tensor) + >>> sparse_coo + + type: float + shape: (2, 3) """ def __init__(self): @@ -359,7 +374,7 @@ cdef class SparseCOOTensor(_Weakrefable): self.type = pyarrow_wrap_data_type(self.stp.type()) def __repr__(self): - return """ + return f""" type: {self.type} shape: {self.shape}""" @@ -650,7 +665,23 @@ shape: {self.shape}""" cdef class SparseCSRMatrix(_Weakrefable): """ - A sparse CSR matrix. + A sparse CSR (Compressed Sparse Row) matrix. + + CSR format stores a sparse matrix by compressing the row information. + It uses three arrays: data (non-zero values), indices (column indices), + and indptr (row pointers that indicate where each row starts in the + data array). + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> dense_matrix = np.array([[1, 0, 2], [0, 0, 3]], dtype=np.float64) + >>> sparse_csr = pa.SparseCSRMatrix.from_dense_numpy(dense_matrix) + >>> sparse_csr + + type: double + shape: (2, 3) """ def __init__(self): @@ -891,7 +922,23 @@ shape: {self.shape}""" cdef class SparseCSCMatrix(_Weakrefable): """ - A sparse CSC matrix. + A sparse CSC (Compressed Sparse Column) matrix. + + CSC format stores a sparse matrix by compressing the column information. + It uses three arrays: data (non-zero values), indices (row indices), + and indptr (column pointers that indicate where each column starts + in the data array). CSC is the transpose of CSR format. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> dense_matrix = np.array([[1, 0, 2], [0, 0, 3]], dtype=np.float64) + >>> sparse_csc = pa.SparseCSCMatrix.from_dense_numpy(dense_matrix) + >>> sparse_csc + + type: double + shape: (2, 3) """ def __init__(self): @@ -1142,6 +1189,20 @@ cdef class SparseCSFTensor(_Weakrefable): of prefix trees. 
Each path from a root to leaf forms one tensor non-zero index. CSF is implemented with two arrays of buffers and one arrays of integers. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> # Create a 3D sparse tensor + >>> dense_tensor = np.zeros((2, 3, 2), dtype=np.float32) + >>> dense_tensor[0, 1, 0] = 1.0 + >>> dense_tensor[1, 2, 1] = 2.0 + >>> sparse_csf = pa.SparseCSFTensor.from_dense_numpy(dense_tensor) + >>> sparse_csf + + type: float + shape: (2, 3, 2) """ def __init__(self): diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 94868741f39a..345aee3c4ef4 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -612,6 +612,14 @@ def test_compression_level(): compression_level=level) +def test_lz4_raw_compression_alias(): + # GH-41863: lz4_raw should be accepted as a compression name alias + arr = pa.array(list(map(int, range(1000)))) + table = pa.Table.from_arrays([arr, arr], names=['a', 'b']) + _check_roundtrip(table, expected=table, compression="lz4_raw") + _check_roundtrip(table, expected=table, compression="LZ4_RAW") + + def test_sanitized_spark_field_names(): a0 = pa.array([0, 1, 2, 3, 4]) name = 'prohib; ,\t{}' diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index d8a1c4d093eb..2ef14ff39be2 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -3930,7 +3930,8 @@ def test_list_slice_output_fixed(start, stop, step, expected, value_type, (0, 1,), (0, 2,), (1, 2,), - (2, 4,) + (2, 4,), + (0, 0,) )) @pytest.mark.parametrize("step", (1, 2)) @pytest.mark.parametrize("value_type", (pa.string, pa.int16, pa.float64)) @@ -3978,18 +3979,17 @@ def test_list_slice_field_names_retained(return_fixed_size, type): def test_list_slice_bad_parameters(): arr = pa.array([[1]], pa.list_(pa.int8(), 1)) - msg = r"`start`(.*) should be greater than 0 and smaller 
than `stop`(.*)" + msg = ( + r"`start`(.*) should be greater than or equal to 0 " + r"and not greater than `stop`(.*)" + ) with pytest.raises(pa.ArrowInvalid, match=msg): pc.list_slice(arr, -1, 1) # negative start? with pytest.raises(pa.ArrowInvalid, match=msg): pc.list_slice(arr, 2, 1) # start > stop? - # TODO(ARROW-18281): start==stop -> empty lists - with pytest.raises(pa.ArrowInvalid, match=msg): - pc.list_slice(arr, 0, 0) # start == stop? - # Step not >= 1 - msg = "`step` must be >= 1, got: " + msg = "`step` must be greater than or equal to 1, got: " with pytest.raises(pa.ArrowInvalid, match=msg + "0"): pc.list_slice(arr, 0, 1, step=0) with pytest.raises(pa.ArrowInvalid, match=msg + "-1"): diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index f510c6dbe23d..dce605c7156d 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -2065,3 +2065,37 @@ def readinto(self, *args): for i in range(20): with pytest.raises(pa.ArrowInvalid): read_csv(MyBytesIO(data)) + + +@pytest.mark.parametrize("tables,expected", [ + # GH-36889: Empty batch at the beginning + ( + lambda: [pa.table({"col1": []}).cast(pa.schema([("col1", pa.string())])), + pa.table({"col1": ["a"]}), + pa.table({"col1": ["b"]})], + b'"col1"\n"a"\n"b"\n' + ), + # GH-36889: Empty batch in the middle + ( + lambda: [pa.table({"col1": ["a"]}), + pa.table({"col1": []}).cast(pa.schema([("col1", pa.string())])), + pa.table({"col1": ["b"]})], + b'"col1"\n"a"\n"b"\n' + ), + # GH-36889: Empty batch at the end + ( + lambda: [pa.table({"col1": ["a"]}), + pa.table({"col1": ["b"]}), + pa.table({"col1": []}).cast(pa.schema([("col1", pa.string())]))], + b'"col1"\n"a"\n"b"\n' + ), +]) +def test_write_csv_empty_batch_should_not_pollute_output(tables, expected): + combined = pa.concat_tables(tables()) + + buf = io.BytesIO() + write_csv(combined, buf) + buf.seek(0) + result = buf.read() + + assert result == expected diff --git a/python/pyarrow/tests/test_misc.py 
b/python/pyarrow/tests/test_misc.py index 64f45d8bed85..d6a2fe6a2765 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -80,8 +80,7 @@ def run_with_env_var(env_var): for v in ('-1', 'z'): out, err = run_with_env_var(v) assert out.strip() == '8' # default value - assert ("ARROW_IO_THREADS does not contain a valid number of threads" - in err.strip()) + assert "Invalid value for ARROW_IO_THREADS" in err.strip() def test_build_info(): diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 792c0840f813..e84f1b073f6c 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -3111,7 +3111,7 @@ cdef class Schema(_Weakrefable): @classmethod def from_pandas(cls, df, preserve_index=None): """ - Returns implied schema from dataframe + Returns implied schema from DataFrame Parameters ---------- @@ -3136,11 +3136,11 @@ cdef class Schema(_Weakrefable): ... 'str': ['a', 'b'] ... }) - Create an Arrow Schema from the schema of a pandas dataframe: + Create an Arrow Schema from the schema of a pandas DataFrame: >>> pa.Schema.from_pandas(df) int: int64 - str: string + str: ...string -- schema metadata -- pandas: '{"index_columns": [{"kind": "range", "name": null, ... """ diff --git a/python/pyarrow/util.py b/python/pyarrow/util.py index 5878d1f90262..a95826e1c005 100644 --- a/python/pyarrow/util.py +++ b/python/pyarrow/util.py @@ -231,8 +231,9 @@ def _break_traceback_cycle_from_frame(frame): def _download_urllib(url, out_path): - from urllib.request import urlopen - with urlopen(url) as response: + from urllib.request import urlopen, Request + req = Request(url, headers={'User-Agent': 'pyarrow'}) + with urlopen(req) as response: with open(out_path, 'wb') as f: f.write(response.read()) @@ -264,11 +265,13 @@ def download_tzdata_on_windows(): # Try to download the files with requests and then fall back to urllib. 
This # works around possible issues in certain older environment (GH-45295) try: - _download_requests(tzdata_url, tzdata_compressed_path) - _download_requests(windows_zones_url, windows_zones_path) + import requests # noqa: F401 + download_fn = _download_requests except ImportError: - _download_urllib(tzdata_url, tzdata_compressed_path) - _download_urllib(windows_zones_url, windows_zones_path) + download_fn = _download_urllib + + download_fn(tzdata_url, tzdata_compressed_path) + download_fn(windows_zones_url, windows_zones_path) assert os.path.exists(tzdata_compressed_path) assert os.path.exists(windows_zones_path) diff --git a/python/pyproject.toml b/python/pyproject.toml index 899144d418de..217dba81b873 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -18,6 +18,8 @@ [build-system] requires = [ "cython >= 3.1", + # Needed for build-time stub docstring extraction + "libcst>=1.8.6", "numpy>=1.25", # configuring setuptools_scm in pyproject.toml requires # versions released after 2022 @@ -88,7 +90,7 @@ include = ["pyarrow"] namespaces = false [tool.setuptools.package-data] -pyarrow = ["*.pxd", "*.pyx", "includes/*.pxd"] +pyarrow = ["*.pxd", "*.pyi", "*.pyx", "includes/*.pxd", "py.typed"] [tool.setuptools_scm] root = '..' 
@@ -96,3 +98,39 @@ version_file = 'pyarrow/_generated_version.py' version_scheme = 'guess-next-dev' git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"' fallback_version = '24.0.0a0' + +# TODO: Enable type checking once stubs are merged +[tool.mypy] +files = ["pyarrow-stubs"] +mypy_path = "$MYPY_CONFIG_FILE_DIR/pyarrow-stubs" +exclude = [ + "^pyarrow/", + "^benchmarks/", + "^examples/", + "^scripts/", +] + +# TODO: Enable type checking once stubs are merged +[tool.pyright] +pythonPlatform = "All" +pythonVersion = "3.10" +include = ["pyarrow-stubs"] +exclude = [ + "pyarrow", + "benchmarks", + "examples", + "scripts", + "build", +] +stubPath = "pyarrow-stubs" +typeCheckingMode = "basic" + +# TODO: Enable type checking once stubs are merged +[tool.ty.src] +include = ["pyarrow-stubs"] +exclude = [ + "pyarrow", + "benchmarks", + "examples", + "scripts", +] diff --git a/python/requirements-build.txt b/python/requirements-build.txt index 9e03e04aded7..c3b7aa48eb67 100644 --- a/python/requirements-build.txt +++ b/python/requirements-build.txt @@ -1,4 +1,5 @@ cython>=3.1 +libcst>=1.8.6 numpy>=1.25 setuptools_scm>=8 setuptools>=77 diff --git a/python/requirements-test.txt b/python/requirements-test.txt index 4339aeb9c161..988d7a3ae7a9 100644 --- a/python/requirements-test.txt +++ b/python/requirements-test.txt @@ -3,5 +3,6 @@ hypothesis packaging pandas pytest +pytest-xdist pytz pyuwsgi; sys.platform != 'win32' and python_version < '3.13' diff --git a/python/requirements-wheel-build.txt b/python/requirements-wheel-build.txt index ac6388762b4c..6a2c62212437 100644 --- a/python/requirements-wheel-build.txt +++ b/python/requirements-wheel-build.txt @@ -1,4 +1,7 @@ +build cython>=3.1 +# Needed for build-time stub docstring extraction +libcst>=1.8.6 numpy>=2.0.0 setuptools_scm setuptools>=77 diff --git a/python/scripts/run_emscripten_tests.py b/python/scripts/run_emscripten_tests.py index 53d3dd52bd8a..406dfc54e4fc 100644 --- 
a/python/scripts/run_emscripten_tests.py +++ b/python/scripts/run_emscripten_tests.py @@ -45,7 +45,7 @@ def do_GET(self) -> bytes | None: self.end_headers() with PYARROW_WHEEL_PATH.open(mode="rb") as wheel: self.copyfile(wheel, self.wfile) - if self.path.endswith("/test.html"): + elif self.path.endswith("/test.html"): body = b""" diff --git a/python/scripts/update_stub_docstrings.py b/python/scripts/update_stub_docstrings.py new file mode 100644 index 000000000000..5fd24014a024 --- /dev/null +++ b/python/scripts/update_stub_docstrings.py @@ -0,0 +1,228 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Extract docstrings from pyarrow runtime and insert them into stub files. 
+ +Usage (from python/ directory with pyarrow built): + python scripts/update_stub_docstrings.py pyarrow-stubs +""" + +import argparse +import importlib +import inspect +import sys +from pathlib import Path +from textwrap import indent + +import libcst +from libcst import matchers as m + + +def _resolve_object(module, path): + """Resolve an object by dotted path from a module.""" + if not path: + return module, None, module.__name__ + + parts = path.split(".") + parent = None + obj = module + + for part in parts: + parent = obj + try: + obj = getattr(obj, part) + except AttributeError: + try: + obj = vars(parent).get(part) + if obj is not None: + continue + except TypeError: + pass + return None, None, None + + return obj, parent, getattr(obj, "__name__", parts[-1]) + + +def _get_docstring(name, module, indentation): + """Extract and format a docstring for insertion into a stub file.""" + obj, parent, obj_name = _resolve_object(module, name) + if obj is None: + print(f"{name} not found in {module.__name__}") + return None + + docstring = inspect.getdoc(obj) + if not docstring: + return None + + # Remove signature prefix + parent_name = getattr(parent, "__name__", None) if parent else None + if docstring.startswith(obj_name) or ( + parent_name and docstring.startswith(f"{parent_name}.{obj_name}") + ): + docstring = "\n".join(docstring.splitlines()[2:]) + + # Skip empty docstrings + if not docstring.strip(): + return None + + prefix = " " * indentation + return '"""\n' + indent(docstring + '\n"""', prefix) + + +class DocstringInserter(libcst.CSTTransformer): + """CST transformer that inserts docstrings into stub file nodes.""" + + def __init__(self, module, namespace): + self.module = module + self.base_namespace = namespace + self.stack = [] + self.indentation = 0 + + def _full_name(self): + name = ".".join(self.stack) + return f"{self.base_namespace}.{name}" if self.base_namespace else name + + def leave_Module(self, original_node, updated_node): + new_body = [] + 
clone_matcher = m.SimpleStatementLine( + body=[m.Assign(value=m.Call(func=m.Name(value="_clone_signature"))), + m.ZeroOrMore()] + ) + for stmt in updated_node.body: + new_body.append(stmt) + if m.matches(stmt, clone_matcher): + name = stmt.body[0].targets[0].target.value + if self.base_namespace: + name = f"{self.base_namespace}.{name}" + docstring = _get_docstring(name, self.module, 0) + if docstring: + new_body.append(libcst.SimpleStatementLine( + body=[libcst.Expr(value=libcst.SimpleString(docstring))])) + return updated_node.with_changes(body=new_body) + + def visit_ClassDef(self, node): + self.stack.append(node.name.value) + self.indentation += 1 + + def leave_ClassDef(self, original_node, updated_node): + name = self._full_name() + docstring = _get_docstring(name, self.module, self.indentation) + + if docstring: + ellipsis_class = m.ClassDef(body=m.IndentedBlock(body=[ + m.SimpleStatementLine(body=[ + m.Expr(m.Ellipsis()), m.ZeroOrMore()]), m.ZeroOrMore()])) + func_class = m.ClassDef(body=m.IndentedBlock( + body=[m.FunctionDef(), m.ZeroOrMore()])) + + if m.matches(updated_node, ellipsis_class): + updated_node = updated_node.deep_replace( + updated_node.body.body[0].body[0].value, + libcst.SimpleString(value=docstring)) + elif m.matches(updated_node, func_class): + docstring_stmt = libcst.SimpleStatementLine( + body=[libcst.Expr(value=libcst.SimpleString(value=docstring))]) + updated_node = updated_node.with_changes( + body=updated_node.body.with_changes( + body=[docstring_stmt] + list(updated_node.body.body))) + + self.stack.pop() + self.indentation -= 1 + return updated_node + + def visit_FunctionDef(self, node): + self.stack.append(node.name.value) + self.indentation += 1 + + def leave_FunctionDef(self, original_node, updated_node): + name = self._full_name() + ellipsis_func = m.FunctionDef( + body=m.SimpleStatementSuite(body=[m.Expr(m.Ellipsis())])) + + if m.matches(original_node, ellipsis_func): + docstring = _get_docstring(name, self.module, 
self.indentation) + if docstring: + docstring_stmt = libcst.SimpleStatementLine( + body=[libcst.Expr(value=libcst.SimpleString(value=docstring))]) + updated_node = updated_node.with_changes( + body=libcst.IndentedBlock(body=[docstring_stmt])) + + self.stack.pop() + self.indentation -= 1 + return updated_node + + +LIB_MODULES = {"array", "builder", "compat", "config", "device", "error", "io", + "_ipc", "memory", "pandas_shim", "scalar", "table", "tensor", "_types"} + + +def add_docstrings_to_stubs(stubs_dir): + """Update all stub files in stubs_dir with docstrings from pyarrow runtime.""" + stubs_dir = Path(stubs_dir) + print(f"Updating stub docstrings in: {stubs_dir}") + + pyarrow = importlib.import_module("pyarrow") + + for stub_file in stubs_dir.rglob('*.pyi'): + if stub_file.name == "_stubs_typing.pyi": + continue + + module_name = stub_file.stem + if module_name in LIB_MODULES: + namespace = "lib" + elif stub_file.parent.name in ("parquet", "interchange"): + namespace = f"{stub_file.parent.name}.{module_name}" + elif module_name == "__init__": + namespace = "" + else: + namespace = module_name + + print(f" {stub_file.name} -> {namespace or '(root)'}") + tree = libcst.parse_module(stub_file.read_text()) + modified = tree.visit(DocstringInserter(pyarrow, namespace)) + stub_file.write_text(modified.code) + + +def add_docstrings_from_build(stubs_dir, build_lib): + """ + Entry point for setup.py: update docstrings using pyarrow from build directory. + + During the build process, pyarrow is not installed in the system Python. + We need to temporarily add the build directory to sys.path so we can + import pyarrow and extract docstrings from it. 
+ """ + stubs_dir, build_lib = Path(stubs_dir), Path(build_lib) + + sys.path.insert(0, str(build_lib)) + try: + add_docstrings_to_stubs(stubs_dir) + finally: + sys.path.pop(0) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("stubs_dir", type=Path, help="Path to pyarrow-stubs folder") + args = parser.parse_args() + + # Add the directory containing this script's parent (python/) to sys.path + # so pyarrow can be imported when running from the python/ directory + script_dir = Path(__file__).resolve().parent + python_dir = script_dir.parent + sys.path.insert(0, str(python_dir)) + add_docstrings_to_stubs(args.stubs_dir.resolve()) diff --git a/python/setup.py b/python/setup.py index a27bd3baefd0..4f2bf7585e13 100755 --- a/python/setup.py +++ b/python/setup.py @@ -121,8 +121,51 @@ def build_extensions(self): def run(self): self._run_cmake() + self._update_stubs() _build_ext.run(self) + def _update_stubs(self): + """Copy stubs to build directory, then inject docstrings into the copies.""" + stubs_dir = pjoin(setup_dir, 'pyarrow-stubs') + if not os.path.exists(stubs_dir): + return + + build_cmd = self.get_finalized_command('build') + build_lib = os.path.abspath(build_cmd.build_lib) + + # Copy clean stubs to build directory first + self._copy_stubs(stubs_dir, build_lib) + + # Inject docstrings into the build copies (not the source stubs). + # We pass build_lib as stubs_dir since it mirrors the pyarrow-stubs/ + # directory structure (both contain a pyarrow/ subdirectory with .pyi + # files), so the namespace resolution logic works identically. 
+ import importlib.util + spec = importlib.util.spec_from_file_location( + "update_stub_docstrings", + pjoin(setup_dir, 'scripts', 'update_stub_docstrings.py')) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + mod.add_docstrings_from_build(build_lib, build_lib) + + def _copy_stubs(self, stubs_dir, build_lib): + """Copy .pyi stub files to the build directory.""" + src_dir = pjoin(stubs_dir, 'pyarrow') + dest_dir = pjoin(build_lib, 'pyarrow') + + if not os.path.exists(src_dir): + return + + print(f"-- Copying stubs: {src_dir} -> {dest_dir}") + for root, dirs, files in os.walk(src_dir): + for fname in files: + if fname.endswith('.pyi'): + src = pjoin(root, fname) + rel_path = os.path.relpath(src, src_dir) + dest = pjoin(dest_dir, rel_path) + os.makedirs(os.path.dirname(dest), exist_ok=True) + shutil.copy2(src, dest) + # adapted from cmake_build_ext in dynd-python # github.com/libdynd/dynd-python diff --git a/r/DESCRIPTION b/r/DESCRIPTION index ee9e152a8c8a..d5c78fdaebee 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -1,6 +1,6 @@ Package: arrow Title: Integration to 'Apache' 'Arrow' -Version: 23.0.0.9000 +Version: 23.0.1.9000 Authors@R: c( person("Neal", "Richardson", email = "neal.p.richardson@gmail.com", role = c("aut")), person("Ian", "Cook", email = "ianmcook@gmail.com", role = c("aut")), diff --git a/r/NEWS.md b/r/NEWS.md index 3d2cc393da19..12d4047d8fca 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -17,7 +17,14 @@ under the License. --> -# arrow 23.0.0.9000 +# arrow 23.0.1.9000 + +# arrow 23.0.1 + +## Minor improvements and fixes + +- Fix C++20 compatibility issue on macOS (#49221). +- Turn off GCS support by default on macOS; see `vignette("install", package = "arrow")` for details on enabling it (#49068, #48995). 
# arrow 23.0.0 diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index a1167433c932..5a596dffe3cd 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -38,6 +38,7 @@ supported_dplyr_methods <- list( select = NULL, filter = NULL, + filter_out = NULL, collect = NULL, summarise = c( "window functions not currently supported;", diff --git a/r/R/dplyr-filter.R b/r/R/dplyr-filter.R index 18f5c929affb..26fa1bf7d5f2 100644 --- a/r/R/dplyr-filter.R +++ b/r/R/dplyr-filter.R @@ -17,27 +17,61 @@ # The following S3 methods are registered on load if dplyr is present -filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) { - try_arrow_dplyr({ - # TODO something with the .preserve argument - out <- as_adq(.data) +apply_filter_impl <- function( + .data, + ..., + .by = NULL, + .preserve = FALSE, + negate = FALSE +) { + # TODO something with the .preserve argument + out <- as_adq(.data) - by <- compute_by({{ .by }}, out, by_arg = ".by", data_arg = ".data") + by <- compute_by({{ .by }}, out, by_arg = ".by", data_arg = ".data") - if (by$from_by) { - out$group_by_vars <- by$names - } + if (by$from_by) { + out$group_by_vars <- by$names + } + + expanded_filters <- expand_across(out, quos(...)) + if (length(expanded_filters) == 0) { + # Nothing to do + return(as_adq(.data)) + } + + # tidy-eval the filter expressions inside an Arrow data_mask + mask <- arrow_mask(out) + + if (isTRUE(negate)) { + # filter_out(): combine all predicates with &, then negate + combined <- NULL + + for (expr in expanded_filters) { + filt <- arrow_eval(expr, mask) - expanded_filters <- expand_across(out, quos(...)) - if (length(expanded_filters) == 0) { - # Nothing to do - return(as_adq(.data)) + if (length(mask$.aggregations)) { + # dplyr lets you filter on e.g. x < mean(x), but we haven't implemented it. + # But we could, the same way it works in mutate() via join, if someone asks. + # Until then, just error. 
+ arrow_not_supported( + .actual_msg = "Expression not supported in filter_out() in Arrow", + call = expr + ) + } + + if (is_list_of(filt, "Expression")) { + filt <- Reduce("&", filt) + } + + combined <- if (is.null(combined)) filt else (combined & filt) } - # tidy-eval the filter expressions inside an Arrow data_mask - mask <- arrow_mask(out) + out <- set_filters(out, combined, negate = TRUE) + } else { + # filter(): apply each predicate sequentially for (expr in expanded_filters) { filt <- arrow_eval(expr, mask) + if (length(mask$.aggregations)) { # dplyr lets you filter on e.g. x < mean(x), but we haven't implemented it. # But we could, the same way it works in mutate() via join, if someone asks. @@ -47,19 +81,55 @@ filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) call = expr ) } - out <- set_filters(out, filt) - } - if (by$from_by) { - out$group_by_vars <- character() + out <- set_filters(out, filt, negate = FALSE) } + } + + if (by$from_by) { + out$group_by_vars <- character() + } - out + out +} + +filter.arrow_dplyr_query <- function( + .data, + ..., + .by = NULL, + .preserve = FALSE +) { + try_arrow_dplyr({ + apply_filter_impl( + .data, + ..., + .by = {{ .by }}, + .preserve = .preserve, + negate = FALSE + ) }) } filter.Dataset <- filter.ArrowTabular <- filter.RecordBatchReader <- filter.arrow_dplyr_query -set_filters <- function(.data, expressions) { +filter_out.arrow_dplyr_query <- function( + .data, + ..., + .by = NULL, + .preserve = FALSE +) { + try_arrow_dplyr({ + apply_filter_impl( + .data, + ..., + .by = {{ .by }}, + .preserve = .preserve, + negate = TRUE + ) + }) +} +filter_out.Dataset <- filter_out.ArrowTabular <- filter_out.RecordBatchReader <- filter_out.arrow_dplyr_query + +set_filters <- function(.data, expressions, negate = FALSE) { if (length(expressions)) { if (is_list_of(expressions, "Expression")) { # expressions is a list of Expressions. 
AND them together and set them on .data @@ -67,7 +137,16 @@ set_filters <- function(.data, expressions) { } else if (inherits(expressions, "Expression")) { new_filter <- expressions } else { - stop("filter expressions must be either an expression or a list of expressions", call. = FALSE) + stop( + "filter expressions must be either an expression or a list of expressions", + call. = FALSE + ) + } + + if (isTRUE(negate)) { + # dplyr::filter_out() semantics: drop rows where predicate is TRUE; + # keep rows where predicate is FALSE or NA. + new_filter <- (!new_filter) | is.na(new_filter) } if (isTRUE(.data$filtered_rows)) { diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index bbd1c91a0213..9293d14c94c0 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -19,7 +19,7 @@ #' Functions available in Arrow dplyr queries #' -#' The `arrow` package contains methods for 37 `dplyr` table functions, many of +#' The `arrow` package contains methods for 38 `dplyr` table functions, many of #' which are "verbs" that do transformations to one or more tables. #' The package also has mappings of 224 R functions to the corresponding #' functions in the Arrow compute library. These allow you to write code inside @@ -45,6 +45,7 @@ #' * [`distinct()`][dplyr::distinct()]: `.keep_all = TRUE` returns a non-missing value if present, only returning missing values if all are missing. #' * [`explain()`][dplyr::explain()] #' * [`filter()`][dplyr::filter()] +#' * [`filter_out()`][dplyr::filter_out()] #' * [`full_join()`][dplyr::full_join()]: the `copy` argument is ignored #' * [`glimpse()`][dplyr::glimpse()] #' * [`group_by()`][dplyr::group_by()] diff --git a/r/R/ipc-stream.R b/r/R/ipc-stream.R index 26a61a790f93..8ebb5e36636e 100644 --- a/r/R/ipc-stream.R +++ b/r/R/ipc-stream.R @@ -95,6 +95,10 @@ write_to_raw <- function(x, format = c("stream", "file")) { #' Arrow [Table] otherwise #' @seealso [write_feather()] for writing IPC files. 
[RecordBatchReader] for a #' lower-level interface. +#' @section Untrusted data: +#' If reading from an untrusted source, you can validate the data by reading +#' with `as_data_frame = FALSE` and calling `$ValidateFull()` on the Table +#' before processing. #' @export read_ipc_stream <- function(file, as_data_frame = TRUE, ...) { if (!inherits(file, "InputStream")) { diff --git a/r/README.md b/r/README.md index bb5d137dc886..268ee24bdf00 100644 --- a/r/README.md +++ b/r/README.md @@ -64,7 +64,7 @@ It allows users to read and write data in a variety of formats: It provides access to remote filesystems and servers: -- Read and write files in Amazon S3 and Google Cloud Storage buckets +- Read and write files in Amazon S3 and Google Cloud Storage buckets (note: CRAN builds include S3 support but not GCS which require an alternative installation method; see the [cloud storage article](https://arrow.apache.org/docs/r/articles/fs.html) for details) - Connect to Arrow Flight servers to transport large datasets over networks Additional features include: diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index 49468329923d..39700914db4b 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -150,6 +150,7 @@ articles: - developers/docker - developers/install_details - developers/data_object_layout + - developers/binary_features reference: - title: Read datasets diff --git a/r/man/DictionaryType.Rd b/r/man/DictionaryType.Rd index 8c9087f1ab68..cda27978b1b4 100644 --- a/r/man/DictionaryType.Rd +++ b/r/man/DictionaryType.Rd @@ -3,13 +3,40 @@ \docType{class} \name{DictionaryType} \alias{DictionaryType} -\title{class DictionaryType} +\title{DictionaryType class} \description{ -class DictionaryType +\code{DictionaryType} is a \link{FixedWidthType} that represents dictionary-encoded data. +Dictionary encoding stores unique values in a dictionary and uses integer-type +indices to reference them, which can be more memory-efficient for data with many +repeated values. 
} -\section{Methods}{ +\section{R6 Methods}{ +\itemize{ +\item \verb{$ToString()}: Return a string representation of the dictionary type +\item \verb{$code(namespace = FALSE)}: Return R code to create this dictionary type +} +} + +\section{Active bindings}{ -TODO +\itemize{ +\item \verb{$index_type}: The \link{DataType} for the dictionary indices (must be an integer type, +signed or unsigned) +\item \verb{$value_type}: The \link{DataType} for the dictionary values +\item \verb{$name}: The name of the type. +\item \verb{$ordered}: Whether the dictionary is ordered. +} +} + +\section{Factory}{ + + +\code{DictionaryType$create()} takes the following arguments: +\itemize{ +\item \code{index_type}: A \link{DataType} for the indices (default \code{\link[=int32]{int32()}}) +\item \code{value_type}: A \link{DataType} for the values (default \code{\link[=utf8]{utf8()}}) +\item \code{ordered}: Is this an ordered dictionary (default \code{FALSE})? +} } diff --git a/r/man/FixedWidthType.Rd b/r/man/FixedWidthType.Rd index ac6723d79dbb..71d0ab2d2766 100644 --- a/r/man/FixedWidthType.Rd +++ b/r/man/FixedWidthType.Rd @@ -5,11 +5,22 @@ \alias{FixedWidthType} \title{FixedWidthType class} \description{ -FixedWidthType class +\code{FixedWidthType} is a base class for data types with a fixed width in bits. +This includes all integer types, floating-point types, \code{Boolean}, +\code{FixedSizeBinary}, temporal types (dates, times, timestamps, durations), +and decimal types. } -\section{Methods}{ +\section{R6 Methods}{ -TODO +\code{FixedWidthType} inherits from \link{DataType}, so it has the same methods. 
} +\section{Active bindings}{ + +\itemize{ +\item \verb{$bit_width}: The width of the type in bits +} +} + +\keyword{internal} diff --git a/r/man/Message.Rd b/r/man/Message.Rd index fbad235b64fe..b8be82bfa4bb 100644 --- a/r/man/Message.Rd +++ b/r/man/Message.Rd @@ -5,11 +5,24 @@ \alias{Message} \title{Message class} \description{ -Message class +\code{Message} holds an Arrow IPC message, which includes metadata and +an optional message body. } -\section{Methods}{ +\section{R6 Methods}{ +\itemize{ +\item \verb{$Equals(other)}: Check if this \code{Message} is equal to another \code{Message} +\item \verb{$body_length()}: Return the length of the message body in bytes +\item \verb{$Verify()}: Check if the \code{Message} metadata is valid Flatbuffer format +} +} -TODO +\section{Active bindings}{ + +\itemize{ +\item \verb{$type}: The message type +\item \verb{$metadata}: The message metadata +\item \verb{$body}: The message body as a \link{Buffer} +} } diff --git a/r/man/MessageReader.Rd b/r/man/MessageReader.Rd index 32ca8900b33a..4c3bef3fc9f4 100644 --- a/r/man/MessageReader.Rd +++ b/r/man/MessageReader.Rd @@ -5,11 +5,22 @@ \alias{MessageReader} \title{MessageReader class} \description{ -MessageReader class +\code{MessageReader} reads \code{Message} objects from an input stream. } -\section{Methods}{ +\section{R6 Methods}{ +\itemize{ +\item \verb{$ReadNextMessage()}: Read the next \code{Message} from the stream. Returns \code{NULL} if +there are no more messages. 
+} +} + +\section{Factory}{ -TODO + +\code{MessageReader$create()} takes the following argument: +\itemize{ +\item \code{stream}: An \link{InputStream} or object coercible to one (e.g., a raw vector) +} } diff --git a/r/man/acero.Rd b/r/man/acero.Rd index dcaca04d2f2c..ee156cc9129b 100644 --- a/r/man/acero.Rd +++ b/r/man/acero.Rd @@ -7,7 +7,7 @@ \alias{arrow-dplyr} \title{Functions available in Arrow dplyr queries} \description{ -The \code{arrow} package contains methods for 37 \code{dplyr} table functions, many of +The \code{arrow} package contains methods for 38 \code{dplyr} table functions, many of which are "verbs" that do transformations to one or more tables. The package also has mappings of 224 R functions to the corresponding functions in the Arrow compute library. These allow you to write code inside @@ -32,6 +32,7 @@ Table into an R \code{tibble}. \item \code{\link[dplyr:distinct]{distinct()}}: \code{.keep_all = TRUE} returns a non-missing value if present, only returning missing values if all are missing. \item \code{\link[dplyr:explain]{explain()}} \item \code{\link[dplyr:filter]{filter()}} +\item \code{\link[dplyr:filter]{filter_out()}} \item \code{\link[dplyr:mutate-joins]{full_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:glimpse]{glimpse()}} \item \code{\link[dplyr:group_by]{group_by()}} @@ -198,7 +199,7 @@ Valid values are "s", "ms" (default), "us", "ns". 
\itemize{ \item \code{\link[dplyr:across]{across()}} \item \code{\link[dplyr:between]{between()}} -\item \code{\link[dplyr:case_when]{case_when()}}: \code{.ptype} and \code{.size} arguments not supported +\item \code{\link[dplyr:case-and-replace-when]{case_when()}}: \code{.ptype} and \code{.size} arguments not supported \item \code{\link[dplyr:coalesce]{coalesce()}} \item \code{\link[dplyr:desc]{desc()}} \item \code{\link[dplyr:across]{if_all()}} diff --git a/r/man/read_ipc_stream.Rd b/r/man/read_ipc_stream.Rd index 49d3949bfcf2..601edb2af068 100644 --- a/r/man/read_ipc_stream.Rd +++ b/r/man/read_ipc_stream.Rd @@ -27,6 +27,13 @@ Apache Arrow defines two formats for \href{https://arrow.apache.org/docs/format/ a "stream" format and a "file" format, known as Feather. \code{read_ipc_stream()} and \code{\link[=read_feather]{read_feather()}} read those formats, respectively. } +\section{Untrusted data}{ + +If reading from an untrusted source, you can validate the data by reading +with \code{as_data_frame = FALSE} and calling \verb{$ValidateFull()} on the Table +before processing. +} + \seealso{ \code{\link[=write_feather]{write_feather()}} for writing IPC files. \link{RecordBatchReader} for a lower-level interface. diff --git a/r/pkgdown/assets/versions.html b/r/pkgdown/assets/versions.html index 76c30f8f252a..e9fdd50a3473 100644 --- a/r/pkgdown/assets/versions.html +++ b/r/pkgdown/assets/versions.html @@ -1,7 +1,7 @@ -

23.0.0.9000 (dev)

-

23.0.0 (release)

+

23.0.1.9000 (dev)

+

23.0.1 (release)

22.0.0

21.0.0

20.0.0

diff --git a/r/pkgdown/assets/versions.json b/r/pkgdown/assets/versions.json index 8b2f0471fe59..7d22213ef3b5 100644 --- a/r/pkgdown/assets/versions.json +++ b/r/pkgdown/assets/versions.json @@ -1,10 +1,10 @@ [ { - "name": "23.0.0.9000 (dev)", + "name": "23.0.1.9000 (dev)", "version": "dev/" }, { - "name": "23.0.0 (release)", + "name": "23.0.1 (release)", "version": "" }, { diff --git a/r/src/parquet.cpp b/r/src/parquet.cpp index 3633c51d45d9..efdc584d87bf 100644 --- a/r/src/parquet.cpp +++ b/r/src/parquet.cpp @@ -147,48 +147,36 @@ std::shared_ptr parquet___arrow___FileReader__ReadTable2( // [[parquet::export]] std::shared_ptr parquet___arrow___FileReader__ReadRowGroup1( const std::shared_ptr& reader, int i) { - std::shared_ptr table; - auto result = - RunWithCapturedRIfPossibleVoid([&]() { return reader->ReadRowGroup(i, &table); }); - - StopIfNotOk(result); - return table; + auto result = RunWithCapturedRIfPossible>( + [&]() { return reader->ReadRowGroup(i); }); + return ValueOrStop(result); } // [[parquet::export]] std::shared_ptr parquet___arrow___FileReader__ReadRowGroup2( const std::shared_ptr& reader, int i, const std::vector& column_indices) { - std::shared_ptr table; - auto result = RunWithCapturedRIfPossibleVoid( - [&]() { return reader->ReadRowGroup(i, column_indices, &table); }); - - StopIfNotOk(result); - return table; + auto result = RunWithCapturedRIfPossible>( + [&]() { return reader->ReadRowGroup(i, column_indices); }); + return ValueOrStop(result); } // [[parquet::export]] std::shared_ptr parquet___arrow___FileReader__ReadRowGroups1( const std::shared_ptr& reader, const std::vector& row_groups) { - std::shared_ptr table; - auto result = RunWithCapturedRIfPossibleVoid( - [&]() { return reader->ReadRowGroups(row_groups, &table); }); - - StopIfNotOk(result); - return table; + auto result = RunWithCapturedRIfPossible>( + [&]() { return reader->ReadRowGroups(row_groups); }); + return ValueOrStop(result); } // [[parquet::export]] std::shared_ptr 
parquet___arrow___FileReader__ReadRowGroups2( const std::shared_ptr& reader, const std::vector& row_groups, const std::vector& column_indices) { - std::shared_ptr table; - auto result = RunWithCapturedRIfPossibleVoid( - [&]() { return reader->ReadRowGroups(row_groups, column_indices, &table); }); - - StopIfNotOk(result); - return table; + auto result = RunWithCapturedRIfPossible>( + [&]() { return reader->ReadRowGroups(row_groups, column_indices); }); + return ValueOrStop(result); } // [[parquet::export]] diff --git a/r/tests/testthat/test-dplyr-filter.R b/r/tests/testthat/test-dplyr-filter.R index d56e25fca329..3912e518ed08 100644 --- a/r/tests/testthat/test-dplyr-filter.R +++ b/r/tests/testthat/test-dplyr-filter.R @@ -498,3 +498,51 @@ test_that("filter() with aggregation expressions errors", { "not supported in filter" ) }) + +test_that("filter_out() basic", { + compare_dplyr_binding( + .input |> + filter_out(chr == "b") |> + select(chr, int, lgl) |> + collect(), + tbl + ) +}) + +test_that("filter_out() keeps NA values in predicate result", { + compare_dplyr_binding( + .input |> + filter_out(lgl) |> + select(chr, int, lgl) |> + collect(), + tbl + ) +}) + +test_that("filter_out() with multiple conditions", { + compare_dplyr_binding( + .input |> + filter_out(dbl > 2, chr %in% c("d", "f")) |> + collect(), + tbl + ) +}) + +test_that("More complex select/filter_out", { + compare_dplyr_binding( + .input |> + filter_out(dbl > 2, chr == "d" | chr == "f") |> + select(chr, int, lgl) |> + filter(int < 5) |> + select(int, chr) |> + collect(), + tbl + ) + + compare_dplyr_binding( + .input |> + filter_out(!is.na(int)) |> + collect(), + tbl + ) +}) diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index f4ccb4956a88..d50191ac18a1 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -255,13 +255,21 @@ check_allowlist <- function( any(grepl(paste(allowlist, collapse = "|"), os)) } +normalise_arch <- function(arch) { + if (arch %in% c("aarch64", "arm64")) { + return("arm64") 
+ } + arch +} + select_binary <- function( os = tolower(Sys.info()[["sysname"]]), arch = tolower(Sys.info()[["machine"]]), test_program = test_for_curl_and_openssl ) { - if (identical(os, "darwin") || (identical(os, "linux") && identical(arch, "x86_64"))) { - # We only host x86 linux binaries and x86 & arm64 macos today + arch <- normalise_arch(arch) + + if (identical(os, "darwin") || identical(os, "linux")) { binary <- tryCatch( # Somehow the test program system2 call errors on the sanitizer builds # so globally handle the possibility that this could fail @@ -597,7 +605,7 @@ build_libarrow <- function(src_dir, dst_dir) { env_var_list <- c( env_var_list, ARROW_S3 = Sys.getenv("ARROW_S3", "ON"), - ARROW_GCS = Sys.getenv("ARROW_GCS", "ON"), + # ARROW_GCS = Sys.getenv("ARROW_GCS", "ON"), ARROW_WITH_ZSTD = Sys.getenv("ARROW_WITH_ZSTD", "ON") ) } diff --git a/r/tools/test-nixlibs.R b/r/tools/test-nixlibs.R index b1d6214fd879..f7711c97ce47 100644 --- a/r/tools/test-nixlibs.R +++ b/r/tools/test-nixlibs.R @@ -52,8 +52,8 @@ test_that("identify_binary() based on LIBARROW_BINARY", { test_that("select_binary() based on system", { expect_output( - expect_null(select_binary("linux", arch = "aarch64")), # Not built today - "Building on linux aarch64" + expect_null(select_binary("freebsd", arch = "x86_64")), + "Building on freebsd x86_64" ) }) diff --git a/r/vignettes/developers/binary_features.Rmd b/r/vignettes/developers/binary_features.Rmd new file mode 100644 index 000000000000..ed6c7180f5b1 --- /dev/null +++ b/r/vignettes/developers/binary_features.Rmd @@ -0,0 +1,193 @@ +--- +title: "Libarrow binary features" +description: > + Understanding which C++ features are enabled in Arrow R package builds +output: rmarkdown::html_vignette +--- + +This document explains which C++ features are enabled in different Arrow R +package build configurations, and documents the decisions behind our default +feature set. 
This is intended as internal developer documentation for understanding +which features are enabled in which builds. It is not intended to be a guide for +installing the Arrow R package; for that, see the +[installation guide](../../install.html). + +## Overview + +When the Arrow R package is installed, it needs a copy of the Arrow C++ library +(libarrow). This can come from: + +1. **Prebuilt binaries** we host (for releases and nightlies) +2. **Source builds** when binaries aren't available or users opt out + +The features available in libarrow depend on how it was built. This document +covers the feature configuration for both scenarios. + +## Prebuilt libarrow binary configuration + +We produce prebuilt libarrow binaries for macOS, Windows, and Linux. These +binaries include **more features** than the default source build to provide +users with a fully-featured experience out of the box. + +### Current binary feature set + +| Platform | S3 | GCS | Configured in | +|----------|----|----|---------------| +| macOS (ARM64, x86_64) | ON | ON | `dev/tasks/r/github.packages.yml` | +| Windows | ON | ON | `ci/scripts/PKGBUILD` | +| Linux (x86_64) | ON | ON | `compose.yaml` (`ubuntu-cpp-static`) | + +### Exceptions to our build defaults + +Even though GCS defaults to OFF for source builds, we explicitly enable it in +our prebuilt binaries because: + +1. **Binary users expect features to "just work"** - they shouldn't need to + rebuild from source to access cloud storage +2. **Build time is not a concern** - we build binaries once in CI, not on + user machines +3. **Parity across platforms** - users get the same features regardless of OS + + +## Feature configuration in source builds of libarrow + +Source builds are controlled by `r/inst/build_arrow_static.sh`. 
The key +environment variable is `LIBARROW_MINIMAL`: + +- `LIBARROW_MINIMAL` unset: Default feature set (Parquet, Dataset, JSON, common compression ON; S3/GCS/jemalloc OFF) +- `LIBARROW_MINIMAL=false`: Full feature set (adds S3, jemalloc, additional compression) +- `LIBARROW_MINIMAL=true`: Truly minimal (disables Parquet, Dataset, JSON, most compression, SIMD) + +### Features always enabled + +These features are always built regardless of `LIBARROW_MINIMAL`: + +| Feature | CMake Flag | Notes | +|---------|------------|-------| +| Compute | `ARROW_COMPUTE=ON` | Core compute functions | +| CSV | `ARROW_CSV=ON` | CSV reading/writing | +| Filesystem | `ARROW_FILESYSTEM=ON` | Local filesystem support | +| JSON | `ARROW_JSON=ON` | JSON reading | +| Parquet | `ARROW_PARQUET=ON` | Parquet file format | +| Dataset | `ARROW_DATASET=ON` | Multi-file datasets | +| Acero | `ARROW_ACERO=ON` | Query execution engine | +| Mimalloc | `ARROW_MIMALLOC=ON` | Memory allocator | +| LZ4 | `ARROW_WITH_LZ4=ON` | LZ4 compression | +| Snappy | `ARROW_WITH_SNAPPY=ON` | Snappy compression | +| RE2 | `ARROW_WITH_RE2=ON` | Regular expressions | +| UTF8Proc | `ARROW_WITH_UTF8PROC=ON` | Unicode support | + +### Features controlled by LIBARROW_MINIMAL + +When `LIBARROW_MINIMAL=false`, the following additional features are enabled +(via `$ARROW_DEFAULT_PARAM=ON`): + +| Feature | CMake Flag | Default | +|---------|------------|---------| +| S3 | `ARROW_S3` | `$ARROW_DEFAULT_PARAM` | +| Jemalloc | `ARROW_JEMALLOC` | `$ARROW_DEFAULT_PARAM` | +| Brotli | `ARROW_WITH_BROTLI` | `$ARROW_DEFAULT_PARAM` | +| BZ2 | `ARROW_WITH_BZ2` | `$ARROW_DEFAULT_PARAM` | +| Zlib | `ARROW_WITH_ZLIB` | `$ARROW_DEFAULT_PARAM` | +| Zstd | `ARROW_WITH_ZSTD` | `$ARROW_DEFAULT_PARAM` | + +### Features that require explicit opt-in + +GCS (Google Cloud Storage) is **always off by default**, even when +`LIBARROW_MINIMAL=false`: + +| Feature | CMake Flag | Default | Reason | +|---------|------------|---------|--------| +| GCS | 
`ARROW_GCS` | `OFF` | Build complexity, dependency size | + +To enable GCS in a source build, you must explicitly set `ARROW_GCS=ON`. + +**Why is GCS off by default?** + +GCS was turned off by default in [#48343](https://github.com/apache/arrow/pull/48343) +(December 2025) because: + +1. Building google-cloud-cpp is fragile and adds significant build time +2. The dependency on abseil (ABSL) has caused compatibility issues +3. Users who need GCS can still enable it explicitly + +## Configuration file locations + +### libarrow source build configuration + +The main build script that controls source builds: + +**`r/inst/build_arrow_static.sh`** - CMake flags and defaults +([view source](https://github.com/apache/arrow/blob/main/r/inst/build_arrow_static.sh)) +the environment variables to look for are `LIBARROW_MINIMAL`, `ARROW_*`, and, `ARROW_DEFAULT_PARAM` + +### libarrow binary build configuration + +Each platform has its own configuration file: + +| Platform | Config file | Key settings | +|----------|-------------|--------------| +| macOS | `dev/tasks/r/github.packages.yml` | `LIBARROW_MINIMAL=false`, `ARROW_GCS=ON` | +| Windows | `ci/scripts/PKGBUILD` | `ARROW_GCS=ON`, `ARROW_S3=ON` | +| Linux | `compose.yaml` (`ubuntu-cpp-static`) | `LIBARROW_MINIMAL=false`, `ARROW_GCS=ON` | + +## R-universe builds + +[R-universe](https://apache.r-universe.dev/arrow) builds the Arrow R package +for users who want newer versions than CRAN. 
R-universe behavior varies by +platform and architecture: + +| Platform | Architecture | Build method | Features | +|----------|--------------|--------------|----------| +| macOS | ARM64 | Downloads prebuilt binary | Full (S3 + GCS) | +| macOS | x86_64 | Downloads prebuilt binary | Full (S3 + GCS) | +| Windows | x86_64 | Downloads prebuilt binary | Full (S3 + GCS) | +| Windows | ARM64 | Not supported | NA | +| Linux | x86_64 | Downloads prebuilt binary | Full (S3 + GCS) | +| Linux | ARM64 | Builds from source | S3 only (no GCS) | + +### Why Linux ARM64 builds from source + +We only publish prebuilt Linux binaries for x86_64 architecture. The binary +selection logic in `r/tools/nixlibs.R` (line 263) explicitly checks for this: + +```r +if (identical(os, "darwin") || (identical(os, "linux") && identical(arch, "x86_64"))) { +``` +When R-universe builds on Linux ARM64 runners, no binary is available, so it +falls back to building from source using `build_arrow_static.sh`. Since GCS +defaults to OFF in that script, Linux ARM64 users don't get GCS support. + +### Enabling GCS for Linux ARM64 + +To provide full feature parity for Linux ARM64, we would need to: + +1. Add an ARM64 Linux build job to `dev/tasks/r/github.packages.yml` +2. Update `select_binary()` in `nixlibs.R` to recognize `linux-aarch64` +3. Add the artifact pattern to `dev/tasks/tasks.yml` +4. Update the nightly upload workflow + +See [GH-36193](https://github.com/apache/arrow/issues/36193) for tracking this work. + +Alternatively, changing the GCS default in `build_arrow_static.sh` from `OFF` +to `$ARROW_DEFAULT_PARAM` would enable GCS for all source builds, including +Linux ARM64 on R-universe. 
+ +## Checking installed features + +Users can check which features are enabled in their installation: + +```r +# Show all capabilities +arrow::arrow_info() + +# Check specific features +arrow::arrow_with_s3() +arrow::arrow_with_gcs() +``` + +## Related documentation + +- [Installation guide](../install.html) - User-facing installation docs +- [Installation details](./install_details.html) - How the build system works +- [Developer setup](./setup.html) - Building Arrow for development diff --git a/r/vignettes/fs.Rmd b/r/vignettes/fs.Rmd index ed3b1bddb035..52652ad7e9ed 100644 --- a/r/vignettes/fs.Rmd +++ b/r/vignettes/fs.Rmd @@ -12,9 +12,9 @@ To make this work, the Arrow C++ library contains a general-purpose interface fo This article provides an overview of working with both S3 and GCS data using the Arrow toolkit. -## S3 and GCS support on Linux +## S3 and GCS support -Before you start, make sure that your arrow install has support for S3 and/or GCS enabled. For most users this will be true by default, because the Windows and macOS binary packages hosted on CRAN include S3 and GCS support. You can check whether support is enabled via helper functions: +Before you start, make sure that your arrow installation has support for S3 and/or GCS enabled. You can check whether support is enabled via helper functions: ```r arrow_with_s3() @@ -23,7 +23,20 @@ arrow_with_gcs() If these return `TRUE` then the relevant support is enabled. -In some cases you may find that your system does not have support enabled. The most common case for this occurs on Linux when installing arrow from source. In this situation S3 and GCS support is not always enabled by default, and there are additional system requirements involved. See the [installation article](./install.html) for details on how to resolve this. +CRAN builds of arrow include S3 support but not GCS support. 
If you need GCS support, you can install arrow with full features using one of the following methods: + +```r +# Option 1: Install from R-universe +install.packages("arrow", repos = "https://apache.r-universe.dev") +``` + +```r +# Option 2: Reinstall from source with full features +Sys.setenv("NOT_CRAN" = "true") +install.packages("arrow", type = "source") +``` + +On Linux, S3 and GCS support is not always enabled by default when installing from source, and there are additional system requirements involved. See the [installation article](./install.html) for details. ## Connecting to cloud storage diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index d9cdcc3885c2..14e6622e0434 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -8,6 +8,8 @@ output: rmarkdown::html_vignette In most cases, `install.packages("arrow")` should just work. There are things you can do to make the installation faster, documented in this article. If for some reason installation does not work, set the environment variable `ARROW_R_DEV=true`, retry, and share the logs with us. +Note that CRAN builds of arrow have some optional features disabled, including Google Cloud Storage (GCS) support. If you need these features, see the information below on [building with a libarrow binary](#r-source-package-with-libarrow-binary), or the [cloud storage article](./fs.html#s3-and-gcs-support) for alternative installation options. + ## Background The Apache Arrow project is implemented in multiple languages, and the R package depends on the Arrow C++ library (referred to from here on as libarrow). This means that when you install arrow, you need both the R and C++ versions. If you install arrow from CRAN on a machine running Windows or macOS, when you call `install.packages("arrow")`, a precompiled binary containing both the R package and libarrow will be downloaded. 
However, CRAN does not host R package binaries for Linux, and so you must choose from one of the alternative approaches. diff --git a/ruby/red-arrow-format/lib/arrow-format/array.rb b/ruby/red-arrow-format/lib/arrow-format/array.rb index 077da7663604..5bc7588f3a6b 100644 --- a/ruby/red-arrow-format/lib/arrow-format/array.rb +++ b/ruby/red-arrow-format/lib/arrow-format/array.rb @@ -23,15 +23,25 @@ class Array attr_reader :type attr_reader :size alias_method :length, :size + attr_reader :offset + attr_reader :validity_buffer def initialize(type, size, validity_buffer) @type = type @size = size + @offset = 0 @validity_buffer = validity_buffer + @sliced_buffers = {} + end + + def slice(offset, size=nil) + sliced = dup + sliced.slice!(@offset + offset, size || @size - offset) + sliced end def valid?(i) return true if @validity_buffer.nil? - validity_bitmap[i] == 1 + validity_bitmap[i] end def null?(i) @@ -42,25 +52,87 @@ def n_nulls if @validity_buffer.nil? 0 else - # TODO: popcount - validity_bitmap.count do |bit| - bit == 1 - end + @size - validity_bitmap.popcount end end + protected + def slice!(offset, size) + @offset = offset + @size = size + clear_cache + end + private def validity_bitmap - @validity_bitmap ||= Bitmap.new(@validity_buffer, @size) + @validity_bitmap ||= Bitmap.new(@validity_buffer, @offset, @size) end def apply_validity(array) return array if @validity_buffer.nil? - validity_bitmap.each_with_index do |bit, i| - array[i] = nil if bit.zero? + validity_bitmap.each_with_index do |is_valid, i| + array[i] = nil unless is_valid end array end + + def clear_cache + @validity_bitmap = nil + @sliced_buffers = {} + end + + def slice_buffer(id, buffer) + return buffer if buffer.nil? + return buffer if @offset.zero? + + @sliced_buffers[id] ||= yield(buffer) + end + + def slice_bitmap_buffer(id, buffer) + slice_buffer(id, buffer) do + if (@offset % 8).zero? + buffer.slice(@offset / 8) + else + # We need to copy because we can't do bit level slice. 
+ # TODO: Optimize. + valid_bytes = [] + Bitmap.new(buffer, @offset, @size).each_slice(8) do |valids| + valid_byte = 0 + valids.each_with_index do |valid, i| + valid_byte |= 1 << (i % 8) if valid + end + valid_bytes << valid_byte + end + IO::Buffer.for(valid_bytes.pack("C*")) + end + end + end + + def slice_fixed_element_size_buffer(id, buffer, element_size) + slice_buffer(id, buffer) do + buffer.slice(element_size * @offset) + end + end + + def slice_offsets_buffer(id, buffer, buffer_type) + slice_buffer(id, buffer) do + offset_size = IO::Buffer.size_of(buffer_type) + buffer_offset = offset_size * @offset + first_offset = nil + # TODO: Optimize + sliced_buffer = IO::Buffer.new(offset_size * (@size + 1)) + buffer.each(buffer_type, + buffer_offset, + @size + 1).with_index do |(_, offset), i| + first_offset ||= offset + new_offset = offset - first_offset + sliced_buffer.set_value(buffer_type, + offset_size * i, + new_offset) + end + sliced_buffer + end + end end class NullArray < Array @@ -83,28 +155,48 @@ def initialize(type, size, validity_buffer, values_buffer) @values_buffer = values_buffer end + def to_a + offset = element_size * @offset + apply_validity(@values_buffer.values(@type.buffer_type, offset, @size)) + end + def each_buffer return to_enum(__method__) unless block_given? - yield(@validity_buffer) - yield(@values_buffer) + yield(slice_bitmap_buffer(:validity, @validity_buffer)) + yield(slice_fixed_element_size_buffer(:values, + @values_buffer, + element_size)) + end + + private + def element_size + IO::Buffer.size_of(@type.buffer_type) end end class BooleanArray < PrimitiveArray def to_a - @values_bitmap ||= Bitmap.new(@values_buffer, @size) - values = @values_bitmap.each.collect do |bit| - not bit.zero? - end + @values_bitmap ||= Bitmap.new(@values_buffer, @offset, @size) + values = @values_bitmap.to_a apply_validity(values) end + + def each_buffer + return to_enum(__method__) unless block_given? 
+ + yield(slice_bitmap_buffer(:validity, @validity_buffer)) + yield(slice_bitmap_buffer(:values, @values_buffer)) + end + + private + def clear_cache + super + @values_bitmap = nil + end end class IntArray < PrimitiveArray - def to_a - apply_validity(@values_buffer.values(@type.buffer_type, 0, @size)) - end end class Int8Array < IntArray @@ -135,15 +227,9 @@ class FloatingPointArray < PrimitiveArray end class Float32Array < FloatingPointArray - def to_a - apply_validity(@values_buffer.values(:f32, 0, @size)) - end end class Float64Array < FloatingPointArray - def to_a - apply_validity(@values_buffer.values(:f64, 0, @size)) - end end class TemporalArray < PrimitiveArray @@ -153,75 +239,66 @@ class DateArray < TemporalArray end class Date32Array < DateArray - def to_a - apply_validity(@values_buffer.values(:s32, 0, @size)) - end end class Date64Array < DateArray - def to_a - apply_validity(@values_buffer.values(:s64, 0, @size)) - end end class TimeArray < TemporalArray end class Time32Array < TimeArray - def to_a - apply_validity(@values_buffer.values(:s32, 0, @size)) - end end class Time64Array < TimeArray - def to_a - apply_validity(@values_buffer.values(:s64, 0, @size)) - end end class TimestampArray < TemporalArray - def to_a - apply_validity(@values_buffer.values(:s64, 0, @size)) - end end class IntervalArray < TemporalArray end class YearMonthIntervalArray < IntervalArray - def to_a - apply_validity(@values_buffer.values(:s32, 0, @size)) - end end class DayTimeIntervalArray < IntervalArray def to_a + offset = element_size * @offset values = @values_buffer. - each(:s32, 0, @size * 2). + each(@type.buffer_type, offset, @size * 2). each_slice(2). 
collect do |(_, day), (_, time)| [day, time] end apply_validity(values) end + + private + def element_size + super * 2 + end end class MonthDayNanoIntervalArray < IntervalArray def to_a - buffer_types = [:s32, :s32, :s64] + buffer_types = @type.buffer_types value_size = IO::Buffer.size_of(buffer_types) + base_offset = value_size * @offset values = @size.times.collect do |i| - offset = value_size * i + offset = base_offset + value_size * i @values_buffer.get_values(buffer_types, offset) end apply_validity(values) end + + private + def element_size + IO::Buffer.size_of(@type.buffer_types) + end end class DurationArray < TemporalArray - def to_a - apply_validity(@values_buffer.values(:s64, 0, @size)) - end end class VariableSizeBinaryLayoutArray < Array @@ -234,65 +311,45 @@ def initialize(type, size, validity_buffer, offsets_buffer, values_buffer) def each_buffer return to_enum(__method__) unless block_given? - yield(@validity_buffer) - yield(@offsets_buffer) - yield(@values_buffer) + yield(slice_bitmap_buffer(:validity, @validity_buffer)) + yield(slice_offsets_buffer(:offsets, + @offsets_buffer, + @type.offset_buffer_type)) + sliced_values_buffer = slice_buffer(:values, @values_buffer) do + first_offset = @offsets_buffer.get_value(@type.offset_buffer_type, + offset_size * @offset) + @values_buffer.slice(first_offset) + end + yield(sliced_values_buffer) end def to_a values = @offsets_buffer. - each(buffer_type, 0, @size + 1). + each(@type.offset_buffer_type, offset_size * @offset, @size + 1). each_cons(2). 
collect do |(_, offset), (_, next_offset)| length = next_offset - offset - @values_buffer.get_string(offset, length, encoding) + @values_buffer.get_string(offset, length, @type.encoding) end apply_validity(values) end - end - class BinaryArray < VariableSizeBinaryLayoutArray private - def buffer_type - :s32 # TODO: big endian support + def offset_size + IO::Buffer.size_of(@type.offset_buffer_type) end + end - def encoding - Encoding::ASCII_8BIT - end + class BinaryArray < VariableSizeBinaryLayoutArray end class LargeBinaryArray < VariableSizeBinaryLayoutArray - private - def buffer_type - :s64 # TODO: big endian support - end - - def encoding - Encoding::ASCII_8BIT - end end class UTF8Array < VariableSizeBinaryLayoutArray - private - def buffer_type - :s32 # TODO: big endian support - end - - def encoding - Encoding::UTF_8 - end end class LargeUTF8Array < VariableSizeBinaryLayoutArray - private - def buffer_type - :s64 # TODO: big endian support - end - - def encoding - Encoding::UTF_8 - end end class FixedSizeBinaryArray < Array @@ -301,6 +358,15 @@ def initialize(type, size, validity_buffer, values_buffer) @values_buffer = values_buffer end + def each_buffer + return to_enum(__method__) unless block_given? + + yield(slice_bitmap_buffer(:validity, @validity_buffer)) + yield(slice_fixed_element_size_buffer(:values, + @values_buffer, + @type.byte_width)) + end + def to_a byte_width = @type.byte_width values = 0.step(@size * byte_width - 1, byte_width).collect do |offset| @@ -314,8 +380,9 @@ class DecimalArray < FixedSizeBinaryArray def to_a byte_width = @type.byte_width buffer_types = [:u64] * (byte_width / 8 - 1) + [:s64] + base_offset = byte_width * @offset values = 0.step(@size * byte_width - 1, byte_width).collect do |offset| - @values_buffer.get_values(buffer_types, offset) + @values_buffer.get_values(buffer_types, base_offset + offset) end apply_validity(values).collect do |value| if value.nil? 
@@ -363,44 +430,69 @@ class Decimal256Array < DecimalArray end class VariableSizeListArray < Array + attr_reader :child def initialize(type, size, validity_buffer, offsets_buffer, child) super(type, size, validity_buffer) @offsets_buffer = offsets_buffer @child = child end + def each_buffer(&block) + return to_enum(__method__) unless block_given? + + yield(slice_bitmap_buffer(:validity, @validity_buffer)) + yield(slice_offsets_buffer(:offsets, + @offsets_buffer, + @type.offset_buffer_type)) + end + def to_a child_values = @child.to_a values = @offsets_buffer. - each(offset_type, 0, @size + 1). + each(@type.offset_buffer_type, offset_size * @offset, @size + 1). each_cons(2). collect do |(_, offset), (_, next_offset)| child_values[offset...next_offset] end apply_validity(values) end - end - class ListArray < VariableSizeListArray private - def offset_type - :s32 # TODO: big endian support + def offset_size + IO::Buffer.size_of(@type.offset_buffer_type) end + + def slice!(offset, size) + super + first_offset = + @offsets_buffer.get_value(@type.offset_buffer_type, + offset_size * @offset) + last_offset = + @offsets_buffer.get_value(@type.offset_buffer_type, + offset_size * (@offset + @size + 1)) + @child = @child.slice(first_offset, last_offset - first_offset) + end + end + + class ListArray < VariableSizeListArray end class LargeListArray < VariableSizeListArray - private - def offset_type - :s64 # TODO: big endian support - end end class StructArray < Array + attr_reader :children def initialize(type, size, validity_buffer, children) super(type, size, validity_buffer) @children = children end + def each_buffer(&block) + return to_enum(__method__) unless block_given? + + yield(slice_bitmap_buffer(:validity, @validity_buffer)) + end + def to_a if @children.empty? 
values = [[]] * @size @@ -410,6 +502,14 @@ def to_a end apply_validity(values) end + + private + def slice!(offset, size) + super + @children = @children.collect do |child| + child.slice(offset, size) + end + end end class MapArray < VariableSizeListArray @@ -426,19 +526,24 @@ def to_a end end end - - private - def offset_type - :s32 # TODO: big endian support - end end class UnionArray < Array + attr_reader :children def initialize(type, size, types_buffer, children) super(type, size, nil) @types_buffer = types_buffer @children = children end + + private + def type_buffer_type + :S8 + end + + def type_element_size + IO::Buffer.size_of(type_buffer_type) + end end class DenseUnionArray < UnionArray @@ -451,41 +556,96 @@ def initialize(type, @offsets_buffer = offsets_buffer end + def each_buffer(&block) + return to_enum(__method__) unless block_given? + + # TODO: Dictionary delta support (slice support) + yield(@types_buffer) + yield(@offsets_buffer) + end + def to_a children_values = @children.collect(&:to_a) - types = @types_buffer.each(:S8, 0, @size) - offsets = @offsets_buffer.each(:s32, 0, @size) + types = @types_buffer.each(type_buffer_type, + type_element_size * @offset, + @size) + offsets = @offsets_buffer.each(:s32, + offset_element_size * @offset, + @size) types.zip(offsets).collect do |(_, type), (_, offset)| index = @type.resolve_type_index(type) children_values[index][offset] end end + + private + def offset_buffer_type + :s32 + end + + def offset_element_size + IO::Buffer.size_of(offset_buffer_type) + end end class SparseUnionArray < UnionArray + def each_buffer(&block) + return to_enum(__method__) unless block_given? 
+ + yield(slice_fixed_element_size_buffer(:types, + @types_buffer, + type_element_size)) + end + def to_a children_values = @children.collect(&:to_a) - @types_buffer.each(:S8, 0, @size).with_index.collect do |(_, type), i| + @types_buffer.each(type_buffer_type, + type_element_size * @offset, + @size).with_index.collect do |(_, type), i| index = @type.resolve_type_index(type) children_values[index][i] end end + + private + def slice!(offset, size) + super + @children = @children.collect do |child| + child.slice(offset, size) + end + end end class DictionaryArray < Array - def initialize(type, size, validity_buffer, indices_buffer, dictionary) + attr_reader :indices_buffer + attr_reader :dictionaries + def initialize(type, + size, + validity_buffer, + indices_buffer, + dictionaries) super(type, size, validity_buffer) @indices_buffer = indices_buffer - @dictionary = dictionary + @dictionaries = dictionaries + end + + # TODO: Slice support + def each_buffer + return to_enum(__method__) unless block_given? + + yield(@validity_buffer) + yield(@indices_buffer) end def to_a values = [] - @dictionary.each do |dictionary_chunk| - values.concat(dictionary_chunk.to_a) + @dictionaries.each do |dictionary| + values.concat(dictionary.to_a) end buffer_type = @type.index_type.buffer_type - indices = apply_validity(@indices_buffer.values(buffer_type, 0, @size)) + offset = IO::Buffer.size_of(buffer_type) * @offset + indices = + apply_validity(@indices_buffer.values(buffer_type, offset, @size)) indices.collect do |index| if index.nil? 
nil diff --git a/ruby/red-arrow-format/lib/arrow-format/bitmap.rb b/ruby/red-arrow-format/lib/arrow-format/bitmap.rb index 5cff7e63d2ad..e4a0dc76d368 100644 --- a/ruby/red-arrow-format/lib/arrow-format/bitmap.rb +++ b/ruby/red-arrow-format/lib/arrow-format/bitmap.rb @@ -18,31 +18,46 @@ module ArrowFormat class Bitmap include Enumerable - def initialize(buffer, n_values) + def initialize(buffer, offset, n_values) @buffer = buffer + @offset = offset @n_values = n_values end def [](i) - (@validity_buffer.get_value(:U8, i / 8) & (1 << (i % 8))) > 0 + i += @offset + (@buffer.get_value(:U8, i / 8) & (1 << (i % 8))) > 0 end def each return to_enum(__method__) unless block_given? - n_bytes = @n_values / 8 + # TODO: Optimize + current = -1 + n_bytes = (@offset + @n_values) / 8 @buffer.each(:U8, 0, n_bytes) do |offset, value| 7.times do |i| - yield(value & (1 << (i % 8))) + current += 1 + next if current < @offset + yield((value & (1 << (i % 8))) > 0) end end - remained_bits = @n_values % 8 + remained_bits = (@offset + @n_values) % 8 unless remained_bits.zero? 
value = @buffer.get_value(:U8, n_bytes) remained_bits.times do |i| - yield(value & (1 << (i % 8))) + current += 1 + next if current < @offset + yield((value & (1 << (i % 8))) > 0) end end end + + def popcount + # TODO: Optimize + count do |flaged| + flaged + end + end end end diff --git a/ruby/red-arrow-format/lib/arrow-format/field.rb b/ruby/red-arrow-format/lib/arrow-format/field.rb index fc5639bb6699..7736bbf5e7e7 100644 --- a/ruby/red-arrow-format/lib/arrow-format/field.rb +++ b/ruby/red-arrow-format/lib/arrow-format/field.rb @@ -34,22 +34,14 @@ def to_flatbuffers fb_field = FB::Field::Data.new fb_field.name = @name fb_field.nullable = @nullable - if @type.is_a?(DictionaryType) - fb_field.type = @type.value_type.to_flatbuffers - dictionary_encoding = FB::DictionaryEncoding::Data.new - dictionary_encoding.id = @dictionary_id - int = FB::Int::Data.new - int.bit_width = @type.index_type.bit_width - int.signed = @type.index_type.signed? - dictionary_encoding.index_type = int - dictionary_encoding.ordered = @type.ordered? 
- dictionary_encoding.dictionary_kind = - FB::DictionaryKind::DENSE_ARRAY - fb_field.dictionary = dictionary + if @type.respond_to?(:build_fb_field) + @type.build_fb_field(fb_field, self) else fb_field.type = @type.to_flatbuffers end - if @type.respond_to?(:children) + if @type.respond_to?(:child) + fb_field.children = [@type.child.to_flatbuffers] + elsif @type.respond_to?(:children) fb_field.children = @type.children.collect(&:to_flatbuffers) end # fb_field.custom_metadata = @custom_metadata diff --git a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb index 6218fbcf1437..03514a3cc2e0 100644 --- a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb +++ b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb @@ -203,7 +203,7 @@ def read_dictionaries dictionaries end - def find_dictionary(id) + def find_dictionaries(id) @dictionaries[id] end end diff --git a/ruby/red-arrow-format/lib/arrow-format/file-writer.rb b/ruby/red-arrow-format/lib/arrow-format/file-writer.rb index 8509be59b6de..27b6b55bbf9a 100644 --- a/ruby/red-arrow-format/lib/arrow-format/file-writer.rb +++ b/ruby/red-arrow-format/lib/arrow-format/file-writer.rb @@ -41,7 +41,7 @@ def build_footer fb_footer = FB::Footer::Data.new fb_footer.version = FB::MetadataVersion::V5 fb_footer.schema = @fb_schema - # fb_footer.dictionaries = ... # TODO + fb_footer.dictionaries = @fb_dictionary_blocks fb_footer.record_batches = @fb_record_batch_blocks # fb_footer.custom_metadata = ... 
# TODO FB::Footer.serialize(fb_footer) diff --git a/ruby/red-arrow-format/lib/arrow-format/readable.rb b/ruby/red-arrow-format/lib/arrow-format/readable.rb index 9cf1beecbebe..ff09c6129dd0 100644 --- a/ruby/red-arrow-format/lib/arrow-format/readable.rb +++ b/ruby/red-arrow-format/lib/arrow-format/readable.rb @@ -78,11 +78,11 @@ def read_field(fb_field) when FB::Interval case fb_type.unit when FB::IntervalUnit::YEAR_MONTH - type = YearMonthIntervalType.new + type = YearMonthIntervalType.singleton when FB::IntervalUnit::DAY_TIME - type = DayTimeIntervalType.new + type = DayTimeIntervalType.singleton when FB::IntervalUnit::MONTH_DAY_NANO - type = MonthDayNanoIntervalType.new + type = MonthDayNanoIntervalType.singleton end when FB::Duration unit = fb_type.unit.name.downcase.to_sym @@ -233,8 +233,8 @@ def read_column(field, nodes, buffers, body) when DictionaryType indices_buffer = buffers.shift indices = body.slice(indices_buffer.offset, indices_buffer.length) - dictionary = find_dictionary(field.dictionary_id) - field.type.build_array(length, validity, indices, dictionary) + dictionaries = find_dictionaries(field.dictionary_id) + field.type.build_array(length, validity, indices, dictionaries) end end end diff --git a/ruby/red-arrow-format/lib/arrow-format/record-batch.rb b/ruby/red-arrow-format/lib/arrow-format/record-batch.rb index cf925eebdfa3..a641c87da71e 100644 --- a/ruby/red-arrow-format/lib/arrow-format/record-batch.rb +++ b/ruby/red-arrow-format/lib/arrow-format/record-batch.rb @@ -70,7 +70,9 @@ def all_columns_enumerator Enumerator.new do |yielder| traverse = lambda do |array| yielder << array - if array.respond_to?(:children) + if array.respond_to?(:child) + traverse.call(array.child) + elsif array.respond_to?(:children) array.children.each do |child_array| traverse.call(child_array) end diff --git a/ruby/red-arrow-format/lib/arrow-format/streaming-pull-reader.rb b/ruby/red-arrow-format/lib/arrow-format/streaming-pull-reader.rb index 
ffa4cb553459..98263de77e10 100644 --- a/ruby/red-arrow-format/lib/arrow-format/streaming-pull-reader.rb +++ b/ruby/red-arrow-format/lib/arrow-format/streaming-pull-reader.rb @@ -231,7 +231,7 @@ def process_dictionary_batch_message(message, body) end end - def find_dictionary(id) + def find_dictionaries(id) @dictionaries[id] end diff --git a/ruby/red-arrow-format/lib/arrow-format/streaming-writer.rb b/ruby/red-arrow-format/lib/arrow-format/streaming-writer.rb index 313c1b38ad99..11f2b4375a74 100644 --- a/ruby/red-arrow-format/lib/arrow-format/streaming-writer.rb +++ b/ruby/red-arrow-format/lib/arrow-format/streaming-writer.rb @@ -29,38 +29,26 @@ class StreamingWriter def initialize(output) @output = output @offset = 0 + @fb_dictionary_blocks = [] @fb_record_batch_blocks = [] + @written_dictionary_offsets = {} end def start(schema) write_message(build_metadata(schema.to_flatbuffers)) - # TODO: Write dictionaries end def write_record_batch(record_batch) - body_length = 0 - record_batch.all_buffers_enumerator.each do |buffer| - body_length += aligned_buffer_size(buffer) if buffer + record_batch.schema.fields.each_with_index do |field, i| + next if field.dictionary_id.nil? 
+ dictionary_array = record_batch.columns[i] + write_dictionary(field.dictionary_id, dictionary_array) end - metadata = build_metadata(record_batch.to_flatbuffers, body_length) - fb_block = FB::Block::Data.new - fb_block.offset = @offset - fb_block.meta_data_length = - CONTINUATION.bytesize + - MessagePullReader::METADATA_LENGTH_SIZE + - metadata.bytesize - fb_block.body_length = body_length - @fb_record_batch_blocks << fb_block - write_message(metadata) do - record_batch.all_buffers_enumerator.each do |buffer| - write_buffer(buffer) if buffer - end - end - end - # TODO - # def write_dictionary_delta(id, dictionary) - # end + write_record_batch_based_message(record_batch, + record_batch.to_flatbuffers, + @fb_record_batch_blocks) + end def finish write_data(EOS) @@ -100,6 +88,57 @@ def build_metadata(header, body_length=0) metadata end + def write_record_batch_based_message(record_batch, fb_header, fb_blocks) + body_length = 0 + record_batch.all_buffers_enumerator.each do |buffer| + body_length += aligned_buffer_size(buffer) if buffer + end + metadata = build_metadata(fb_header, body_length) + fb_block = FB::Block::Data.new + fb_block.offset = @offset + fb_block.meta_data_length = + CONTINUATION.bytesize + + MessagePullReader::METADATA_LENGTH_SIZE + + metadata.bytesize + fb_block.body_length = body_length + fb_blocks << fb_block + write_message(metadata) do + record_batch.all_buffers_enumerator.each do |buffer| + write_buffer(buffer) if buffer + end + end + end + + def write_dictionary(id, dictionary_array) + value_type = dictionary_array.type.value_type + base_offset = 0 + dictionary_array.dictionaries.each do |dictionary| + written_offset = @written_dictionary_offsets[id] || 0 + current_base_offset = base_offset + next_base_offset = base_offset + dictionary.size + base_offset = next_base_offset + + next if next_base_offset <= written_offset + + is_delta = (not written_offset.zero?) 
+ if current_base_offset < written_offset + dictionary = dictionary.slice(written_offset - current_base_offset) + end + + schema = Schema.new([Field.new("dummy", value_type, true, nil)]) + size = dictionary.size + record_batch = RecordBatch.new(schema, size, [dictionary]) + fb_dictionary_batch = FB::DictionaryBatch::Data.new + fb_dictionary_batch.id = id + fb_dictionary_batch.data = record_batch.to_flatbuffers + fb_dictionary_batch.delta = is_delta + write_record_batch_based_message(record_batch, + fb_dictionary_batch, + @fb_dictionary_blocks) + @written_dictionary_offsets[id] = written_offset + dictionary.size + end + end + def write_message(metadata) write_data(CONTINUATION) metadata_size = metadata.bytesize diff --git a/ruby/red-arrow-format/lib/arrow-format/type.rb b/ruby/red-arrow-format/lib/arrow-format/type.rb index c648e5b63137..bc2b3132857f 100644 --- a/ruby/red-arrow-format/lib/arrow-format/type.rb +++ b/ruby/red-arrow-format/lib/arrow-format/type.rb @@ -305,6 +305,10 @@ def name "Float32" end + def buffer_type + :f32 + end + def build_array(size, validity_buffer, values_buffer) Float32Array.new(self, size, validity_buffer, values_buffer) end @@ -325,15 +329,30 @@ def name "Float64" end + def buffer_type + :f64 + end + def build_array(size, validity_buffer, values_buffer) Float64Array.new(self, size, validity_buffer, values_buffer) end end - class TemporalType < Type + class TemporalType < PrimitiveType end class DateType < TemporalType + attr_reader :unit + def initialize(unit) + super() + @unit = unit + end + + def to_flatbuffers + fb_type = FB::Date::Data.new + fb_type.unit = FB::DateUnit.try_convert(@unit.to_s.upcase) + fb_type + end end class Date32Type < DateType @@ -343,10 +362,18 @@ def singleton end end + def initialize + super(:day) + end + def name "Date32" end + def buffer_type + :s32 + end + def build_array(size, validity_buffer, values_buffer) Date32Array.new(self, size, validity_buffer, values_buffer) end @@ -359,38 +386,71 @@ def singleton 
end end + def initialize + super(:millisecond) + end + def name "Date64" end + def buffer_type + :s64 + end + def build_array(size, validity_buffer, values_buffer) Date64Array.new(self, size, validity_buffer, values_buffer) end end class TimeType < TemporalType + attr_reader :bit_width attr_reader :unit - def initialize(unit) + def initialize(bit_width, unit) super() + @bit_width = bit_width @unit = unit end + + def to_flatbuffers + fb_type = FB::Time::Data.new + fb_type.bit_width = @bit_width + fb_type.unit = FB::TimeUnit.try_convert(@unit.to_s.upcase) + fb_type + end end class Time32Type < TimeType + def initialize(unit) + super(32, unit) + end + def name "Time32" end + def buffer_type + :s32 + end + def build_array(size, validity_buffer, values_buffer) Time32Array.new(self, size, validity_buffer, values_buffer) end end class Time64Type < TimeType + def initialize(unit) + super(64, unit) + end + def name "Time64" end + def buffer_type + :s64 + end + def build_array(size, validity_buffer, values_buffer) Time64Array.new(self, size, validity_buffer, values_buffer) end @@ -398,50 +458,102 @@ def build_array(size, validity_buffer, values_buffer) class TimestampType < TemporalType attr_reader :unit - attr_reader :timezone - def initialize(unit, timezone) + attr_reader :time_zone + def initialize(unit, time_zone) super() @unit = unit - @timezone = timezone + @time_zone = time_zone end def name "Timestamp" end + def buffer_type + :s64 + end + def build_array(size, validity_buffer, values_buffer) TimestampArray.new(self, size, validity_buffer, values_buffer) end + + def to_flatbuffers + fb_type = FB::Timestamp::Data.new + fb_type.unit = FB::TimeUnit.try_convert(@unit.to_s.upcase) + fb_type.timezone = @time_zone + fb_type + end end class IntervalType < TemporalType + class << self + def singleton + @singleton ||= new + end + end + + attr_reader :unit + def initialize(unit) + super() + @unit = unit + end + + def to_flatbuffers + fb_type = FB::Interval::Data.new + 
fb_type.unit = FB::IntervalUnit.try_convert(@unit.to_s.upcase) + fb_type + end end class YearMonthIntervalType < IntervalType + def initialize + super(:year_month) + end + def name "YearMonthInterval" end + def buffer_type + :s32 + end + def build_array(size, validity_buffer, values_buffer) YearMonthIntervalArray.new(self, size, validity_buffer, values_buffer) end end class DayTimeIntervalType < IntervalType + def initialize + super(:day_time) + end + def name "DayTimeInterval" end + def buffer_type + :s32 + end + def build_array(size, validity_buffer, values_buffer) DayTimeIntervalArray.new(self, size, validity_buffer, values_buffer) end end class MonthDayNanoIntervalType < IntervalType + def initialize + super(:month_day_nano) + end + def name "MonthDayNanoInterval" end + def buffer_types + @buffer_types ||= [:s32, :s32, :s64] + end + def build_array(size, validity_buffer, values_buffer) MonthDayNanoIntervalArray.new(self, size, @@ -461,9 +573,19 @@ def name "Duration" end + def buffer_type + :s64 + end + def build_array(size, validity_buffer, values_buffer) DurationArray.new(self, size, validity_buffer, values_buffer) end + + def to_flatbuffers + fb_type = FB::Duration::Data.new + fb_type.unit = FB::TimeUnit.try_convert(@unit.to_s.upcase) + fb_type + end end class VariableSizeBinaryType < Type @@ -480,6 +602,14 @@ def name "Binary" end + def offset_buffer_type + :s32 # TODO: big endian support + end + + def encoding + Encoding::ASCII_8BIT + end + def build_array(size, validity_buffer, offsets_buffer, values_buffer) BinaryArray.new(self, size, @@ -504,6 +634,14 @@ def name "LargeBinary" end + def offset_buffer_type + :s64 # TODO: big endian support + end + + def encoding + Encoding::ASCII_8BIT + end + def build_array(size, validity_buffer, offsets_buffer, values_buffer) LargeBinaryArray.new(self, size, @@ -528,6 +666,14 @@ def name "UTF8" end + def offset_buffer_type + :s32 # TODO: big endian support + end + + def encoding + Encoding::UTF_8 + end + def 
build_array(size, validity_buffer, offsets_buffer, values_buffer) UTF8Array.new(self, size, validity_buffer, offsets_buffer, values_buffer) end @@ -548,6 +694,14 @@ def name "LargeUTF8" end + def offset_buffer_type + :s64 # TODO: big endian support + end + + def encoding + Encoding::UTF_8 + end + def build_array(size, validity_buffer, offsets_buffer, values_buffer) LargeUTF8Array.new(self, size, @@ -555,6 +709,10 @@ def build_array(size, validity_buffer, offsets_buffer, values_buffer) offsets_buffer, values_buffer) end + + def to_flatbuffers + FB::LargeUtf8::Data.new + end end class FixedSizeBinaryType < Type @@ -571,6 +729,12 @@ def name def build_array(size, validity_buffer, values_buffer) FixedSizeBinaryArray.new(self, size, validity_buffer, values_buffer) end + + def to_flatbuffers + fb_type = FB::FixedSizeBinary::Data.new + fb_type.byte_width = @byte_width + fb_type + end end class DecimalType < FixedSizeBinaryType @@ -581,6 +745,14 @@ def initialize(byte_width, precision, scale) @precision = precision @scale = scale end + + def to_flatbuffers + fb_type = FB::Decimal::Data.new + fb_type.bit_width = @byte_width * 8 + fb_type.precision = @precision + fb_type.scale = @scale + fb_type + end end class Decimal128Type < DecimalType @@ -617,7 +789,6 @@ def initialize(child) super() @child = child end - end class ListType < VariableSizeListType @@ -625,9 +796,17 @@ def name "List" end + def offset_buffer_type + :s32 # TODO: big endian support + end + def build_array(size, validity_buffer, offsets_buffer, child) ListArray.new(self, size, validity_buffer, offsets_buffer, child) end + + def to_flatbuffers + FB::List::Data.new + end end class LargeListType < VariableSizeListType @@ -635,9 +814,17 @@ def name "LargeList" end + def offset_buffer_type + :s64 # TODO: big endian support + end + def build_array(size, validity_buffer, offsets_buffer, child) LargeListArray.new(self, size, validity_buffer, offsets_buffer, child) end + + def to_flatbuffers + FB::LargeList::Data.new 
+ end end class StructType < Type @@ -654,6 +841,10 @@ def name def build_array(size, validity_buffer, children) StructArray.new(self, size, validity_buffer, children) end + + def to_flatbuffers + FB::Struct::Data.new + end end class MapType < VariableSizeListType @@ -681,16 +872,25 @@ def name "Map" end + def offset_buffer_type + :s32 # TODO: big endian support + end + def build_array(size, validity_buffer, offsets_buffer, child) MapArray.new(self, size, validity_buffer, offsets_buffer, child) end + + def to_flatbuffers + FB::Map::Data.new + end end class UnionType < Type attr_reader :children attr_reader :type_ids - def initialize(children, type_ids) + def initialize(mode, children, type_ids) super() + @mode = mode @children = children @type_ids = type_ids @type_indexes = {} @@ -699,9 +899,20 @@ def initialize(children, type_ids) def resolve_type_index(type) @type_indexes[type] ||= @type_ids.index(type) end + + def to_flatbuffers + fb_type = FB::Union::Data.new + fb_type.mode = FB::UnionMode.try_convert(@mode.to_s.capitalize) + fb_type.type_ids = @type_ids + fb_type + end end class DenseUnionType < UnionType + def initialize(children, type_ids) + super(:dense, children, type_ids) + end + def name "DenseUnion" end @@ -712,6 +923,10 @@ def build_array(size, types_buffer, offsets_buffer, children) end class SparseUnionType < UnionType + def initialize(children, type_ids) + super(:sparse, children, type_ids) + end + def name "SparseUnion" end @@ -739,12 +954,26 @@ def name "Dictionary" end - def build_array(size, validity_buffer, indices_buffer, dictionary) + def build_array(size, validity_buffer, indices_buffer, dictionaries) DictionaryArray.new(self, size, validity_buffer, indices_buffer, - dictionary) + dictionaries) + end + + def build_fb_field(fb_field, field) + fb_dictionary_encoding = FB::DictionaryEncoding::Data.new + fb_dictionary_encoding.id = field.dictionary_id + fb_int = FB::Int::Data.new + fb_int.bit_width = @index_type.bit_width + fb_int.signed = 
@index_type.signed? + fb_dictionary_encoding.index_type = fb_int + fb_dictionary_encoding.ordered = @ordered + fb_dictionary_encoding.dictionary_kind = + FB::DictionaryKind::DENSE_ARRAY + fb_field.type = @value_type.to_flatbuffers + fb_field.dictionary = fb_dictionary_encoding end end end diff --git a/ruby/red-arrow-format/test/test-reader.rb b/ruby/red-arrow-format/test/test-reader.rb index e00489673760..10a2597f4a05 100644 --- a/ruby/red-arrow-format/test/test-reader.rb +++ b/ruby/red-arrow-format/test/test-reader.rb @@ -16,6 +16,20 @@ # under the License. module ReaderTests + def read + @reader.collect do |record_batch| + record_batch.to_h.tap do |hash| + hash.each do |key, value| + hash[key] = value.to_a + end + end + end + end + + def type + @type ||= @reader.first.schema.fields[0].type + end + class << self def included(base) base.class_eval do @@ -191,7 +205,7 @@ def test_read sub_test_case("Date64") do def setup(&block) @date_2017_08_28_00_00_00 = 1503878400000 - @date_2025_12_09_00_00_00 = 1765324800000 + @date_2025_12_10_00_00_00 = 1765324800000 super(&block) end @@ -199,7 +213,7 @@ def build_array Arrow::Date64Array.new([ @date_2017_08_28_00_00_00, nil, - @date_2025_12_09_00_00_00, + @date_2025_12_10_00_00_00, ]) end @@ -209,7 +223,7 @@ def test_read "value" => [ @date_2017_08_28_00_00_00, nil, - @date_2025_12_09_00_00_00, + @date_2025_12_10_00_00_00, ], }, ], @@ -225,7 +239,8 @@ def setup(&block) end def build_array - Arrow::Time32Array.new(:second, [@time_00_00_10, nil, @time_00_01_10]) + Arrow::Time32Array.new(:second, + [@time_00_00_10, nil, @time_00_01_10]) end def test_read @@ -350,7 +365,7 @@ def test_type sub_test_case("Timestamp(:second)") do def setup(&block) - @timestamp_2019_11_18_00_09_11 = 1574003351 + @timestamp_2019_11_17_15_09_11 = 1574003351 @timestamp_2025_12_16_05_33_58 = 1765863238 super(&block) end @@ -358,7 +373,7 @@ def setup(&block) def build_array Arrow::TimestampArray.new(:second, [ - @timestamp_2019_11_18_00_09_11, + 
@timestamp_2019_11_17_15_09_11, nil, @timestamp_2025_12_16_05_33_58, ]) @@ -368,7 +383,7 @@ def test_read assert_equal([ { "value" => [ - @timestamp_2019_11_18_00_09_11, + @timestamp_2019_11_17_15_09_11, nil, @timestamp_2025_12_16_05_33_58, ], @@ -380,7 +395,7 @@ def test_read sub_test_case("Timestamp(:millisecond)") do def setup(&block) - @timestamp_2019_11_18_00_09_11 = 1574003351 * 1_000 + @timestamp_2019_11_17_15_09_11 = 1574003351 * 1_000 @timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000 super(&block) end @@ -388,7 +403,7 @@ def setup(&block) def build_array Arrow::TimestampArray.new(:milli, [ - @timestamp_2019_11_18_00_09_11, + @timestamp_2019_11_17_15_09_11, nil, @timestamp_2025_12_16_05_33_58, ]) @@ -398,7 +413,7 @@ def test_read assert_equal([ { "value" => [ - @timestamp_2019_11_18_00_09_11, + @timestamp_2019_11_17_15_09_11, nil, @timestamp_2025_12_16_05_33_58, ], @@ -410,7 +425,7 @@ def test_read sub_test_case("Timestamp(:microsecond)") do def setup(&block) - @timestamp_2019_11_18_00_09_11 = 1574003351 * 1_000_000 + @timestamp_2019_11_17_15_09_11 = 1574003351 * 1_000_000 @timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000_000 super(&block) end @@ -418,7 +433,7 @@ def setup(&block) def build_array Arrow::TimestampArray.new(:micro, [ - @timestamp_2019_11_18_00_09_11, + @timestamp_2019_11_17_15_09_11, nil, @timestamp_2025_12_16_05_33_58, ]) @@ -428,7 +443,7 @@ def test_read assert_equal([ { "value" => [ - @timestamp_2019_11_18_00_09_11, + @timestamp_2019_11_17_15_09_11, nil, @timestamp_2025_12_16_05_33_58, ], @@ -440,7 +455,7 @@ def test_read sub_test_case("Timestamp(:nanosecond)") do def setup(&block) - @timestamp_2019_11_18_00_09_11 = 1574003351 * 1_000_000_000 + @timestamp_2019_11_17_15_09_11 = 1574003351 * 1_000_000_000 @timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000_000_000 super(&block) end @@ -448,7 +463,7 @@ def setup(&block) def build_array Arrow::TimestampArray.new(:nano, [ - @timestamp_2019_11_18_00_09_11, + @timestamp_2019_11_17_15_09_11, 
nil, @timestamp_2025_12_16_05_33_58, ]) @@ -458,7 +473,7 @@ def test_read assert_equal([ { "value" => [ - @timestamp_2019_11_18_00_09_11, + @timestamp_2019_11_17_15_09_11, nil, @timestamp_2025_12_16_05_33_58, ], @@ -468,27 +483,27 @@ def test_read end end - sub_test_case("Timestamp(timezone)") do + sub_test_case("Timestamp(time_zone)") do def setup(&block) - @timezone = "UTC" - @timestamp_2019_11_18_00_09_11 = 1574003351 + @time_zone = "UTC" + @timestamp_2019_11_17_15_09_11 = 1574003351 @timestamp_2025_12_16_05_33_58 = 1765863238 super(&block) end def build_array - data_type = Arrow::TimestampDataType.new(:second, @timezone) + data_type = Arrow::TimestampDataType.new(:second, @time_zone) Arrow::TimestampArray.new(data_type, [ - @timestamp_2019_11_18_00_09_11, + @timestamp_2019_11_17_15_09_11, nil, @timestamp_2025_12_16_05_33_58, ]) end def test_type - assert_equal([:second, @timezone], - [type.unit, type.timezone]) + assert_equal([:second, @time_zone], + [type.unit, type.time_zone]) end end @@ -900,20 +915,6 @@ def setup GC.start end end - - def read - @reader.to_a.collect do |record_batch| - record_batch.to_h.tap do |hash| - hash.each do |key, value| - hash[key] = value.to_a - end - end - end - end - - def type - @type ||= @reader.first.schema.fields[0].type - end end class TestStreamingReader < Test::Unit::TestCase @@ -932,18 +933,4 @@ def setup GC.start end end - - def read - @reader.collect do |record_batch| - record_batch.to_h.tap do |hash| - hash.each do |key, value| - hash[key] = value.to_a - end - end - end - end - - def type - @type ||= @reader.first.schema.fields[0].type - end end diff --git a/ruby/red-arrow-format/test/test-writer.rb b/ruby/red-arrow-format/test/test-writer.rb index 24a49b3777f3..3b97d08fc46c 100644 --- a/ruby/red-arrow-format/test/test-writer.rb +++ b/ruby/red-arrow-format/test/test-writer.rb @@ -15,7 +15,15 @@ # specific language governing permissions and limitations # under the License. 
-module WriterTests +module WriterHelper + def convert_time_unit(red_arrow_time_unit) + if red_arrow_time_unit.nick == "second" + red_arrow_time_unit.nick.to_sym + else + :"#{red_arrow_time_unit.nick}second" + end + end + def convert_type(red_arrow_type) case red_arrow_type when Arrow::NullDataType @@ -42,20 +50,91 @@ def convert_type(red_arrow_type) ArrowFormat::Float32Type.singleton when Arrow::DoubleDataType ArrowFormat::Float64Type.singleton + when Arrow::Date32DataType + ArrowFormat::Date32Type.singleton + when Arrow::Date64DataType + ArrowFormat::Date64Type.singleton + when Arrow::Time32DataType + ArrowFormat::Time32Type.new(convert_time_unit(red_arrow_type.unit)) + when Arrow::Time64DataType + ArrowFormat::Time64Type.new(convert_time_unit(red_arrow_type.unit)) + when Arrow::TimestampDataType + ArrowFormat::TimestampType.new(convert_time_unit(red_arrow_type.unit), + red_arrow_type.time_zone&.identifier) + when Arrow::MonthIntervalDataType + ArrowFormat::YearMonthIntervalType.singleton + when Arrow::DayTimeIntervalDataType + ArrowFormat::DayTimeIntervalType.singleton + when Arrow::MonthDayNanoIntervalDataType + ArrowFormat::MonthDayNanoIntervalType.singleton + when Arrow::DurationDataType + ArrowFormat::DurationType.new(convert_time_unit(red_arrow_type.unit)) when Arrow::BinaryDataType ArrowFormat::BinaryType.singleton when Arrow::LargeBinaryDataType ArrowFormat::LargeBinaryType.singleton when Arrow::StringDataType ArrowFormat::UTF8Type.singleton + when Arrow::LargeStringDataType + ArrowFormat::LargeUTF8Type.singleton + when Arrow::Decimal128DataType + ArrowFormat::Decimal128Type.new(red_arrow_type.precision, + red_arrow_type.scale) + when Arrow::Decimal256DataType + ArrowFormat::Decimal256Type.new(red_arrow_type.precision, + red_arrow_type.scale) + when Arrow::FixedSizeBinaryDataType + ArrowFormat::FixedSizeBinaryType.new(red_arrow_type.byte_width) + when Arrow::MapDataType + ArrowFormat::MapType.new(convert_field(red_arrow_type.field)) + when 
Arrow::ListDataType + ArrowFormat::ListType.new(convert_field(red_arrow_type.field)) + when Arrow::LargeListDataType + ArrowFormat::LargeListType.new(convert_field(red_arrow_type.field)) + when Arrow::StructDataType + fields = red_arrow_type.fields.collect do |field| + convert_field(field) + end + ArrowFormat::StructType.new(fields) + when Arrow::DenseUnionDataType + fields = red_arrow_type.fields.collect do |field| + convert_field(field) + end + ArrowFormat::DenseUnionType.new(fields, red_arrow_type.type_codes) + when Arrow::SparseUnionDataType + fields = red_arrow_type.fields.collect do |field| + convert_field(field) + end + ArrowFormat::SparseUnionType.new(fields, red_arrow_type.type_codes) + when Arrow::DictionaryDataType + index_type = convert_type(red_arrow_type.index_data_type) + type = convert_type(red_arrow_type.value_data_type) + ArrowFormat::DictionaryType.new(index_type, + type, + red_arrow_type.ordered?) else raise "Unsupported type: #{red_arrow_type.inspect}" end end + def convert_field(red_arrow_field) + type = convert_type(red_arrow_field.data_type) + if type.is_a?(ArrowFormat::DictionaryType) + @dictionary_id ||= 0 + dictionary_id = @dictionary_id + @dictionary_id += 1 + else + dictionary_id = nil + end + ArrowFormat::Field.new(red_arrow_field.name, + type, + red_arrow_field.nullable?, + dictionary_id) + end + def convert_buffer(buffer) return nil if buffer.nil? 
- IO::Buffer.for(buffer.data.to_s) + IO::Buffer.for(buffer.data.to_s.dup) end def convert_array(red_arrow_array) @@ -72,245 +151,1364 @@ def convert_array(red_arrow_array) convert_buffer(red_arrow_array.null_bitmap), convert_buffer(red_arrow_array.offsets_buffer), convert_buffer(red_arrow_array.data_buffer)) + when ArrowFormat::FixedSizeBinaryType + type.build_array(red_arrow_array.size, + convert_buffer(red_arrow_array.null_bitmap), + convert_buffer(red_arrow_array.data_buffer)) + when ArrowFormat::VariableSizeListType + type.build_array(red_arrow_array.size, + convert_buffer(red_arrow_array.null_bitmap), + convert_buffer(red_arrow_array.value_offsets_buffer), + convert_array(red_arrow_array.values_raw)) + when ArrowFormat::StructType + children = red_arrow_array.fields.collect do |red_arrow_field| + convert_array(red_arrow_field) + end + type.build_array(red_arrow_array.size, + convert_buffer(red_arrow_array.null_bitmap), + children) + when ArrowFormat::DenseUnionType + types_buffer = convert_buffer(red_arrow_array.type_ids.data_buffer) + offsets_buffer = convert_buffer(red_arrow_array.value_offsets.data_buffer) + children = red_arrow_array.fields.collect do |red_arrow_field| + convert_array(red_arrow_field) + end + type.build_array(red_arrow_array.size, + types_buffer, + offsets_buffer, + children) + when ArrowFormat::SparseUnionType + types_buffer = convert_buffer(red_arrow_array.type_ids.data_buffer) + children = red_arrow_array.fields.collect do |red_arrow_field| + convert_array(red_arrow_field) + end + type.build_array(red_arrow_array.size, + types_buffer, + children) + when ArrowFormat::DictionaryType + validity_buffer = convert_buffer(red_arrow_array.null_bitmap) + indices_buffer = convert_buffer(red_arrow_array.indices.data_buffer) + dictionary = convert_array(red_arrow_array.dictionary) + type.build_array(red_arrow_array.size, + validity_buffer, + indices_buffer, + [dictionary]) else raise "Unsupported array #{red_arrow_array.inspect}" end end - class << 
self - def included(base) - base.class_eval do - sub_test_case("Null") do - def build_array - Arrow::NullArray.new(3) - end + def write(writer, *inputs) + inputs.each_with_index do |input, i| + case input + when ArrowFormat::RecordBatch + record_batch = input + else + red_arrow_array = input + array = convert_array(red_arrow_array) + red_arrow_field = Arrow::Field.new("value", + red_arrow_array.value_data_type, + true) + fields = [convert_field(red_arrow_field)] + schema = ArrowFormat::Schema.new(fields) + record_batch = ArrowFormat::RecordBatch.new(schema, + array.size, + [array]) + end + writer.start(record_batch.schema) if i.zero? + writer.write_record_batch(record_batch) + end + writer.finish + end - def test_write - assert_equal([nil, nil, nil], - @values) - end - end + def roundtrip(*inputs) + Dir.mktmpdir do |tmp_dir| + path = File.join(tmp_dir, "data.#{file_extension}") + File.open(path, "wb") do |output| + writer = writer_class.new(output) + write(writer, *inputs) + end + # pp(read(path)) # debug + data = File.open(path, "rb", &:read).freeze + table = Arrow::Table.load(Arrow::Buffer.new(data), format: :arrow) + [table.value.data_type, table.value.values] + end + end +end - sub_test_case("Boolean") do - def build_array - Arrow::BooleanArray.new([true, nil, false]) - end +module WriterTests + def test_null + array = Arrow::NullArray.new(3) + type, values = roundtrip(array) + assert_equal(["null", [nil, nil, nil]], + [type.to_s, values]) + end - def test_write - assert_equal([true, nil, false], - @values) - end - end + def test_boolean + array = Arrow::BooleanArray.new([true, nil, false]) + type, values = roundtrip(array) + assert_equal(["bool", [true, nil, false]], + [type.to_s, values]) + end - sub_test_case("Int8") do - def build_array - Arrow::Int8Array.new([-128, nil, 127]) - end + def test_int8 + array = Arrow::Int8Array.new([-128, nil, 127]) + type, values = roundtrip(array) + assert_equal(["int8", [-128, nil, 127]], + [type.to_s, values]) + end - def 
test_write - assert_equal([-128, nil, 127], - @values) - end - end + def test_uint8 + array = Arrow::UInt8Array.new([0, nil, 255]) + type, values = roundtrip(array) + assert_equal(["uint8", [0, nil, 255]], + [type.to_s, values]) + end - sub_test_case("UInt8") do - def build_array - Arrow::UInt8Array.new([0, nil, 255]) - end + def test_int16 + array = Arrow::Int16Array.new([-32768, nil, 32767]) + type, values = roundtrip(array) + assert_equal(["int16", [-32768, nil, 32767]], + [type.to_s, values]) + end - def test_write - assert_equal([0, nil, 255], - @values) - end - end + def test_uint16 + array = Arrow::UInt16Array.new([0, nil, 65535]) + type, values = roundtrip(array) + assert_equal(["uint16", [0, nil, 65535]], + [type.to_s, values]) + end - sub_test_case("Int16") do - def build_array - Arrow::Int16Array.new([-32768, nil, 32767]) - end + def test_int32 + array = Arrow::Int32Array.new([-2147483648, nil, 2147483647]) + type, values = roundtrip(array) + assert_equal(["int32", [-2147483648, nil, 2147483647]], + [type.to_s, values]) + end - def test_write - assert_equal([-32768, nil, 32767], - @values) - end - end + def test_uint32 + array = Arrow::UInt32Array.new([0, nil, 4294967295]) + type, values = roundtrip(array) + assert_equal(["uint32", [0, nil, 4294967295]], + [type.to_s, values]) + end - sub_test_case("UInt16") do - def build_array - Arrow::UInt16Array.new([0, nil, 65535]) - end + def test_int64 + array = Arrow::Int64Array.new([ + -9223372036854775808, + nil, + 9223372036854775807 + ]) + type, values = roundtrip(array) + assert_equal([ + "int64", + [ + -9223372036854775808, + nil, + 9223372036854775807 + ], + ], + [type.to_s, values]) + end - def test_write - assert_equal([0, nil, 65535], - @values) - end - end + def test_uint64 + array = Arrow::UInt64Array.new([0, nil, 18446744073709551615]) + type, values = roundtrip(array) + assert_equal(["uint64", [0, nil, 18446744073709551615]], + [type.to_s, values]) + end - sub_test_case("Int32") do - def build_array 
- Arrow::Int32Array.new([-2147483648, nil, 2147483647]) - end + def test_float32 + array = Arrow::FloatArray.new([-0.5, nil, 0.5]) + type, values = roundtrip(array) + assert_equal(["float", [-0.5, nil, 0.5]], + [type.to_s, values]) + end - def test_write - assert_equal([-2147483648, nil, 2147483647], - @values) - end - end + def test_float64 + array = Arrow::DoubleArray.new([-0.5, nil, 0.5]) + type, values = roundtrip(array) + assert_equal(["double", [-0.5, nil, 0.5]], + [type.to_s, values]) + end - sub_test_case("UInt32") do - def build_array - Arrow::UInt32Array.new([0, nil, 4294967295]) - end + def test_date32 + date_2017_08_28 = 17406 + date_2025_12_09 = 20431 + array = Arrow::Date32Array.new([ + date_2017_08_28, + nil, + date_2025_12_09, + ]) + type, values = roundtrip(array) + assert_equal([ + "date32[day]", + [Date.new(2017, 8, 28), nil, Date.new(2025, 12, 9)], + ], + [type.to_s, values]) + end - def test_write - assert_equal([0, nil, 4294967295], - @values) - end - end + def test_date64 + date_2017_08_28_00_00_00 = 1503878400000 + date_2025_12_10_00_00_00 = 1765324800000 + array = Arrow::Date64Array.new([ + date_2017_08_28_00_00_00, + nil, + date_2025_12_10_00_00_00, + ]) + type, values = roundtrip(array) + assert_equal([ + "date64[ms]", + [ + DateTime.new(2017, 8, 28, 0, 0, 0), + nil, + DateTime.new(2025, 12, 10, 0, 0, 0), + ], + ], + [type.to_s, values]) + end - sub_test_case("Int64") do - def build_array - Arrow::Int64Array.new([ - -9223372036854775808, - nil, - 9223372036854775807 - ]) - end + def test_time32_second + time_00_00_10 = 10 + time_00_01_10 = 60 + 10 + array = Arrow::Time32Array.new(:second, + [time_00_00_10, nil, time_00_01_10]) + type, values = roundtrip(array) + assert_equal([ + "time32[s]", + [ + Arrow::Time.new(:second, time_00_00_10), + nil, + Arrow::Time.new(:second, time_00_01_10), + ], + ], + [type.to_s, values]) + end - def test_write - assert_equal([ - -9223372036854775808, - nil, - 9223372036854775807 - ], - @values) - end - end 
+ def test_time32_millisecond + time_00_00_10_000 = 10 * 1000 + time_00_01_10_000 = (60 + 10) * 1000 + array = Arrow::Time32Array.new(:milli, + [ + time_00_00_10_000, + nil, + time_00_01_10_000, + ]) + type, values = roundtrip(array) + assert_equal([ + "time32[ms]", + [ + Arrow::Time.new(:milli, time_00_00_10_000), + nil, + Arrow::Time.new(:milli, time_00_01_10_000), + ], + ], + [type.to_s, values]) + end - sub_test_case("UInt64") do - def build_array - Arrow::UInt64Array.new([0, nil, 18446744073709551615]) - end + def test_time64_microsecond + time_00_00_10_000_000 = 10 * 1_000_000 + time_00_01_10_000_000 = (60 + 10) * 1_000_000 + array = Arrow::Time64Array.new(:micro, + [ + time_00_00_10_000_000, + nil, + time_00_01_10_000_000, + ]) + type, values = roundtrip(array) + assert_equal([ + "time64[us]", + [ + Arrow::Time.new(:micro, time_00_00_10_000_000), + nil, + Arrow::Time.new(:micro, time_00_01_10_000_000), + ], + ], + [type.to_s, values]) + end - def test_write - assert_equal([0, nil, 18446744073709551615], - @values) - end - end + def test_time64_nanosecond + time_00_00_10_000_000_000 = 10 * 1_000_000_000 + time_00_01_10_000_000_000 = (60 + 10) * 1_000_000_000 + array = Arrow::Time64Array.new(:nano, + [ + time_00_00_10_000_000_000, + nil, + time_00_01_10_000_000_000, + ]) + type, values = roundtrip(array) + assert_equal([ + "time64[ns]", + [ + Arrow::Time.new(:nano, time_00_00_10_000_000_000), + nil, + Arrow::Time.new(:nano, time_00_01_10_000_000_000), + ], + ], + [type.to_s, values]) + end - sub_test_case("Float32") do - def build_array - Arrow::FloatArray.new([-0.5, nil, 0.5]) - end + def test_timestamp_second + timestamp_2019_11_17_15_09_11 = 1574003351 + timestamp_2025_12_16_05_33_58 = 1765863238 + array = Arrow::TimestampArray.new(:second, + [ + timestamp_2019_11_17_15_09_11, + nil, + timestamp_2025_12_16_05_33_58, + ]) + type, values = roundtrip(array) + assert_equal([ + "timestamp[s]", + [ + Time.at(timestamp_2019_11_17_15_09_11), + nil, + 
Time.at(timestamp_2025_12_16_05_33_58), + ], + ], + [type.to_s, values]) + end - def test_write - assert_equal([-0.5, nil, 0.5], - @values) - end - end + def test_timestamp_millisecond + timestamp_2019_11_17_15_09_11 = 1574003351 * 1_000 + timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000 + array = Arrow::TimestampArray.new(:milli, + [ + timestamp_2019_11_17_15_09_11, + nil, + timestamp_2025_12_16_05_33_58, + ]) + type, values = roundtrip(array) + assert_equal([ + "timestamp[ms]", + [ + Time.at(timestamp_2019_11_17_15_09_11 / 1_000), + nil, + Time.at(timestamp_2025_12_16_05_33_58 / 1_000), + ], + ], + [type.to_s, values]) + end - sub_test_case("Float64") do - def build_array - Arrow::DoubleArray.new([-0.5, nil, 0.5]) - end + def test_timestamp_microsecond + timestamp_2019_11_17_15_09_11 = 1574003351 * 1_000_000 + timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000_000 + array = Arrow::TimestampArray.new(:micro, + [ + timestamp_2019_11_17_15_09_11, + nil, + timestamp_2025_12_16_05_33_58, + ]) + type, values = roundtrip(array) + assert_equal([ + "timestamp[us]", + [ + Time.at(timestamp_2019_11_17_15_09_11 / 1_000_000), + nil, + Time.at(timestamp_2025_12_16_05_33_58 / 1_000_000), + ], + ], + [type.to_s, values]) + end - def test_write - assert_equal([-0.5, nil, 0.5], - @values) - end - end + def test_timestamp_nanosecond + timestamp_2019_11_17_15_09_11 = 1574003351 * 1_000_000_000 + timestamp_2025_12_16_05_33_58 = 1765863238 * 1_000_000_000 + array = Arrow::TimestampArray.new(:nano, + [ + timestamp_2019_11_17_15_09_11, + nil, + timestamp_2025_12_16_05_33_58, + ]) + type, values = roundtrip(array) + assert_equal([ + "timestamp[ns]", + [ + Time.at(timestamp_2019_11_17_15_09_11 / 1_000_000_000), + nil, + Time.at(timestamp_2025_12_16_05_33_58 / 1_000_000_000), + ], + ], + [type.to_s, values]) + end - sub_test_case("Binary") do - def build_array - Arrow::BinaryArray.new(["Hello".b, nil, "World".b]) - end + def test_timestamp_time_zone + time_zone = "UTC" + 
timestamp_2019_11_17_15_09_11 = 1574003351 + timestamp_2025_12_16_05_33_58 = 1765863238 + data_type = Arrow::TimestampDataType.new(:second, time_zone) + array = Arrow::TimestampArray.new(data_type, + [ + timestamp_2019_11_17_15_09_11, + nil, + timestamp_2025_12_16_05_33_58, + ]) + type, values = roundtrip(array) + assert_equal([ + "timestamp[s, tz=#{time_zone}]", + [ + Time.at(timestamp_2019_11_17_15_09_11), + nil, + Time.at(timestamp_2025_12_16_05_33_58), + ], + ], + [type.to_s, values]) + end - def test_write - assert_equal(["Hello".b, nil, "World".b], - @values) - end - end + def test_year_month_interval + array = Arrow::MonthIntervalArray.new([0, nil, 100]) + type, values = roundtrip(array) + assert_equal(["month_interval", [0, nil, 100]], + [type.to_s, values]) + end - sub_test_case("LargeBinary") do - def build_array - Arrow::LargeBinaryArray.new(["Hello".b, nil, "World".b]) - end + def test_day_time_interval + array = + Arrow::DayTimeIntervalArray.new([ + {day: 1, millisecond: 100}, + nil, + {day: 3, millisecond: 300}, + ]) + type, values = roundtrip(array) + assert_equal([ + "day_time_interval", + [ + {day: 1, millisecond: 100}, + nil, + {day: 3, millisecond: 300}, + ], + ], + [type.to_s, values]) + end - def test_write - assert_equal(["Hello".b, nil, "World".b], - @values) - end - end + def test_month_day_nano_interval + array = + Arrow::MonthDayNanoIntervalArray.new([ + { + month: 1, + day: 1, + nanosecond: 100, + }, + nil, + { + month: 3, + day: 3, + nanosecond: 300, + }, + ]) + type, values = roundtrip(array) + assert_equal([ + "month_day_nano_interval", + [ + { + month: 1, + day: 1, + nanosecond: 100, + }, + nil, + { + month: 3, + day: 3, + nanosecond: 300, + }, + ], + ], + [type.to_s, values]) + end - sub_test_case("String") do - def build_array - Arrow::StringArray.new(["Hello", nil, "World"]) - end + def test_duration_second + array = Arrow::DurationArray.new(:second, [0, nil, 100]) + type, values = roundtrip(array) + assert_equal(["duration[s]", 
[0, nil, 100]], + [type.to_s, values]) + end - def test_write - assert_equal(["Hello", nil, "World"], - @values) - end - end + def test_duration_millisecond + array = Arrow::DurationArray.new(:milli, [0, nil, 100]) + type, values = roundtrip(array) + assert_equal(["duration[ms]", [0, nil, 100]], + [type.to_s, values]) + end + + def test_duration_microsecond + array = Arrow::DurationArray.new(:micro, [0, nil, 100]) + type, values = roundtrip(array) + assert_equal(["duration[us]", [0, nil, 100]], + [type.to_s, values]) + end + + def test_duration_nanosecond + array = Arrow::DurationArray.new(:nano, [0, nil, 100]) + type, values = roundtrip(array) + assert_equal(["duration[ns]", [0, nil, 100]], + [type.to_s, values]) + end + + def test_binary + array = Arrow::BinaryArray.new(["Hello".b, nil, "World".b]) + type, values = roundtrip(array) + assert_equal(["binary", ["Hello".b, nil, "World".b]], + [type.to_s, values]) + end + + def test_large_binary + array = Arrow::LargeBinaryArray.new(["Hello".b, nil, "World".b]) + type, values = roundtrip(array) + assert_equal(["large_binary", ["Hello".b, nil, "World".b]], + [type.to_s, values]) + end + + def test_utf8 + array = Arrow::StringArray.new(["Hello", nil, "World"]) + type, values = roundtrip(array) + assert_equal(["string", ["Hello", nil, "World"]], + [type.to_s, values]) + end + + def test_large_utf8 + array = Arrow::LargeStringArray.new(["Hello", nil, "World"]) + type, values = roundtrip(array) + assert_equal(["large_string", ["Hello", nil, "World"]], + [type.to_s, values]) + end + + def test_fixed_size_binary + data_type = Arrow::FixedSizeBinaryDataType.new(4) + array = Arrow::FixedSizeBinaryArray.new(data_type, + ["0124".b, nil, "abcd".b]) + type, values = roundtrip(array) + assert_equal(["fixed_size_binary[4]", ["0124".b, nil, "abcd".b]], + [type.to_s, values]) + end + + def test_decimal128 + positive_small = "1.200" + positive_large = ("1234567890" * 3) + "12345.678" + negative_small = "-1.200" + negative_large = "-" + 
("1234567890" * 3) + "12345.678" + array = Arrow::Decimal128Array.new({precision: 38, scale: 3}, + [ + positive_large, + positive_small, + nil, + negative_small, + negative_large, + ]) + type, values = roundtrip(array) + assert_equal([ + "decimal128(38, 3)", + [ + BigDecimal(positive_large), + BigDecimal(positive_small), + nil, + BigDecimal(negative_small), + BigDecimal(negative_large), + ], + ], + [type.to_s, values]) + end + + def test_decimal256 + positive_small = "1.200" + positive_large = ("1234567890" * 7) + "123.456" + negative_small = "-1.200" + negative_large = "-" + ("1234567890" * 7) + "123.456" + array = Arrow::Decimal256Array.new({precision: 76, scale: 3}, + [ + positive_large, + positive_small, + nil, + negative_small, + negative_large, + ]) + type, values = roundtrip(array) + assert_equal([ + "decimal256(76, 3)", + [ + BigDecimal(positive_large), + BigDecimal(positive_small), + nil, + BigDecimal(negative_small), + BigDecimal(negative_large), + ], + ], + [type.to_s, values]) + end + + def test_list + data_type = Arrow::ListDataType.new(name: "count", type: :int8) + array = Arrow::ListArray.new(data_type, + [[-128, 127], nil, [-1, 0, 1]]) + type, values = roundtrip(array) + assert_equal([ + "list", + [[-128, 127], nil, [-1, 0, 1]], + ], + [type.to_s, values]) + end + + def test_large_list + data_type = Arrow::LargeListDataType.new(name: "count", + type: :int8) + array = Arrow::LargeListArray.new(data_type, + [[-128, 127], nil, [-1, 0, 1]]) + type, values = roundtrip(array) + assert_equal([ + "large_list", + [[-128, 127], nil, [-1, 0, 1]], + ], + [type.to_s, values]) + end + + def test_map + data_type = Arrow::MapDataType.new(:string, :int8) + array = Arrow::MapArray.new(data_type, + [ + {"a" => -128, "b" => 127}, + nil, + {"c" => nil}, + ]) + type, values = roundtrip(array) + assert_equal([ + "map", + [ + {"a" => -128, "b" => 127}, + nil, + {"c" => nil}, + ], + ], + [type.to_s, values]) + end + + def test_struct + data_type = 
Arrow::StructDataType.new(count: :int8, + visible: :boolean) + array = Arrow::StructArray.new(data_type, + [[-128, nil], nil, [nil, true]]) + type, values = roundtrip(array) + assert_equal([ + "struct", + [ + {"count" => -128, "visible" => nil}, + nil, + {"count" => nil, "visible" => true}, + ], + ], + [type.to_s, values]) + end + + def test_dense_union + fields = [ + Arrow::Field.new("number", :int8), + Arrow::Field.new("text", :string), + ] + type_ids = [11, 13] + data_type = Arrow::DenseUnionDataType.new(fields, type_ids) + types = Arrow::Int8Array.new([11, 13, 11, 13, 13]) + value_offsets = Arrow::Int32Array.new([0, 0, 1, 1, 2]) + children = [ + Arrow::Int8Array.new([1, nil]), + Arrow::StringArray.new(["a", "b", "c"]) + ] + array = Arrow::DenseUnionArray.new(data_type, + types, + value_offsets, + children) + type, values = roundtrip(array) + assert_equal([ + "dense_union", + [1, "a", nil, "b", "c"], + ], + [type.to_s, values]) + end + + def test_sparse_union + fields = [ + Arrow::Field.new("number", :int8), + Arrow::Field.new("text", :string), + ] + type_ids = [11, 13] + data_type = Arrow::SparseUnionDataType.new(fields, type_ids) + types = Arrow::Int8Array.new([11, 13, 11, 13, 11]) + children = [ + Arrow::Int8Array.new([1, nil, nil, nil, 5]), + Arrow::StringArray.new([nil, "b", nil, "d", nil]) + ] + array = Arrow::SparseUnionArray.new(data_type, types, children) + type, values = roundtrip(array) + assert_equal([ + "sparse_union", + [1, "b", nil, "d", 5], + ], + [type.to_s, values]) + end + + def test_dictionary + values = ["a", "b", "c", nil, "a"] + string_array = Arrow::StringArray.new(values) + array = string_array.dictionary_encode + type, values = roundtrip(array) + assert_equal([ + "dictionary", + ["a", "b", "c", nil, "a"], + ], + [type.to_s, values]) + end +end + +module WriterDictionaryDeltaTests + def build_schema(value_type) + index_type = ArrowFormat::Int32Type.singleton + ordered = false + type = ArrowFormat::DictionaryType.new(index_type, + 
value_type, + ordered) + nullable = true + dictionary_id = 1 + field = ArrowFormat::Field.new("value", + type, + nullable, + dictionary_id) + ArrowFormat::Schema.new([field]) + end + + def build_dictionary_array(type, indices, dictionaries) + indices_buffer = IO::Buffer.for(indices.pack("l<*")) + ArrowFormat::DictionaryArray.new(type, + indices.size, + nil, + indices_buffer, + dictionaries) + end + + def build_record_batches(red_arrow_value_type, values1, values2) + value_type = convert_type(red_arrow_value_type) + schema = build_schema(value_type) + type = schema.fields[0].type + + # The first record batch with new dictionary. + raw_dictionary = values1.uniq + red_arrow_dictionary = + red_arrow_value_type.build_array(raw_dictionary) + dictionary = convert_array(red_arrow_dictionary) + indices1 = values1.collect do |value| + raw_dictionary.index(value) + end + array1 = build_dictionary_array(type, indices1, [dictionary]) + record_batch = + ArrowFormat::RecordBatch.new(schema, array1.size, [array1]) + + if chunked_dictionaries? + # The second record batch with the first dictionary and + # a delta dictionary. + raw_dictionary_delta = (values2.uniq - raw_dictionary) + raw_dictionary_more = raw_dictionary + raw_dictionary_delta + red_arrow_dictionary_delta = + red_arrow_value_type.build_array(raw_dictionary_delta) + dictionary_delta = convert_array(red_arrow_dictionary_delta) + indices2 = values2.collect do |value| + raw_dictionary_more.index(value) end + array2 = build_dictionary_array(type, + indices2, + [dictionary, dictionary_delta]) + else + # The second record batch with the combined dictionary. 
+ raw_dictionary_more = raw_dictionary | values2.uniq + red_arrow_dictionary_more = + red_arrow_value_type.build_array(raw_dictionary_more) + dictionary_more = convert_array(red_arrow_dictionary_more) + indices2 = values2.collect do |value| + raw_dictionary_more.index(value) + end + array2 = build_dictionary_array(type, + indices2, + [dictionary_more]) end + record_batch_delta = + ArrowFormat::RecordBatch.new(schema, array2.size, [array2]) + + [record_batch, record_batch_delta] + end + + def roundtrip(value_type, values1, values2) + r = build_record_batches(value_type, values1, values2) + GC.start + super(*r) + end + + def test_boolean + value_type = Arrow::BooleanDataType.new + values1 = [true, true] + values2 = [false, true, false] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_int8 + value_type = Arrow::Int8DataType.new + values1 = [-128, 0, -128] + values2 = [127, -128, 0, 127] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_uint8 + value_type = Arrow::UInt8DataType.new + values1 = [1, 0, 1] + values2 = [255, 0, 1, 255] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_int16 + value_type = Arrow::Int16DataType.new + values1 = [-32768, 0, -32768] + values2 = [32767, -32768, 0, 32767] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_uint16 + value_type = Arrow::UInt16DataType.new + values1 = [1, 0, 1] + values2 = [65535, 0, 1, 65535] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_int32 + value_type = Arrow::Int32DataType.new 
+ values1 = [-2147483648, 0, -2147483648] + values2 = [2147483647, -2147483648, 0, 2147483647] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_uint32 + value_type = Arrow::UInt32DataType.new + values1 = [1, 0, 1] + values2 = [4294967295, 0, 1, 4294967295] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_int64 + value_type = Arrow::Int64DataType.new + values1 = [ + -9223372036854775808, + 0, + -9223372036854775808, + ] + values2 = [ + 9223372036854775807, + -9223372036854775808, + 0, + 9223372036854775807, + ] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_uint64 + value_type = Arrow::UInt64DataType.new + values1 = [1, 0, 1] + values2 = [ + 18446744073709551615, + 0, + 1, + 18446744073709551615, + ] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_float32 + value_type = Arrow::FloatDataType.new + values1 = [-0.5, 0.0, -0.5] + values2 = [0.5, -0.5, 0.0, 0.5] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_float64 + value_type = Arrow::DoubleDataType.new + values1 = [-0.5, 0.0, -0.5] + values2 = [0.5, -0.5, 0.0, 0.5] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_date32 + date_2017_08_28 = 17406 + date_2025_12_09 = 20431 + value_type = Arrow::Date32DataType.new + values1 = [date_2017_08_28, date_2017_08_28] + values2 = [date_2025_12_09, date_2017_08_28, date_2025_12_09] + type, values = roundtrip(value_type, values1, 
values2) + assert_equal([ + "dictionary", + [ + Date.new(2017, 8, 28), + Date.new(2017, 8, 28), + Date.new(2025, 12, 9), + Date.new(2017, 8, 28), + Date.new(2025, 12, 9), + ], + ], + [type.to_s, values]) + end + + def test_date64 + date_2017_08_28_00_00_00 = 1503878400000 + date_2025_12_10_00_00_00 = 1765324800000 + value_type = Arrow::Date64DataType.new + values1 = [date_2017_08_28_00_00_00, date_2017_08_28_00_00_00] + values2 = [ + date_2025_12_10_00_00_00, + date_2017_08_28_00_00_00, + date_2025_12_10_00_00_00, + ] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + [ + DateTime.new(2017, 8, 28), + DateTime.new(2017, 8, 28), + DateTime.new(2025, 12, 10), + DateTime.new(2017, 8, 28), + DateTime.new(2025, 12, 10), + ], + ], + [type.to_s, values]) + end + + def test_time32 + time_00_00_10 = 10 + time_00_01_10 = 60 + 10 + value_type = Arrow::Time32DataType.new(:second) + values1 = [time_00_00_10, time_00_00_10] + values2 = [time_00_01_10, time_00_00_10, time_00_01_10] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + [ + Arrow::Time.new(:second, time_00_00_10), + Arrow::Time.new(:second, time_00_00_10), + Arrow::Time.new(:second, time_00_01_10), + Arrow::Time.new(:second, time_00_00_10), + Arrow::Time.new(:second, time_00_01_10), + ], + ], + [type.to_s, values]) + end + + def test_time64 + time_00_00_10_000_000 = 10 * 1_000_000 + time_00_01_10_000_000 = (60 + 10) * 1_000_000 + value_type = Arrow::Time64DataType.new(:micro) + values1 = [time_00_00_10_000_000, time_00_00_10_000_000] + values2 = [ + time_00_01_10_000_000, + time_00_00_10_000_000, + time_00_01_10_000_000, + ] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + [ + Arrow::Time.new(:micro, time_00_00_10_000_000), + Arrow::Time.new(:micro, time_00_00_10_000_000), + Arrow::Time.new(:micro, time_00_01_10_000_000), + Arrow::Time.new(:micro, time_00_00_10_000_000), + Arrow::Time.new(:micro, 
time_00_01_10_000_000), + ], + ], + [type.to_s, values]) + end + + def test_timestamp + timestamp_2019_11_17_15_09_11 = 1574003351 + timestamp_2025_12_16_05_33_58 = 1765863238 + value_type = Arrow::TimestampDataType.new(:second) + values1 = [ + timestamp_2019_11_17_15_09_11, + timestamp_2019_11_17_15_09_11, + ] + values2 = [ + timestamp_2025_12_16_05_33_58, + timestamp_2019_11_17_15_09_11, + timestamp_2025_12_16_05_33_58, + ] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + [ + Time.at(timestamp_2019_11_17_15_09_11), + Time.at(timestamp_2019_11_17_15_09_11), + Time.at(timestamp_2025_12_16_05_33_58), + Time.at(timestamp_2019_11_17_15_09_11), + Time.at(timestamp_2025_12_16_05_33_58), + ], + ], + [type.to_s, values]) + end + + def test_year_month_interval + value_type = Arrow::MonthIntervalDataType.new + values1 = [100, 0, 100] + values2 = [1000, 100, 0, 1000] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_day_time_interval + value_type = Arrow::DayTimeIntervalDataType.new + values1 = [ + {day: 1, millisecond: 100}, + {day: 1, millisecond: 100}, + ] + values2 = [ + {day: 3, millisecond: 300}, + {day: 1, millisecond: 100}, + {day: 3, millisecond: 300}, + ] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_month_day_nano_interval + value_type = Arrow::MonthDayNanoIntervalDataType.new + values1 = [ + {month: 1, day: 1, nanosecond: 100}, + {month: 1, day: 1, nanosecond: 100}, + ] + values2 = [ + {month: 3, day: 3, nanosecond: 300}, + {month: 1, day: 1, nanosecond: 100}, + {month: 3, day: 3, nanosecond: 300}, + ] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_duration + value_type = 
Arrow::DurationDataType.new(:second) + values1 = [100, 0, 100] + values2 = [1000, 100, 0, 1000] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_binary + value_type = Arrow::BinaryDataType.new + values1 = ["ab".b, "c".b, "ab".b] + values2 = ["c".b, "de".b, "ab".b, "de".b] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_large_binary + value_type = Arrow::LargeBinaryDataType.new + values1 = ["ab".b, "c".b, "ab".b] + values2 = ["c".b, "de".b, "ab".b, "de".b] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_utf8 + value_type = Arrow::StringDataType.new + values1 = ["ab", "c", "ab"] + values2 = ["c", "de", "ab", "de"] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_large_utf8 + value_type = Arrow::LargeStringDataType.new + values1 = ["ab", "c", "ab"] + values2 = ["c", "de", "ab", "de"] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_fixed_size_binary + value_type = Arrow::FixedSizeBinaryDataType.new(2) + values1 = ["ab".b, "cd".b, "ab".b] + values2 = ["ef".b, "cd".b, "ab".b, "ef".b] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + values1 + values2, + ], + [type.to_s, values]) + end + + def test_decimal128 + positive_small = "1.200" + positive_large = ("1234567890" * 3) + "12345.678" + negative_small = "-1.200" + negative_large = "-" + ("1234567890" * 3) + "12345.678" + value_type = Arrow::Decimal128DataType.new(precision: 38, + scale: 3) + values1 = [positive_small, negative_small, 
positive_small] + values2 = [ + positive_large, + positive_small, + negative_small, + positive_large, + negative_large, + ] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + (values1 + values2).collect {|v| BigDecimal(v)}, + ], + [type.to_s, values]) + end + + def test_decimal256 + positive_small = "1.200" + positive_large = ("1234567890" * 7) + "123.456" + negative_small = "-1.200" + negative_large = "-" + ("1234567890" * 7) + "123.456" + value_type = Arrow::Decimal256DataType.new(precision: 76, + scale: 3) + values1 = [positive_small, negative_small, positive_small] + values2 = [ + positive_large, + positive_small, + negative_small, + positive_large, + negative_large, + ] + type, values = roundtrip(value_type, values1, values2) + assert_equal([ + "dictionary", + (values1 + values2).collect {|v| BigDecimal(v)}, + ], + [type.to_s, values]) end end class TestFileWriter < Test::Unit::TestCase - include WriterTests + include WriterHelper - def setup - Dir.mktmpdir do |tmp_dir| - path = File.join(tmp_dir, "data.arrow") - File.open(path, "wb") do |output| - writer = ArrowFormat::FileWriter.new(output) - red_arrow_array = build_array - array = convert_array(red_arrow_array) - fields = [ - ArrowFormat::Field.new("value", - array.type, - true, - nil), - ] - schema = ArrowFormat::Schema.new(fields) - record_batch = ArrowFormat::RecordBatch.new(schema, array.size, [array]) - writer.start(schema) - writer.write_record_batch(record_batch) - writer.finish + def file_extension + "arrow" + end + + def writer_class + ArrowFormat::FileWriter + end + + def read(path) + File.open(path, "rb") do |input| + reader = ArrowFormat::FileReader.new(input) + reader.to_a.collect do |record_batch| + record_batch.to_h.tap do |hash| + hash.each do |key, value| + hash[key] = value.to_a + end + end end - data = File.open(path, "rb", &:read).freeze - table = Arrow::Table.load(Arrow::Buffer.new(data), format: :arrow) - @values = table.value.values + end + end 
+ + sub_test_case("Basic") do + include WriterTests + end + + sub_test_case("Dictionary: delta") do + include WriterDictionaryDeltaTests + + def chunked_dictionaries? + true + end + end + + sub_test_case("Dictionary: delta: slice") do + include WriterDictionaryDeltaTests + + def chunked_dictionaries? + false end end end class TestStreamingWriter < Test::Unit::TestCase - include WriterTests + include WriterHelper - def setup - Dir.mktmpdir do |tmp_dir| - path = File.join(tmp_dir, "data.arrows") - File.open(path, "wb") do |output| - writer = ArrowFormat::StreamingWriter.new(output) - red_arrow_array = build_array - array = convert_array(red_arrow_array) - fields = [ - ArrowFormat::Field.new("value", - array.type, - true, - nil), - ] - schema = ArrowFormat::Schema.new(fields) - record_batch = ArrowFormat::RecordBatch.new(schema, array.size, [array]) - writer.start(schema) - writer.write_record_batch(record_batch) - writer.finish + def file_extension + "arrows" + end + + def writer_class + ArrowFormat::StreamingWriter + end + + def read(path) + File.open(path, "rb") do |input| + reader = ArrowFormat::StreamingReader.new(input) + reader.collect do |record_batch| + record_batch.to_h.tap do |hash| + hash.each do |key, value| + hash[key] = value.to_a + end + end end - data = File.open(path, "rb", &:read).freeze - table = Arrow::Table.load(Arrow::Buffer.new(data), format: :arrows) - @values = table.value.values + end + end + + sub_test_case("Basic") do + include WriterTests + end + + sub_test_case("Dictionary: delta") do + include WriterDictionaryDeltaTests + + def chunked_dictionaries? + true + end + end + + sub_test_case("Dictionary: delta: slice") do + include WriterDictionaryDeltaTests + + def chunked_dictionaries? 
+ false end end end diff --git a/ruby/red-arrow/ext/arrow/arrow.cpp b/ruby/red-arrow/ext/arrow/arrow.cpp index 0c582d070772..d563ce8c4bfc 100644 --- a/ruby/red-arrow/ext/arrow/arrow.cpp +++ b/ruby/red-arrow/ext/arrow/arrow.cpp @@ -63,6 +63,36 @@ namespace red_arrow { rbgobj_gc_mark_instance(node->data); } } + + void + call_expression_mark(gpointer object) + { + auto expression = GARROW_CALL_EXPRESSION(object); + auto arguments = garrow_call_expression_get_arguments(expression); + for (auto argument = arguments; argument; argument = g_list_next(argument)) { + rbgobj_gc_mark_instance(argument->data); + } + } + + void + aggregate_node_options_mark(gpointer object) + { + auto options = GARROW_AGGREGATE_NODE_OPTIONS(object); + auto aggregations = garrow_aggregate_node_options_get_aggregations(options); + for (auto aggregation = aggregations; aggregation; aggregation = g_list_next(aggregation)) { + rbgobj_gc_mark_instance(aggregation->data); + } + } + + void + project_node_options_mark(gpointer object) + { + auto options = GARROW_PROJECT_NODE_OPTIONS(object); + auto expressions = garrow_project_node_options_get_expressions(options); + for (auto expression = expressions; expression; expression = g_list_next(expression)) { + rbgobj_gc_mark_instance(expression->data); + } + } } extern "C" void Init_arrow() { @@ -124,4 +154,10 @@ extern "C" void Init_arrow() { red_arrow::record_batch_reader_mark); rbgobj_register_mark_func(GARROW_TYPE_EXECUTE_PLAN, red_arrow::execute_plan_mark); + rbgobj_register_mark_func(GARROW_TYPE_CALL_EXPRESSION, + red_arrow::call_expression_mark); + rbgobj_register_mark_func(GARROW_TYPE_AGGREGATE_NODE_OPTIONS, + red_arrow::aggregate_node_options_mark); + rbgobj_register_mark_func(GARROW_TYPE_PROJECT_NODE_OPTIONS, + red_arrow::project_node_options_mark); } diff --git a/ruby/red-arrow/ext/arrow/converters.hpp b/ruby/red-arrow/ext/arrow/converters.hpp index 9525700eba9b..099aa916863b 100644 --- a/ruby/red-arrow/ext/arrow/converters.hpp +++ 
b/ruby/red-arrow/ext/arrow/converters.hpp @@ -175,6 +175,14 @@ namespace red_arrow { length); } + inline VALUE convert(const arrow::LargeStringArray& array, + const int64_t i) { + int64_t length; + const auto value = array.GetValue(i, &length); + return rb_utf8_str_new(reinterpret_cast(value), + length); + } + inline VALUE convert(const arrow::FixedSizeBinaryArray& array, const int64_t i) { return rb_enc_str_new(reinterpret_cast(array.Value(i)), @@ -233,11 +241,6 @@ namespace red_arrow { return rb_time_num_new(sec, Qnil); } - // TODO - // inline VALUE convert(const arrow::IntervalArray& array, - // const int64_t i) { - // }; - inline VALUE convert(const arrow::MonthIntervalArray& array, const int64_t i) { return INT2NUM(array.Value(i)); @@ -272,6 +275,11 @@ namespace red_arrow { return value; } + inline VALUE convert(const arrow::DurationArray& array, + const int64_t i) { + return LL2NUM(array.Value(i)); + } + VALUE convert(const arrow::ListArray& array, const int64_t i); @@ -374,6 +382,7 @@ namespace red_arrow { VISIT(MonthInterval) VISIT(DayTimeInterval) VISIT(MonthDayNanoInterval) + VISIT(Duration) VISIT(List) VISIT(LargeList) VISIT(Struct) @@ -473,6 +482,7 @@ namespace red_arrow { VISIT(MonthInterval) VISIT(DayTimeInterval) VISIT(MonthDayNanoInterval) + VISIT(Duration) VISIT(List) VISIT(LargeList) VISIT(Struct) @@ -580,6 +590,7 @@ namespace red_arrow { VISIT(MonthInterval) VISIT(DayTimeInterval) VISIT(MonthDayNanoInterval) + VISIT(Duration) VISIT(List) VISIT(LargeList) VISIT(Struct) @@ -683,6 +694,7 @@ namespace red_arrow { VISIT(MonthInterval) VISIT(DayTimeInterval) VISIT(MonthDayNanoInterval) + VISIT(Duration) VISIT(List) VISIT(LargeList) VISIT(Struct) @@ -787,6 +799,7 @@ namespace red_arrow { VISIT(MonthInterval) VISIT(DayTimeInterval) VISIT(MonthDayNanoInterval) + VISIT(Duration) VISIT(List) VISIT(LargeList) VISIT(Struct) @@ -889,7 +902,9 @@ namespace red_arrow { VISIT(Float) VISIT(Double) VISIT(Binary) + VISIT(LargeBinary) VISIT(String) + 
VISIT(LargeString) VISIT(FixedSizeBinary) VISIT(Date32) VISIT(Date64) @@ -899,6 +914,7 @@ namespace red_arrow { VISIT(MonthInterval) VISIT(DayTimeInterval) VISIT(MonthDayNanoInterval) + VISIT(Duration) VISIT(List) VISIT(LargeList) VISIT(Struct) diff --git a/ruby/red-arrow/ext/arrow/raw-records.cpp b/ruby/red-arrow/ext/arrow/raw-records.cpp index 25a95379efca..7f643bad4130 100644 --- a/ruby/red-arrow/ext/arrow/raw-records.cpp +++ b/ruby/red-arrow/ext/arrow/raw-records.cpp @@ -90,6 +90,7 @@ namespace red_arrow { VISIT(Binary) VISIT(LargeBinary) VISIT(String) + VISIT(LargeString) VISIT(FixedSizeBinary) VISIT(Date32) VISIT(Date64) @@ -99,6 +100,7 @@ namespace red_arrow { VISIT(MonthInterval) VISIT(DayTimeInterval) VISIT(MonthDayNanoInterval) + VISIT(Duration) VISIT(List) VISIT(Struct) VISIT(Map) @@ -227,6 +229,7 @@ namespace red_arrow { VISIT(Binary) VISIT(LargeBinary) VISIT(String) + VISIT(LargeString) VISIT(FixedSizeBinary) VISIT(Date32) VISIT(Date64) @@ -236,6 +239,7 @@ namespace red_arrow { VISIT(MonthInterval) VISIT(DayTimeInterval) VISIT(MonthDayNanoInterval) + VISIT(Duration) VISIT(List) VISIT(Struct) VISIT(Map) diff --git a/ruby/red-arrow/ext/arrow/values.cpp b/ruby/red-arrow/ext/arrow/values.cpp index 783cdb3d7d3a..0296f27398d8 100644 --- a/ruby/red-arrow/ext/arrow/values.cpp +++ b/ruby/red-arrow/ext/arrow/values.cpp @@ -71,6 +71,7 @@ namespace red_arrow { VISIT(Binary) VISIT(LargeBinary) VISIT(String) + VISIT(LargeString) VISIT(FixedSizeBinary) VISIT(Date32) VISIT(Date64) @@ -80,6 +81,7 @@ namespace red_arrow { VISIT(MonthInterval) VISIT(DayTimeInterval) VISIT(MonthDayNanoInterval) + VISIT(Duration) VISIT(List) VISIT(LargeList) VISIT(Struct) diff --git a/ruby/red-arrow/lib/arrow/dense-union-array.rb b/ruby/red-arrow/lib/arrow/dense-union-array.rb index 07b2bbfce68a..eb8bab0fa67f 100644 --- a/ruby/red-arrow/lib/arrow/dense-union-array.rb +++ b/ruby/red-arrow/lib/arrow/dense-union-array.rb @@ -19,7 +19,7 @@ module Arrow class DenseUnionArray def get_value(i) 
child_id = get_child_id(i) - field = get_field(child_id) + field = fields[child_id] field[get_value_offset(i)] end end diff --git a/ruby/red-arrow/lib/arrow/libraries.rb b/ruby/red-arrow/lib/arrow/libraries.rb index 52cc1ceb294d..a29a5588bbb5 100644 --- a/ruby/red-arrow/lib/arrow/libraries.rb +++ b/ruby/red-arrow/lib/arrow/libraries.rb @@ -134,5 +134,6 @@ require_relative "timestamp-array-builder" require_relative "timestamp-data-type" require_relative "timestamp-parser" +require_relative "union-array" require_relative "union-array-builder" require_relative "writable" diff --git a/ruby/red-arrow/lib/arrow/sparse-union-array.rb b/ruby/red-arrow/lib/arrow/sparse-union-array.rb index 783493f6b636..084001a05822 100644 --- a/ruby/red-arrow/lib/arrow/sparse-union-array.rb +++ b/ruby/red-arrow/lib/arrow/sparse-union-array.rb @@ -19,7 +19,7 @@ module Arrow class SparseUnionArray def get_value(i) child_id = get_child_id(i) - field = get_field(child_id) + field = fields[child_id] field[i] end end diff --git a/python/asv-uninstall.sh b/ruby/red-arrow/lib/arrow/union-array.rb old mode 100755 new mode 100644 similarity index 84% rename from python/asv-uninstall.sh rename to ruby/red-arrow/lib/arrow/union-array.rb index beef730b7b8c..a316dd38f1cb --- a/python/asv-uninstall.sh +++ b/ruby/red-arrow/lib/arrow/union-array.rb @@ -1,5 +1,3 @@ -#!/usr/bin/env bash - # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,5 +15,12 @@ # specific language governing permissions and limitations # under the License. -# Deliberately empty, but exists so that we don't have to change -# asv.conf.json if we need specific commands here. 
+module Arrow + class UnionArray + def fields + @fields ||= n_fields.times.collect do |i| + get_field(i) + end + end + end +end diff --git a/ruby/red-arrow/red-arrow.gemspec b/ruby/red-arrow/red-arrow.gemspec index 2487568b1de1..51c42a62b361 100644 --- a/ruby/red-arrow/red-arrow.gemspec +++ b/ruby/red-arrow/red-arrow.gemspec @@ -58,7 +58,7 @@ Gem::Specification.new do |spec| spec.requirements << "jar org.apache.arrow, arrow-vector, #{spec.version}" spec.requirements << "jar org.apache.arrow, arrow-memory-netty, #{spec.version}" else - spec.add_runtime_dependency("extpp", ">= 0.1.1") + spec.add_runtime_dependency("extpp", ">= 0.1.2") spec.add_runtime_dependency("gio2", ">= 4.2.3") spec.add_runtime_dependency("pkg-config") @@ -98,6 +98,8 @@ Gem::Specification.new do |spec| ["fedora", "libarrow-glib-devel"], + ["homebrew", "apache-arrow-glib"], + # Try without additional repository ["rhel", "arrow-glib-devel"], # Retry with additional repository diff --git a/ruby/red-arrow/test/raw-records/test-basic-arrays.rb b/ruby/red-arrow/test/raw-records/test-basic-arrays.rb index f09b2e8b7142..7a6e6115d6ba 100644 --- a/ruby/red-arrow/test/raw-records/test-basic-arrays.rb +++ b/ruby/red-arrow/test/raw-records/test-basic-arrays.rb @@ -177,6 +177,16 @@ def test_string assert_equal(records, actual_records(target)) end + def test_large_string + records = [ + ["Ruby"], + [nil], + ["\u3042"], # U+3042 HIRAGANA LETTER A + ] + target = build({column: :large_string}, records) + assert_equal(records, actual_records(target)) + end + def test_date32 records = [ [Date.new(1960, 1, 1)], @@ -396,6 +406,46 @@ def test_month_day_nano_interval target = build({column: :month_day_nano_interval}, records) assert_equal(records, actual_records(target)) end + + def test_duration_second + records = [ + [0], + [nil], + [100], + ] + target = build({column: {type: :duration, unit: :second}}, records) + assert_equal(records, actual_records(target)) + end + + def test_duration_milli + records = [ + [0], + 
[nil], + [100], + ] + target = build({column: {type: :duration, unit: :milli}}, records) + assert_equal(records, actual_records(target)) + end + + def test_duration_micro + records = [ + [0], + [nil], + [100], + ] + target = build({column: {type: :duration, unit: :micro}}, records) + assert_equal(records, actual_records(target)) + end + + def test_duration_nano + records = [ + [0], + [nil], + [100], + ] + target = build({column: {type: :duration, unit: :nano}}, records) + assert_equal(records, actual_records(target)) + end end class EachRawRecordRecordBatchBasicArraysTest < Test::Unit::TestCase diff --git a/ruby/red-arrow/test/raw-records/test-dictionary-array.rb b/ruby/red-arrow/test/raw-records/test-dictionary-array.rb index 09d472b215ab..2a4966316a42 100644 --- a/ruby/red-arrow/test/raw-records/test-dictionary-array.rb +++ b/ruby/red-arrow/test/raw-records/test-dictionary-array.rb @@ -153,6 +153,16 @@ def test_binary assert_equal(records, actual_records(target)) end + def test_large_binary + records = [ + ["\x00".b], + [nil], + ["\xff".b], + ] + target = build(Arrow::LargeBinaryArray.new(records.collect(&:first))) + assert_equal(records, actual_records(target)) + end + def test_string records = [ ["Ruby"], @@ -163,6 +173,16 @@ def test_string assert_equal(records, actual_records(target)) end + def test_large_string + records = [ + ["Ruby"], + [nil], + ["\u3042"], # U+3042 HIRAGANA LETTER A + ] + target = build(Arrow::LargeStringArray.new(records.collect(&:first))) + assert_equal(records, actual_records(target)) + end + def test_date32 records = [ [Date.new(1960, 1, 1)], diff --git a/ruby/red-arrow/test/values/test-basic-arrays.rb b/ruby/red-arrow/test/values/test-basic-arrays.rb index ed96a61bd072..b3c8e18172d9 100644 --- a/ruby/red-arrow/test/values/test-basic-arrays.rb +++ b/ruby/red-arrow/test/values/test-basic-arrays.rb @@ -167,6 +167,16 @@ def test_string assert_equal(values, target.values) end + def test_large_string + values = [ + "Ruby", + nil, + 
"\u3042", # U+3042 HIRAGANA LETTER A + ] + target = build(Arrow::LargeStringArray.new(values)) + assert_equal(values, target.values) + end + def test_date32 values = [ Date.new(1960, 1, 1), @@ -326,6 +336,46 @@ def test_month_day_nano_interval target = build(Arrow::MonthDayNanoIntervalArray.new(values)) assert_equal(values, target.values) end + + def test_duration_second + values = [ + 0, + nil, + 100, + ] + target = build(Arrow::DurationArray.new(:second, values)) + assert_equal(values, target.values) + end + + def test_duration_milli + values = [ + 0, + nil, + 100, + ] + target = build(Arrow::DurationArray.new(:milli, values)) + assert_equal(values, target.values) + end + + def test_duration_micro + values = [ + 0, + nil, + 100, + ] + target = build(Arrow::DurationArray.new(:micro, values)) + assert_equal(values, target.values) + end + + def test_duration_nano + values = [ + 0, + nil, + 100, + ] + target = build(Arrow::DurationArray.new(:nano, values)) + assert_equal(values, target.values) + end end class ValuesArrayBasicArraysTest < Test::Unit::TestCase diff --git a/ruby/red-arrow/test/values/test-dictionary-array.rb b/ruby/red-arrow/test/values/test-dictionary-array.rb index 115656b7d761..f06c0427fc15 100644 --- a/ruby/red-arrow/test/values/test-dictionary-array.rb +++ b/ruby/red-arrow/test/values/test-dictionary-array.rb @@ -137,6 +137,16 @@ def test_binary assert_equal(values, target.values) end + def test_large_binary + values = [ + "\x00".b, + nil, + "\xff".b, + ] + target = build(Arrow::LargeBinaryArray.new(values)) + assert_equal(values, target.values) + end + def test_string values = [ "Ruby", @@ -147,6 +157,16 @@ def test_string assert_equal(values, target.values) end + def test_large_string + values = [ + "Ruby", + nil, + "\u3042", # U+3042 HIRAGANA LETTER A + ] + target = build(Arrow::LargeStringArray.new(values)) + assert_equal(values, target.values) + end + def test_date32 values = [ Date.new(1960, 1, 1), diff --git a/task_list.json b/task_list.json 
new file mode 100644 index 000000000000..5f9a768924bd --- /dev/null +++ b/task_list.json @@ -0,0 +1,765 @@ +[ + { + "id": 0, + "phase": "Prerequisites", + "task": "Extend ORC adapter with column statistics APIs", + "description": "CRITICAL PREREQUISITE: The current ORC adapter lacks APIs to access stripe-level column statistics. Must add: (1) ColumnStatistics struct with has_null, num_values, min/max, is_deprecated, (2) GetStripeColumnStatistics(stripe, column) method, (3) Access to ORC type tree for column index mapping. This blocks all predicate pushdown work.", + "files_to_modify": [ + "cpp/src/arrow/adapters/orc/adapter.h", + "cpp/src/arrow/adapters/orc/adapter.cc" + ], + "parquet_reference": null, + "verification": [ + "cmake --build . --target arrow_orc", + "Unit test: GetStripeColumnStatistics returns valid statistics for int32/int64" + ], + "status": "pending", + "depends_on": [], + "priority": "P0" + }, + { + "id": 1, + "phase": "Core Data Structures", + "task": "Add OrcSchemaManifest and OrcSchemaField structures", + "description": "Create OrcSchemaManifest and OrcSchemaField classes to map Arrow schema fields to ORC physical column indices. Required for nested type support. Mirrors Parquet's SchemaManifest design.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.h", + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "parquet::arrow::SchemaManifest", + "file": "parquet/arrow/schema.h", + "notes": "Idea reuse - create ORC-specific version following same pattern" + }, + "verification": [ + "cmake --build . --target arrow_dataset", + "Unit test: Manifest construction from ORC metadata" + ], + "status": "pending", + "depends_on": [0], + "priority": "P0" + }, + { + "id": 2, + "phase": "Core Data Structures", + "task": "Implement BuildOrcSchemaManifest function", + "description": "Create function that builds schema manifest from ORC file metadata. Walk Arrow schema and ORC type tree in parallel, extract column indices. 
For leaf fields, store column_index for statistics lookup. For containers, mark as non-leaf.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "Schema manifest building", + "notes": "ORC type tree is depth-first pre-order (col 0 = root struct), differs from Parquet" + }, + "verification": [ + "Unit test: Manifest building for flat schemas", + "Unit test: Manifest building for nested schemas (struct, list, map)" + ], + "status": "pending", + "depends_on": [1], + "priority": "P0" + }, + { + "id": 3, + "phase": "Core Data Structures", + "task": "Implement GetOrcColumnIndex function", + "description": "Create function that resolves FieldRef to ORC column index using manifest. Handle top-level fields directly and nested fields by traversing manifest tree. Return null if not found or not a leaf.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "Column index resolution via manifest", + "notes": "Similar to Parquet but ORC indices start at 1 for user columns" + }, + "verification": [ + "Unit test: Column index resolution for top-level fields", + "Unit test: Column index resolution for nested fields" + ], + "status": "pending", + "depends_on": [2], + "priority": "P0" + }, + { + "id": 4, + "phase": "Core Data Structures", + "task": "Create OrcFileFragment class", + "description": "Extend FileFragment with ORC-specific predicate pushdown capabilities. Add fields: stripes (optional list of selected indices), metadata (OrcFileMetadata), manifest (OrcSchemaManifest), statistics_cache (StripeStatisticsCache), cache_status enum.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.h", + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "ParquetFileFragment", + "file": "cpp/src/arrow/dataset/file_parquet.h", + "lines": "158-235", + "notes": "Mirror structure: row_groups_ -> stripes_, statistics_expressions_ -> stripe_guarantees_, etc." 
+ }, + "verification": [ + "cmake --build . --target arrow_dataset", + "Unit test: OrcFileFragment construction" + ], + "status": "pending", + "depends_on": [1], + "priority": "P0" + }, + { + "id": 5, + "phase": "Core Data Structures", + "task": "Implement StripeStatisticsCache structure", + "description": "Create cache class with: stripe_guarantees (list of Expression per stripe), fields_processed (set tracking processed fields), statistics_complete (list of bool per column). Protected by mutex for thread safety.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "statistics_expressions_ and statistics_expressions_complete_", + "file": "cpp/src/arrow/dataset/file_parquet.h", + "lines": "224-227", + "notes": "Same caching pattern, different naming" + }, + "verification": [ + "Unit test: Cache initialization", + "Unit test: Thread-safe access" + ], + "status": "pending", + "depends_on": [4], + "priority": "P0" + }, + { + "id": 6, + "phase": "Metadata Loading", + "task": "Implement EnsureFileMetadataCached function", + "description": "Load ORC file metadata if not cached. Read footer containing stripe info, schema, writer version. Set physical_schema from metadata. Thread-safe with mutex.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "EnsureCompleteMetadata", + "file": "cpp/src/arrow/dataset/file_parquet.cc", + "lines": "802-870", + "notes": "Same lazy loading pattern with mutex protection" + }, + "verification": [ + "Unit test: Metadata loading", + "Unit test: Caching (second call doesn't reload)" + ], + "status": "pending", + "depends_on": [4], + "priority": "P0" + }, + { + "id": 7, + "phase": "Metadata Loading", + "task": "Implement EnsureManifestCached function", + "description": "Build and cache schema manifest if not done. Requires metadata first. Call BuildOrcSchemaManifest. 
Thread-safe.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "Manifest caching in SetMetadata", + "file": "cpp/src/arrow/dataset/file_parquet.cc", + "notes": "Part of EnsureCompleteMetadata flow" + }, + "verification": [ + "Unit test: Manifest caching", + "Unit test: Immutability once set" + ], + "status": "pending", + "depends_on": [2, 6], + "priority": "P0" + }, + { + "id": 8, + "phase": "Metadata Loading", + "task": "Implement EnsureStatisticsCached function", + "description": "Initialize statistics cache if not done. Create stripe_guarantees with literal(true) per stripe, empty fields_processed, statistics_complete all false. Idempotent.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "Statistics expressions initialization", + "notes": "Implicit in Parquet's SetMetadata, make explicit for ORC" + }, + "verification": [ + "Unit test: Cache initialization", + "Unit test: Idempotency" + ], + "status": "pending", + "depends_on": [5], + "priority": "P0" + }, + { + "id": 9, + "phase": "Predicate Evaluation", + "task": "Implement ResolvePredicateFields function", + "description": "Resolve field references in predicate to PredicateField entities using manifest. Return list with: field_ref, arrow_field_index, orc_column_index, data_type, supports_statistics. 
Skip non-leaf or unsupported types.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "Field resolution in TestRowGroups", + "file": "cpp/src/arrow/dataset/file_parquet.cc", + "notes": "Similar pattern but uses ORC manifest" + }, + "verification": [ + "Unit test: Resolution for supported types (int32, int64)", + "Unit test: Skipping unsupported types", + "Unit test: Nested field resolution" + ], + "status": "pending", + "depends_on": [3, 7], + "priority": "P0" + }, + { + "id": 10, + "phase": "Predicate Evaluation", + "task": "Implement DeriveFieldGuarantee function", + "description": "Derive guarantee expression from stripe column statistics. Handle: all-null (num_values=0), min/max available, incomplete stats. Validate not deprecated/corrupted. Core of predicate pushdown.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "EvaluateStatisticsAsExpression", + "file": "cpp/src/arrow/dataset/file_parquet.h", + "lines": "184-189", + "notes": "Same logic, different statistics API" + }, + "verification": [ + "Unit test: All-null case", + "Unit test: Min/max range case", + "Unit test: Single value (min=max)", + "Unit test: Deprecated stats ignored", + "Unit test: Corrupted stats (min>max) returns null" + ], + "status": "pending", + "depends_on": [9], + "priority": "P0" + }, + { + "id": 11, + "phase": "Predicate Evaluation", + "task": "Use SimplifyWithGuarantee from Arrow compute", + "description": "Use compute::SimplifyWithGuarantee for expression simplification. This is shared infrastructure. 
Create FoldingAnd helper for combining guarantees.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "SimplifyWithGuarantee usage", + "file": "cpp/src/arrow/dataset/file_parquet.cc", + "lines": "939, 980", + "notes": "INFRA REUSE - this is format-agnostic shared code" + }, + "verification": [ + "Unit test: x > 10 with guarantee x >= 15 -> true", + "Unit test: x < 5 with guarantee x >= 15 -> false" + ], + "status": "pending", + "depends_on": [10], + "priority": "P0" + }, + { + "id": 12, + "phase": "Predicate Evaluation", + "task": "Implement TestStripes function", + "description": "Core statistics evaluation. Lock mutex, simplify with partition expr, resolve fields, load uncached statistics into cache, return per-stripe simplified expressions. Mirrors TestRowGroups.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "TestRowGroups", + "file": "cpp/src/arrow/dataset/file_parquet.cc", + "lines": "933-983", + "notes": "IDEA REUSE - follow same structure, adapt for stripes" + }, + "verification": [ + "Unit test: Batch stripe evaluation", + "Unit test: Incremental cache population", + "Unit test: Partition-level filtering" + ], + "status": "pending", + "depends_on": [8, 9, 11], + "priority": "P0" + }, + { + "id": 13, + "phase": "Predicate Evaluation", + "task": "Implement FilterStripes function", + "description": "Main entry point. Ensure metadata/manifest/cache loaded. Call TestStripes. Return StripeFilterResult with selected_indices and skipped_count. 
Skip empty stripes.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "FilterRowGroups", + "file": "cpp/src/arrow/dataset/file_parquet.cc", + "lines": "918-931", + "notes": "Simple wrapper around TestStripes" + }, + "verification": [ + "Unit test: Basic stripe filtering", + "Unit test: No stripes selected", + "Unit test: All stripes selected" + ], + "status": "pending", + "depends_on": [6, 7, 12], + "priority": "P0" + }, + { + "id": 14, + "phase": "Scan Integration", + "task": "Update ScanBatchesAsync to use FilterStripes", + "description": "Integrate FilterStripes into scan path. Add pre-filtering if metadata cached. Early exit for empty result. Pass selected stripes to reader. Add SlicingGenerator.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "ScanBatchesAsync with FilterRowGroups", + "file": "cpp/src/arrow/dataset/file_parquet.cc", + "lines": "619-636", + "notes": "Same integration pattern" + }, + "verification": [ + "Integration test: Scan with filter skips stripes", + "Integration test: Pre-filtering optimization", + "Integration test: Empty result handling" + ], + "status": "pending", + "depends_on": [13], + "priority": "P0" + }, + { + "id": 15, + "phase": "Count Optimization", + "task": "Implement OrcTryCountRows function", + "description": "Count rows from metadata when possible. Fast path for no field refs. 
Use TestStripes, sum rows for literal(true), return null if any not literal.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "TryCountRows", + "file": "cpp/src/arrow/dataset/file_parquet.cc", + "lines": "986-1010", + "notes": "Same optimization pattern" + }, + "verification": [ + "Unit test: Count with no field refs", + "Unit test: All stripes matched", + "Unit test: All stripes excluded", + "Unit test: Partial matches returns null" + ], + "status": "pending", + "depends_on": [12], + "priority": "P1" + }, + { + "id": 16, + "phase": "Count Optimization", + "task": "Integrate OrcTryCountRows into CountRows", + "description": "Modify CountRows to use TryCountRows optimization. If returns value, use directly. Otherwise fall back to full scan.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "CountRows with TryCountRows", + "file": "cpp/src/arrow/dataset/file_parquet.cc", + "notes": "Same pattern" + }, + "verification": [ + "Unit test: Fast count with simple filter", + "Unit test: Fallback to full scan" + ], + "status": "pending", + "depends_on": [14, 15], + "priority": "P1" + }, + { + "id": 17, + "phase": "Fragment Operations", + "task": "Implement OrcFileFragment::Subset", + "description": "Create new fragment with filtered stripes. Share immutable metadata/manifest, fresh statistics_cache.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "Subset", + "file": "cpp/src/arrow/dataset/file_parquet.cc", + "lines": "893-897", + "notes": "Same pattern" + }, + "verification": [ + "Unit test: Subset creation", + "Unit test: Metadata/manifest sharing", + "Unit test: Fresh cache" + ], + "status": "pending", + "depends_on": [13], + "priority": "P1" + }, + { + "id": 18, + "phase": "Fragment Operations", + "task": "Implement OrcFileFragment::SplitByStripe", + "description": "Split fragment into one per stripe. 
Useful for parallel processing.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "SplitByRowGroup", + "file": "cpp/src/arrow/dataset/file_parquet.cc", + "lines": "873-889", + "notes": "Same pattern" + }, + "verification": [ + "Unit test: Split into per-stripe fragments", + "Unit test: Metadata sharing" + ], + "status": "pending", + "depends_on": [13], + "priority": "P1" + }, + { + "id": 19, + "phase": "Thread Safety", + "task": "Add mutex protection for all cache operations", + "description": "Add physical_schema_mutex_ to OrcFileFragment. Protect all cache reads/writes. Match Parquet's thread safety model.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.h", + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "physical_schema_mutex_ usage", + "file": "cpp/src/arrow/dataset/file_parquet.cc", + "lines": "798, 803, 923, 935", + "notes": "Same locking pattern" + }, + "verification": [ + "Unit test: Concurrent scans on same fragment", + "Unit test: No data corruption" + ], + "status": "pending", + "depends_on": [13], + "priority": "P0" + }, + { + "id": 20, + "phase": "Testing", + "task": "Add basic predicate pushdown tests", + "description": "Add tests for basic predicates: =, !=, <, <=, >, >=. Test int32/int64. 
Verify stripe filtering.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc_test.cc" + ], + "parquet_reference": { + "concept": "PredicatePushdown test", + "file": "cpp/src/arrow/dataset/file_parquet_test.cc", + "lines": "639-693", + "notes": "INFRA REUSE - same test structure, ORC-specific data" + }, + "verification": [ + "All new tests pass" + ], + "status": "pending", + "depends_on": [14], + "priority": "P1" + }, + { + "id": 21, + "phase": "Testing", + "task": "Add CountRowsPredicatePushdown test", + "description": "Test count optimization with predicates.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc_test.cc" + ], + "parquet_reference": { + "concept": "CountRowsPredicatePushdown", + "file": "cpp/src/arrow/dataset/file_parquet_test.cc", + "lines": "307-376", + "notes": "INFRA REUSE - same test pattern" + }, + "verification": [ + "All new tests pass" + ], + "status": "pending", + "depends_on": [16], + "priority": "P1" + }, + { + "id": 22, + "phase": "Testing", + "task": "Add PredicatePushdownStripeFragments test", + "description": "Test predicate pushdown with stripe-level fragments.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc_test.cc" + ], + "parquet_reference": { + "concept": "PredicatePushdownRowGroupFragments", + "file": "cpp/src/arrow/dataset/file_parquet_test.cc", + "lines": "694-749", + "notes": "Same pattern, stripes instead of row groups" + }, + "verification": [ + "All new tests pass" + ], + "status": "pending", + "depends_on": [18], + "priority": "P1" + }, + { + "id": 23, + "phase": "Testing", + "task": "Add CachedMetadata test", + "description": "Test metadata caching behavior.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc_test.cc" + ], + "parquet_reference": { + "concept": "CachedMetadata", + "file": "cpp/src/arrow/dataset/file_parquet_test.cc", + "lines": "378-435", + "notes": "Same caching test pattern" + }, + "verification": [ + "All new tests pass" + ], + "status": "pending", + "depends_on": [6], + 
"priority": "P1" + }, + { + "id": 24, + "phase": "Testing", + "task": "Add MultithreadedScan test", + "description": "Test concurrent scans on same fragment.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc_test.cc" + ], + "parquet_reference": { + "concept": "MultithreadedScan", + "file": "cpp/src/arrow/dataset/file_parquet_test.cc", + "lines": "436-460", + "notes": "Critical for thread safety validation" + }, + "verification": [ + "All new tests pass" + ], + "status": "pending", + "depends_on": [19], + "priority": "P1" + }, + { + "id": 25, + "phase": "Testing", + "task": "Add statistics edge case tests", + "description": "Test: all-null stripes, single-value stripes, missing statistics, deprecated statistics, corrupted statistics (min>max).", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc_test.cc" + ], + "parquet_reference": { + "notes": "ORC-specific edge cases based on allium spec" + }, + "verification": [ + "All edge cases handled correctly", + "Conservative behavior verified" + ], + "status": "pending", + "depends_on": [14], + "priority": "P1" + }, + { + "id": 26, + "phase": "Testing", + "task": "Add compound predicate tests (AND, OR, NOT)", + "description": "Test logical operators with statistics.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc_test.cc" + ], + "parquet_reference": { + "notes": "Verify three-valued logic behavior" + }, + "verification": [ + "All compound predicate combinations work" + ], + "status": "pending", + "depends_on": [14], + "priority": "P1" + }, + { + "id": 27, + "phase": "Testing", + "task": "Add IN predicate test", + "description": "Test IN predicate with range intersection.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc_test.cc" + ], + "parquet_reference": { + "notes": "Test value set intersection with min/max" + }, + "verification": [ + "IN predicate optimized correctly" + ], + "status": "pending", + "depends_on": [14], + "priority": "P1" + }, + { + "id": 28, + "phase": "Testing", + 
"task": "Add NULL handling tests", + "description": "Test IS NULL, IS VALID predicates. Verify three-valued logic.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc_test.cc" + ], + "parquet_reference": { + "concept": "ScanWithPushdownNulls", + "notes": "Already exists but verify with statistics" + }, + "verification": [ + "NULL predicates work correctly" + ], + "status": "pending", + "depends_on": [14], + "priority": "P1" + }, + { + "id": 29, + "phase": "Cache Management", + "task": "Implement ClearCachedMetadata", + "description": "Invalidate all cached data. Set cache_status to uncached. Useful for testing and error recovery.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "ClearCachedMetadata", + "file": "cpp/src/arrow/dataset/file_parquet.h", + "lines": "178", + "notes": "Same pattern" + }, + "verification": [ + "Unit test: Cache clearing", + "Unit test: Rebuild after clear" + ], + "status": "pending", + "depends_on": [13], + "priority": "P2" + }, + { + "id": 30, + "phase": "Documentation", + "task": "Add inline documentation with spec references", + "description": "Document all new functions with references to allium spec sections. Explain design decisions.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.h", + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "notes": "Follow Parquet's documentation style" + }, + "verification": [ + "All public APIs documented", + "Spec references included" + ], + "status": "pending", + "depends_on": [14], + "priority": "P3" + }, + { + "id": 31, + "phase": "Performance", + "task": "Add performance benchmarks", + "description": "Benchmark I/O reduction with selective filters. Measure cache benefit. 
Compare to baseline.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc_benchmark.cc" + ], + "files_to_create": [ + "cpp/src/arrow/dataset/file_orc_benchmark.cc" + ], + "parquet_reference": { + "notes": "Create ORC-specific benchmarks" + }, + "verification": [ + "Benchmarks run successfully", + "Performance improvement documented" + ], + "status": "pending", + "depends_on": [14], + "priority": "P2" + }, + { + "id": 32, + "phase": "Future - Float Support", + "task": "Add float32/float64 type support", + "description": "Extend statistics support to floating-point types. Handle NaN, infinity, signed zero edge cases.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "notes": "See allium spec FLOATING-POINT EDGE CASES section" + }, + "verification": [ + "Float predicates work correctly", + "NaN handling verified" + ], + "status": "pending", + "depends_on": [14], + "priority": "P2" + }, + { + "id": 33, + "phase": "Future - String Support", + "task": "Add string/binary type support", + "description": "Extend to string/binary types. Handle truncation in statistics.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "String column pushdown test", + "file": "cpp/src/arrow/dataset/file_parquet_test.cc", + "lines": "810", + "notes": "See allium spec TRUNCATION HANDLING section" + }, + "verification": [ + "String predicates work correctly", + "Truncation handled conservatively" + ], + "status": "pending", + "depends_on": [14], + "priority": "P2" + }, + { + "id": 34, + "phase": "Future - Temporal Support", + "task": "Add timestamp/date type support", + "description": "Extend to temporal types. 
Handle unit conversion and timezone issues.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc.cc" + ], + "parquet_reference": { + "concept": "Duration column pushdown test", + "file": "cpp/src/arrow/dataset/file_parquet_test.cc", + "lines": "827", + "notes": "Handle temporal type complexity" + }, + "verification": [ + "Temporal predicates work correctly", + "Unit conversion correct" + ], + "status": "pending", + "depends_on": [14], + "priority": "P2" + }, + { + "id": 35, + "phase": "Future - Nested Types", + "task": "Add nested type tests", + "description": "Test predicate pushdown for struct/list/map columns via manifest.", + "files_to_modify": [ + "cpp/src/arrow/dataset/file_orc_test.cc" + ], + "parquet_reference": { + "notes": "Verify manifest correctly maps nested fields to leaf columns" + }, + "verification": [ + "Nested field predicates work" + ], + "status": "pending", + "depends_on": [14], + "priority": "P2" + } +] diff --git a/testing b/testing index 7b641152dcb0..ca49b7795c09 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 7b641152dcb0f9e197ebe24a1986151849250959 +Subproject commit ca49b7795c09181c2915b0a5e762a8fac70f9556