Commit 484eb33

feat: introduce feature flags to select major arrow versions (#654)
This change introduces `arrow_53` and `arrow_54` feature flags on kernel, which are _required_ when using `default-engine` or `sync-engine`. Fundamentally, we must push users of the crate to select their arrow major version through feature flags: Cargo _will_ include multiple major versions in the dependency tree, which can cause ABI breakages when passing around symbols such as `RecordBatch`. See #640.

---------

Signed-off-by: R. Tyler Croy <[email protected]>
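Below is a minimal sketch of the mechanism this commit enables. The renamed-dependency wiring and cfg-gated re-export are illustrative, not the kernel's actual source (which is hidden in this commit view), but they match the pattern visible in the diffs: downstream crates pick exactly one arrow major version via a feature flag and consume arrow only through the stable `delta_kernel::arrow` path.

```rust
// Illustrative sketch only. Assumes Cargo.toml renames optional
// dependencies so both majors can coexist as *optional* choices:
//
//   arrow_53 = { package = "arrow", version = "53", optional = true }
//   arrow_54 = { package = "arrow", version = "54", optional = true }
//
// lib.rs then exposes exactly one of them under a stable path, so every
// consumer of `delta_kernel::arrow::array::RecordBatch` agrees on the ABI:
#[cfg(feature = "arrow_53")]
pub use arrow_53 as arrow;

#[cfg(all(feature = "arrow_54", not(feature = "arrow_53")))]
pub use arrow_54 as arrow;

#[cfg(not(any(feature = "arrow_53", feature = "arrow_54")))]
compile_error!("select an arrow major version via the `arrow_53` or `arrow_54` feature");
```

A downstream crate then depends on the kernel with, e.g., `features = ["default-engine", "arrow_53"]`, as the `acceptance` and `feature-tests` manifests below now do.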
1 parent 72b585d commit 484eb33

File tree

46 files changed (+249, -296 lines)

Some content is hidden: large commits have some content hidden by default.


.github/workflows/build.yml

Lines changed: 4 additions & 4 deletions
@@ -37,7 +37,7 @@ jobs:
           cargo install cargo-msrv --locked
       - name: verify-msrv
         run: |
-          cargo msrv --path kernel/ verify --all-features
+          cargo msrv --path kernel/ verify --features $(cat .github/workflows/default-kernel-features)
           cargo msrv --path derive-macros/ verify --all-features
           cargo msrv --path ffi/ verify --all-features
           cargo msrv --path ffi-proc-macros/ verify --all-features

@@ -104,7 +104,7 @@ jobs:
       - name: check kernel builds with no-default-features
         run: cargo build -p delta_kernel --no-default-features
       - name: build and lint with clippy
-        run: cargo clippy --benches --tests --all-features -- -D warnings
+        run: cargo clippy --benches --tests --features $(cat .github/workflows/default-kernel-features) -- -D warnings
       - name: lint without default features
         run: cargo clippy --no-default-features -- -D warnings
       - name: check kernel builds with default-engine

@@ -129,7 +129,7 @@ jobs:
           override: true
       - uses: Swatinem/rust-cache@v2
       - name: test
-        run: cargo test --workspace --verbose --all-features -- --skip read_table_version_hdfs
+        run: cargo test --workspace --verbose --features $(cat .github/workflows/default-kernel-features) -- --skip read_table_version_hdfs

   ffi_test:
     runs-on: ${{ matrix.os }}

@@ -229,7 +229,7 @@ jobs:
         uses: taiki-e/install-action@cargo-llvm-cov
       - uses: Swatinem/rust-cache@v2
       - name: Generate code coverage
-        run: cargo llvm-cov --all-features --workspace --codecov --output-path codecov.json -- --skip read_table_version_hdfs
+        run: cargo llvm-cov --features $(cat .github/workflows/default-kernel-features) --workspace --codecov --output-path codecov.json -- --skip read_table_version_hdfs
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v5
         with:
.github/workflows/default-kernel-features

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+integration-test,default-engine,default-engine-rustls,cloud,arrow,sync-engine

Cargo.toml

Lines changed: 0 additions & 15 deletions
@@ -23,21 +23,6 @@ rust-version = "1.80"
 version = "0.6.1"

 [workspace.dependencies]
-# When changing the arrow version range, also modify ffi/Cargo.toml which has
-# its own arrow version ranges witeh modified features. Failure to do so will
-# result in compilation errors as two different sets of arrow dependencies may
-# be sourced
-arrow = { version = ">=53, <55" }
-arrow-arith = { version = ">=53, <55" }
-arrow-array = { version = ">=53, <55" }
-arrow-buffer = { version = ">=53, <55" }
-arrow-cast = { version = ">=53, <55" }
-arrow-data = { version = ">=53, <55" }
-arrow-ord = { version = ">=53, <55" }
-arrow-json = { version = ">=53, <55" }
-arrow-select = { version = ">=53, <55" }
-arrow-schema = { version = ">=53, <55" }
-parquet = { version = ">=53, <55", features = ["object_store"] }
 object_store = { version = ">=0.11, <0.12" }
 hdfs-native-object-store = "0.12.0"
 hdfs-native = "0.10.0"

acceptance/Cargo.toml

Lines changed: 1 addition & 6 deletions
@@ -14,19 +14,14 @@ rust-version.workspace = true
 release = false

 [dependencies]
-arrow-array = { workspace = true }
-arrow-cast = { workspace = true }
-arrow-ord = { workspace = true }
-arrow-select = { workspace = true }
-arrow-schema = { workspace = true }
 delta_kernel = { path = "../kernel", features = [
     "default-engine",
+    "arrow_53",
     "developer-visibility",
 ] }
 futures = "0.3"
 itertools = "0.13"
 object_store = { workspace = true }
-parquet = { workspace = true }
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 thiserror = "1"

acceptance/src/data.rs

Lines changed: 10 additions & 7 deletions
@@ -1,15 +1,18 @@
 use std::{path::Path, sync::Arc};

-use arrow_array::{Array, RecordBatch};
-use arrow_ord::sort::{lexsort_to_indices, SortColumn};
-use arrow_schema::{DataType, Schema};
-use arrow_select::{concat::concat_batches, filter::filter_record_batch, take::take};
+use delta_kernel::arrow::array::{Array, RecordBatch};
+use delta_kernel::arrow::compute::{
+    concat_batches, filter_record_batch, lexsort_to_indices, take, SortColumn,
+};
+use delta_kernel::arrow::datatypes::{DataType, Schema};

+use delta_kernel::parquet::arrow::async_reader::{
+    ParquetObjectReader, ParquetRecordBatchStreamBuilder,
+};
 use delta_kernel::{engine::arrow_data::ArrowEngineData, DeltaResult, Engine, Error, Table};
 use futures::{stream::TryStreamExt, StreamExt};
 use itertools::Itertools;
 use object_store::{local::LocalFileSystem, ObjectStore};
-use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder};

 use crate::{TestCaseInfo, TestResult};

@@ -83,8 +86,8 @@ fn assert_schema_fields_match(schema: &Schema, golden: &Schema) {
 fn normalize_col(col: Arc<dyn Array>) -> Arc<dyn Array> {
     if let DataType::Timestamp(unit, Some(zone)) = col.data_type() {
         if **zone == *"+00:00" {
-            arrow_cast::cast::cast(&col, &DataType::Timestamp(*unit, Some("UTC".into())))
-                .expect("Could not cast to UTC")
+            let data_type = DataType::Timestamp(*unit, Some("UTC".into()));
+            delta_kernel::arrow::compute::cast(&col, &data_type).expect("Could not cast to UTC")
         } else {
             col
         }

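As a usage note, here is a hedged sketch of what the new import style buys a downstream crate like `acceptance`: every arrow type and kernel comes from the `delta_kernel::arrow` re-export, so the crate lists no `arrow-*` dependencies of its own and a second arrow major version cannot enter the tree. The helper name and sort-by-first-column logic are invented for illustration; the `compute` paths match the imports in the diff above, and the `error::ArrowError` path is assumed from a full `arrow` crate re-export.

```rust
use std::sync::Arc;

use delta_kernel::arrow::array::{ArrayRef, RecordBatch};
use delta_kernel::arrow::compute::{lexsort_to_indices, take, SortColumn};
use delta_kernel::arrow::error::ArrowError;

// Hypothetical helper: sort a RecordBatch by its first column, using only
// compute kernels re-exported through delta_kernel::arrow.
fn sort_by_first_column(batch: &RecordBatch) -> Result<RecordBatch, ArrowError> {
    // Compute the row permutation that sorts column 0.
    let indices = lexsort_to_indices(
        &[SortColumn {
            values: Arc::clone(batch.column(0)),
            options: None,
        }],
        None,
    )?;
    // Apply that permutation to every column.
    let columns: Vec<ArrayRef> = batch
        .columns()
        .iter()
        .map(|col| take(col.as_ref(), &indices, None))
        .collect::<Result<_, _>>()?;
    RecordBatch::try_new(batch.schema(), columns)
}
```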
feature-tests/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ version.workspace = true
 release = false

 [dependencies]
-delta_kernel = { path = "../kernel" }
+delta_kernel = { path = "../kernel", features = ["arrow_53"] }

 [features]
 default-engine = [ "delta_kernel/default-engine" ]

ffi/Cargo.toml

Lines changed: 2 additions & 13 deletions
@@ -22,21 +22,13 @@ tracing-core = { version = "0.1", optional = true }
 tracing-subscriber = { version = "0.3", optional = true, features = [ "json" ] }
 url = "2"
 delta_kernel = { path = "../kernel", default-features = false, features = [
+    "arrow",
     "developer-visibility",
 ] }
 delta_kernel_ffi_macros = { path = "../ffi-proc-macros", version = "0.6.1" }

-# used if we use the default engine to be able to move arrow data into the c-ffi format
-arrow-schema = { version = ">=53, <55", default-features = false, features = [
-    "ffi",
-], optional = true }
-arrow-data = { version = ">=53, <55", default-features = false, features = [
-    "ffi",
-], optional = true }
-arrow-array = { version = ">=53, <55", default-features = false, optional = true }
-
 [build-dependencies]
-cbindgen = "0.27.0"
+cbindgen = "0.28"
 libc = "0.2.158"

 [dev-dependencies]

@@ -52,9 +44,6 @@ default = ["default-engine"]
 cloud = ["delta_kernel/cloud"]
 default-engine = [
     "delta_kernel/default-engine",
-    "arrow-array",
-    "arrow-data",
-    "arrow-schema",
 ]
 tracing = [ "tracing-core", "tracing-subscriber" ]
 sync-engine = ["delta_kernel/sync-engine"]

ffi/cbindgen.toml

Lines changed: 1 addition & 1 deletion
@@ -25,4 +25,4 @@ parse_deps = true
 # only crates found in this list will ever be parsed.
 #
 # default: there is no allow-list (NOTE: this is the opposite of [])
-include = ["delta_kernel", "arrow-data", "arrow-schema"]
+include = ["arrow", "arrow-data", "arrow-schema", "delta_kernel"]

ffi/src/engine_data.rs

Lines changed: 11 additions & 7 deletions
@@ -1,5 +1,9 @@
 //! EngineData related ffi code

+use delta_kernel::arrow::array::{
+    ffi::{FFI_ArrowArray, FFI_ArrowSchema},
+    ArrayData, StructArray,
+};
 use delta_kernel::{DeltaResult, EngineData};
 use std::ffi::c_void;

@@ -45,8 +49,8 @@ unsafe fn get_raw_engine_data_impl(data: &mut Handle<ExclusiveEngineData>) -> &m
 #[cfg(feature = "default-engine")]
 #[repr(C)]
 pub struct ArrowFFIData {
-    pub array: arrow_data::ffi::FFI_ArrowArray,
-    pub schema: arrow_schema::ffi::FFI_ArrowSchema,
+    pub array: FFI_ArrowArray,
+    pub schema: FFI_ArrowSchema,
 }

 // TODO: This should use a callback to avoid having to have the engine free the struct

@@ -71,16 +75,16 @@ pub unsafe extern "C" fn get_raw_arrow_data(
 // TODO: This method leaks the returned pointer memory. How will the engine free it?
 #[cfg(feature = "default-engine")]
 fn get_raw_arrow_data_impl(data: Box<dyn EngineData>) -> DeltaResult<*mut ArrowFFIData> {
-    let record_batch: arrow_array::RecordBatch = data
+    let record_batch: delta_kernel::arrow::array::RecordBatch = data
         .into_any()
         .downcast::<delta_kernel::engine::arrow_data::ArrowEngineData>()
         .map_err(|_| delta_kernel::Error::EngineDataType("ArrowEngineData".to_string()))?
         .into();
-    let sa: arrow_array::StructArray = record_batch.into();
-    let array_data: arrow_data::ArrayData = sa.into();
+    let sa: StructArray = record_batch.into();
+    let array_data: ArrayData = sa.into();
     // these call `clone`. is there a way to not copy anything and what exactly are they cloning?
-    let array = arrow_data::ffi::FFI_ArrowArray::new(&array_data);
-    let schema = arrow_schema::ffi::FFI_ArrowSchema::try_from(array_data.data_type())?;
+    let array = FFI_ArrowArray::new(&array_data);
+    let schema = FFI_ArrowSchema::try_from(array_data.data_type())?;
     let ret_data = Box::new(ArrowFFIData { array, schema });
     Ok(Box::leak(ret_data))
 }

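For context on the FFI surface above, a hedged sketch of the engine-side counterpart: taking back the pointer leaked by `get_raw_arrow_data` and rebuilding a `RecordBatch`. The helper `import_record_batch` is hypothetical; `from_ffi` lives in the same `delta_kernel::arrow::array::ffi` module as the types imported in the diff, and the round trip is only sound because both sides compile against the single arrow major version the new feature flags select.

```rust
use delta_kernel::arrow::array::{
    ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema},
    RecordBatch, StructArray,
};
use delta_kernel::arrow::error::ArrowError;

// Mirrors the #[repr(C)] struct returned by get_raw_arrow_data above.
#[repr(C)]
pub struct ArrowFFIData {
    pub array: FFI_ArrowArray,
    pub schema: FFI_ArrowSchema,
}

/// Hypothetical engine-side helper.
///
/// SAFETY: `raw` must be the pointer returned by `get_raw_arrow_data`,
/// must not be used again afterwards, and both sides must link the same
/// arrow major version and allocator.
unsafe fn import_record_batch(raw: *mut ArrowFFIData) -> Result<RecordBatch, ArrowError> {
    // Reclaim ownership so the previously leaked allocation is freed.
    let ffi = *Box::from_raw(raw);
    // Move the FFI array in, borrow the schema: from_ffi validates and
    // rebuilds the ArrayData without copying the underlying buffers.
    let array_data = from_ffi(ffi.array, &ffi.schema)?;
    Ok(RecordBatch::from(StructArray::from(array_data)))
}
```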
integration-tests/Cargo.toml

Lines changed: 1 addition & 16 deletions
@@ -6,19 +6,4 @@ edition = "2021"
 [workspace]

 [dependencies]
-arrow = "=53.0.0"
-delta_kernel = { path = "../kernel", features = ["arrow-conversion", "arrow-expression", "default-engine", "sync-engine"] }
-
-[patch.'file:///../kernel']
-arrow = "=53.0.0"
-arrow-arith = "=53.0.0"
-arrow-array = "=53.0.0"
-arrow-buffer = "=53.0.0"
-arrow-cast = "=53.0.0"
-arrow-data = "=53.0.0"
-arrow-ord = "=53.0.0"
-arrow-json = "=53.0.0"
-arrow-select = "=53.0.0"
-arrow-schema = "=53.0.0"
-parquet = "=53.0.0"
-object_store = "=0.11.1"
+delta_kernel = { path = "../kernel", features = ["default-engine", "sync-engine"] }
