From 1b8e968d886205d09f88f4ef566ef6b88aa6e6ce Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Fri, 8 Nov 2024 16:01:01 -0700 Subject: [PATCH 1/7] Fix Buffer::bit_slice losing length with byte-aligned offsets --- Cargo.toml | 2 +- arrow-buffer/src/buffer/immutable.rs | 35 +++++++++++++++++++++++++++- arrow-flight/src/encode.rs | 2 +- 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f210ae210012..c5527fc41cca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -74,7 +74,7 @@ include = [ "Cargo.toml", ] edition = "2021" -rust-version = "1.62" +rust-version = "1.75" [workspace.dependencies] arrow = { version = "53.2.0", path = "./arrow", default-features = false } diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index 8d1a46583fca..4c100ab07bd6 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -265,7 +265,7 @@ impl Buffer { /// otherwise a new buffer is allocated and filled with a copy of the bits in the range. pub fn bit_slice(&self, offset: usize, len: usize) -> Self { if offset % 8 == 0 { - return self.slice(offset / 8); + return self.slice_with_length(offset / 8, len.div_ceil(8)); } bitwise_unary_op_helper(self, offset, len, |a| a) @@ -860,4 +860,37 @@ mod tests { let iter_len = usize::MAX / std::mem::size_of::<u64>() + 1; let _ = Buffer::from_iter(std::iter::repeat(0_u64).take(iter_len)); } + + #[test] + fn bit_slice_length_preserved() { + // Create a boring buffer + let buf = Buffer::from_iter(std::iter::repeat(true).take(64)); + + let assert_preserved = |offset: usize, len: usize| { + let new_buf = buf.bit_slice(offset, len); + assert_eq!(new_buf.len(), len.div_ceil(8)); + + // if the offset is not byte-aligned, we have to create a deep copy to a new buffer + // (since the `offset` value inside a Buffer is byte-granular, not bit-granular), so + // checking the offset should always return 0 if so. 
If the offset IS byte-aligned, we + // want to make sure it doesn't unnecessarily create a deep copy. + if offset % 8 == 0 { + assert_eq!(new_buf.ptr_offset(), offset / 8); + } else { + assert_eq!(new_buf.ptr_offset(), 0); + } + }; + + // go through every available value for offset + for o in 0..=64 { + // and go through every length that could accompany that offset - we can't have a + // situation where offset + len > 64, because that would go past the end of the buffer, + // so we use the map to ensure it's in range. + for l in (o..=64).map(|l| l - o) { + // and we just want to make sure every one of these keeps its offset and length + // when needed + assert_preserved(o, l); + } + } + } } diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index ae3475c7c7d7..e65295e8750f 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -1741,7 +1741,7 @@ mod tests { let batch = RecordBatch::try_from_iter(vec![("a1", Arc::new(array) as _)]).unwrap(); - verify_encoded_split(batch, 160).await; + verify_encoded_split(batch, 48).await; } #[tokio::test] From 0db95dd6f78ec3670c5ea9b97b13af3a714239a7 Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Fri, 8 Nov 2024 16:48:13 -0700 Subject: [PATCH 2/7] Fix msrv confusion and bit_util::ceil usage --- Cargo.toml | 2 +- arrow-buffer/src/buffer/immutable.rs | 6 +++--- arrow-pyarrow-integration-testing/Cargo.toml | 2 +- arrow/Cargo.toml | 2 +- object_store/Cargo.toml | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c5527fc41cca..bf2274a40306 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -74,7 +74,7 @@ include = [ "Cargo.toml", ] edition = "2021" -rust-version = "1.75" +rust-version = "1.70" [workspace.dependencies] arrow = { version = "53.2.0", path = "./arrow", default-features = false } diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index 4c100ab07bd6..7254313b7da5 100644 --- 
a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -23,7 +23,7 @@ use std::sync::Arc; use crate::alloc::{Allocation, Deallocation, ALIGNMENT}; use crate::util::bit_chunk_iterator::{BitChunks, UnalignedBitChunk}; use crate::BufferBuilder; -use crate::{bytes::Bytes, native::ArrowNativeType}; +use crate::{bit_util, bytes::Bytes, native::ArrowNativeType}; use super::ops::bitwise_unary_op_helper; use super::{MutableBuffer, ScalarBuffer}; @@ -265,7 +265,7 @@ impl Buffer { /// otherwise a new buffer is allocated and filled with a copy of the bits in the range. pub fn bit_slice(&self, offset: usize, len: usize) -> Self { if offset % 8 == 0 { - return self.slice_with_length(offset / 8, len.div_ceil(8)); + return self.slice_with_length(offset / 8, bit_util::ceil(len, 8)); } bitwise_unary_op_helper(self, offset, len, |a| a) @@ -868,7 +868,7 @@ mod tests { let assert_preserved = |offset: usize, len: usize| { let new_buf = buf.bit_slice(offset, len); - assert_eq!(new_buf.len(), len.div_ceil(8)); + assert_eq!(new_buf.len(), bit_util::ceil(len, 8)); // if the offset is not byte-aligned, we have to create a deep copy to a new buffer // (since the `offset` value inside a Buffer is byte-granular, not bit-granular), so diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 0834f2d13384..630c4218352c 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -25,7 +25,7 @@ authors = ["Apache Arrow "] license = "Apache-2.0" keywords = [ "arrow" ] edition = "2021" -rust-version = "1.62" +rust-version = { workspace = true } publish = false [lib] diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index a0fd96415a1d..c8e162ff9b02 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -31,7 +31,7 @@ include = [ "Cargo.toml", ] edition = { workspace = true } -rust-version = "1.70.0" +rust-version = { workspace = true } [lib] name = "arrow" diff --git 
a/object_store/Cargo.toml b/object_store/Cargo.toml index 86d1392ebf61..c79e42a167d8 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -24,7 +24,7 @@ readme = "README.md" description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage, Azure Blob Storage and local files." keywords = ["object", "storage", "cloud"] repository = "https://github.com/apache/arrow-rs/tree/master/object_store" -rust-version = "1.64.0" +rust-version = "1.70.0" [package.metadata.docs.rs] all-features = true From c0a9689fb1ee051f158332970c704ae2d852ffbe Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Tue, 12 Nov 2024 11:35:02 -0700 Subject: [PATCH 3/7] Remove workspace reference in non-workspace crate --- arrow-pyarrow-integration-testing/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 630c4218352c..5f8d48dc58d5 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -25,7 +25,7 @@ authors = ["Apache Arrow "] license = "Apache-2.0" keywords = [ "arrow" ] edition = "2021" -rust-version = { workspace = true } +rust-version = "1.70" publish = false [lib] From 1957c4595a73875ee73dd98e13e9545a29fcabb7 Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Tue, 12 Nov 2024 11:55:02 -0700 Subject: [PATCH 4/7] Fix more msrvs to be correct --- arrow-flight/gen/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index c7fe89beb93a..53b275b52396 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -20,7 +20,7 @@ name = "gen" description = "Code generation for arrow-flight" version = "0.1.0" edition = { workspace = true } -rust-version = { workspace = true } +rust-version = "1.71.1" authors = { workspace = true } homepage = { workspace = true } repository = { 
workspace = true } From c37f8dad68fd901499b03966f2082926a623162c Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Tue, 12 Nov 2024 11:55:22 -0700 Subject: [PATCH 5/7] Fix even more msrvs to be accurate --- arrow-avro/Cargo.toml | 1 - arrow-integration-testing/Cargo.toml | 2 +- arrow-json/Cargo.toml | 1 - arrow-row/Cargo.toml | 1 - arrow-schema/Cargo.toml | 2 +- parquet/Cargo.toml | 2 +- 6 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arrow-avro/Cargo.toml b/arrow-avro/Cargo.toml index d2436f0c15de..93e602320a1d 100644 --- a/arrow-avro/Cargo.toml +++ b/arrow-avro/Cargo.toml @@ -53,4 +53,3 @@ crc = { version = "3.0", optional = true } [dev-dependencies] - diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 7be56d919852..c3cd6346c1f9 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -25,7 +25,7 @@ authors = { workspace = true } license = { workspace = true } edition = { workspace = true } publish = false -rust-version = { workspace = true } +rust-version = "1.75.0" [lib] crate-type = ["lib", "cdylib"] diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 517bb03d2064..5a3990efdfd6 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -60,4 +60,3 @@ rand = { version = "0.8", default-features = false, features = ["std", "std_rng" [[bench]] name = "serde" harness = false - diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml index 3754afb4dbc6..c04c84a55e1b 100644 --- a/arrow-row/Cargo.toml +++ b/arrow-row/Cargo.toml @@ -53,4 +53,3 @@ arrow-ord = { workspace = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } [features] - diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index 628d4a683cac..945c6f2ed464 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -26,7 +26,7 @@ license = { workspace = true } keywords = { workspace = true } include = { workspace = true } edition = { 
workspace = true } -rust-version = { workspace = true } +rust-version = "1.64" [lib] name = "arrow_schema" diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 4064baba0947..5e2c13bf9c4d 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -26,7 +26,7 @@ authors = { workspace = true } keywords = ["arrow", "parquet", "hadoop"] readme = "README.md" edition = { workspace = true } -rust-version = "1.70.0" +rust-version = { workspace = true } [target.'cfg(target_arch = "wasm32")'.dependencies] ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } From 8f56f5d56540ef174aee5a28d91e8d4407d73431 Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Tue, 12 Nov 2024 12:05:37 -0700 Subject: [PATCH 6/7] Revert object_store msrv change and remove unnecessary ahash downgrade --- .github/workflows/rust.yml | 2 -- object_store/Cargo.toml | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 1b65c5057de1..bdc4d0186f52 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -122,8 +122,6 @@ jobs: uses: ./.github/actions/setup-builder - name: Install cargo-msrv run: cargo install cargo-msrv - - name: Downgrade arrow dependencies - run: cargo update -p ahash --precise 0.8.7 - name: Check arrow working-directory: arrow run: cargo msrv --log-target stdout verify diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index c79e42a167d8..86d1392ebf61 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -24,7 +24,7 @@ readme = "README.md" description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage, Azure Blob Storage and local files." 
keywords = ["object", "storage", "cloud"] repository = "https://github.com/apache/arrow-rs/tree/master/object_store" -rust-version = "1.70.0" +rust-version = "1.64.0" [package.metadata.docs.rs] all-features = true From 5c49b6f312ce070ae57559dfa08ea0622ad8e7a1 Mon Sep 17 00:00:00 2001 From: itsjunetime Date: Sat, 16 Nov 2024 17:19:01 -0700 Subject: [PATCH 7/7] Remove MSRV changes --- .github/workflows/rust.yml | 2 ++ Cargo.toml | 2 +- arrow-avro/Cargo.toml | 1 + arrow-flight/gen/Cargo.toml | 2 +- arrow-integration-testing/Cargo.toml | 2 +- arrow-json/Cargo.toml | 1 + arrow-pyarrow-integration-testing/Cargo.toml | 2 +- arrow-row/Cargo.toml | 1 + arrow-schema/Cargo.toml | 2 +- arrow/Cargo.toml | 2 +- parquet/Cargo.toml | 2 +- 11 files changed, 12 insertions(+), 7 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index bdc4d0186f52..1b65c5057de1 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -122,6 +122,8 @@ jobs: uses: ./.github/actions/setup-builder - name: Install cargo-msrv run: cargo install cargo-msrv + - name: Downgrade arrow dependencies + run: cargo update -p ahash --precise 0.8.7 - name: Check arrow working-directory: arrow run: cargo msrv --log-target stdout verify diff --git a/Cargo.toml b/Cargo.toml index bf2274a40306..f210ae210012 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -74,7 +74,7 @@ include = [ "Cargo.toml", ] edition = "2021" -rust-version = "1.70" +rust-version = "1.62" [workspace.dependencies] arrow = { version = "53.2.0", path = "./arrow", default-features = false } diff --git a/arrow-avro/Cargo.toml b/arrow-avro/Cargo.toml index 93e602320a1d..d2436f0c15de 100644 --- a/arrow-avro/Cargo.toml +++ b/arrow-avro/Cargo.toml @@ -53,3 +53,4 @@ crc = { version = "3.0", optional = true } [dev-dependencies] + diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 53b275b52396..c7fe89beb93a 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -20,7 
+20,7 @@ name = "gen" description = "Code generation for arrow-flight" version = "0.1.0" edition = { workspace = true } -rust-version = "1.71.1" +rust-version = { workspace = true } authors = { workspace = true } homepage = { workspace = true } repository = { workspace = true } diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index c3cd6346c1f9..7be56d919852 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -25,7 +25,7 @@ authors = { workspace = true } license = { workspace = true } edition = { workspace = true } publish = false -rust-version = "1.75.0" +rust-version = { workspace = true } [lib] crate-type = ["lib", "cdylib"] diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 5a3990efdfd6..517bb03d2064 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -60,3 +60,4 @@ rand = { version = "0.8", default-features = false, features = ["std", "std_rng" [[bench]] name = "serde" harness = false + diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 5f8d48dc58d5..0834f2d13384 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -25,7 +25,7 @@ authors = ["Apache Arrow "] license = "Apache-2.0" keywords = [ "arrow" ] edition = "2021" -rust-version = "1.70" +rust-version = "1.62" publish = false [lib] diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml index c04c84a55e1b..3754afb4dbc6 100644 --- a/arrow-row/Cargo.toml +++ b/arrow-row/Cargo.toml @@ -53,3 +53,4 @@ arrow-ord = { workspace = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } [features] + diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index 945c6f2ed464..628d4a683cac 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -26,7 +26,7 @@ license = { workspace = true } keywords = { workspace = true } include = { 
workspace = true } edition = { workspace = true } -rust-version = "1.64" +rust-version = { workspace = true } [lib] name = "arrow_schema" diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index c8e162ff9b02..a0fd96415a1d 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -31,7 +31,7 @@ include = [ "Cargo.toml", ] edition = { workspace = true } -rust-version = { workspace = true } +rust-version = "1.70.0" [lib] name = "arrow" diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 5e2c13bf9c4d..4064baba0947 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -26,7 +26,7 @@ authors = { workspace = true } keywords = ["arrow", "parquet", "hadoop"] readme = "README.md" edition = { workspace = true } -rust-version = { workspace = true } +rust-version = "1.70.0" [target.'cfg(target_arch = "wasm32")'.dependencies] ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] }