fix: Panic on reencoding offsets in arrow-ipc with sliced nested arrays #6998
Merged

Changes from all commits (5 commits):

- 2be4a8c fix: Panic on reencoding offsets (HawaiianSpork)
- 467fe3f Add tests for slicing larger arrays (alamb)
- 91ebd73 Run rustfmt (HawaiianSpork)
- 64ded99 Merge branch 'alamb/arrow_ipc_test' into fix-offsets-upstream (HawaiianSpork)
- 2c9d9ab Added end to end unit test which shows the problem is fixed. (HawaiianSpork)
```diff
@@ -1473,10 +1473,7 @@ fn reencode_offsets<O: OffsetSizeTrait>(
     let offsets = match start_offset.as_usize() {
         0 => {
             let size = size_of::<O>();
-            offsets.slice_with_length(
-                data.offset() * size,
-                (data.offset() + data.len() + 1) * size,
-            )
+            offsets.slice_with_length(data.offset() * size, (data.len() + 1) * size)
         }
         _ => offset_slice.iter().map(|x| *x - *start_offset).collect(),
     };
```
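The crux of the fix: `Buffer::slice_with_length` takes a byte *length* as its second argument, but the old code passed a byte *end position*, `(data.offset() + data.len() + 1) * size`. For an array with a non-zero data offset the requested range then ran past the end of the offsets buffer and tripped the bounds assertion. A minimal standalone sketch of the arithmetic (illustrative, not the PR's code; assumes `arrow-buffer` as a dependency):

```rust
use arrow_buffer::Buffer;

fn main() {
    // Offsets for a hypothetical list array with 4 lists: [0, 2, 5, 7, 9].
    // The buffer holds 5 i32 values = 20 bytes.
    let offsets = Buffer::from_slice_ref([0i32, 2, 5, 7, 9]);
    let size = std::mem::size_of::<i32>();

    // Pretend the array was sliced to start at list 2 with a length of 2 lists.
    let (offset, len) = (2usize, 2usize);

    // Fixed: a slice of `len` lists needs `len + 1` offsets, so the byte
    // *length* is (len + 1) * size = 12 bytes starting at byte 8. In bounds.
    let sliced = offsets.slice_with_length(offset * size, (len + 1) * size);
    assert_eq!(sliced.len(), 12);

    // Buggy: (offset + len + 1) * size = 20 is an *end position*, not a
    // length; requesting 20 bytes starting at byte 8 of a 20-byte buffer
    // trips the assertion inside slice_with_length:
    // offsets.slice_with_length(offset * size, (offset + len + 1) * size); // panics
}
```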
```diff
@@ -1793,9 +1790,9 @@ mod tests {
     use std::io::Cursor;
     use std::io::Seek;
 
-    use arrow_array::builder::GenericListBuilder;
     use arrow_array::builder::MapBuilder;
     use arrow_array::builder::UnionBuilder;
+    use arrow_array::builder::{GenericListBuilder, ListBuilder, StringBuilder};
     use arrow_array::builder::{PrimitiveRunBuilder, UInt32Builder};
     use arrow_array::types::*;
     use arrow_buffer::ScalarBuffer;
```
```diff
@@ -2433,6 +2430,126 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_large_slice_uint32() {
+        ensure_roundtrip(Arc::new(UInt32Array::from_iter((0..8000).map(|i| {
+            if i % 2 == 0 {
+                Some(i)
+            } else {
+                None
+            }
+        }))));
+    }
+
+    #[test]
+    fn test_large_slice_string() {
+        let strings: Vec<_> = (0..8000)
+            .map(|i| {
+                if i % 2 == 0 {
+                    Some(format!("value{}", i))
+                } else {
+                    None
+                }
+            })
+            .collect();
+
+        ensure_roundtrip(Arc::new(StringArray::from(strings)));
+    }
+
+    #[test]
+    fn test_large_slice_string_list() {
+        let mut ls = ListBuilder::new(StringBuilder::new());
+
+        let mut s = String::new();
+        for row_number in 0..8000 {
+            if row_number % 2 == 0 {
+                for list_element in 0..1000 {
+                    s.clear();
+                    use std::fmt::Write;
+                    write!(&mut s, "value{row_number}-{list_element}").unwrap();
+                    ls.values().append_value(&s);
+                }
+                ls.append(true)
+            } else {
+                ls.append(false); // null
+            }
+        }
+
+        ensure_roundtrip(Arc::new(ls.finish()));
+    }
+
+    #[test]
+    fn test_large_slice_string_list_of_lists() {
+        // This test specifically exercises reencode_offsets, which looks at both
+        // the starting offset and the data offset, so it needs a dataset where
+        // the starting offset is zero but the data offset is not.
+        let mut ls = ListBuilder::new(ListBuilder::new(StringBuilder::new()));
+
+        for _ in 0..4000 {
+            ls.values().append(true);
+            ls.append(true)
+        }
+
+        let mut s = String::new();
+        for row_number in 0..4000 {
+            if row_number % 2 == 0 {
+                for list_element in 0..1000 {
+                    s.clear();
+                    use std::fmt::Write;
+                    write!(&mut s, "value{row_number}-{list_element}").unwrap();
+                    ls.values().values().append_value(&s);
+                }
+                ls.values().append(true);
+                ls.append(true)
+            } else {
+                ls.append(false); // null
+            }
+        }
+
+        ensure_roundtrip(Arc::new(ls.finish()));
+    }
+
+    /// Read/write a record batch to a File and a Stream and ensure it is the same at the output
+    fn ensure_roundtrip(array: ArrayRef) {
+        let num_rows = array.len();
+        let orig_batch = RecordBatch::try_from_iter(vec![("a", array)]).unwrap();
+        // take off the first element
+        let sliced_batch = orig_batch.slice(1, num_rows - 1);
+
+        let schema = orig_batch.schema();
+        let stream_data = {
+            let mut writer = StreamWriter::try_new(vec![], &schema).unwrap();
+            writer.write(&sliced_batch).unwrap();
+            writer.into_inner().unwrap()
+        };
+        let read_batch = {
+            let projection = None;
+            let mut reader = StreamReader::try_new(Cursor::new(stream_data), projection).unwrap();
+            reader
+                .next()
+                .expect("expect no errors reading batch")
+                .expect("expect batch")
+        };
+        assert_eq!(sliced_batch, read_batch);
+
+        let file_data = {
+            let mut writer = FileWriter::try_new_buffered(vec![], &schema).unwrap();
+            writer.write(&sliced_batch).unwrap();
+            writer.into_inner().unwrap().into_inner().unwrap()
+        };
+        let read_batch = {
+            let projection = None;
+            let mut reader = FileReader::try_new(Cursor::new(file_data), projection).unwrap();
+            reader
+                .next()
+                .expect("expect no errors reading batch")
+                .expect("expect batch")
+        };
+        assert_eq!(sliced_batch, read_batch);
+    }
+
     #[test]
     fn encode_bools_slice() {
         // Test case for https://github.com/apache/arrow-rs/issues/3496
```
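To see why the list-of-lists test matters: `reencode_offsets` special-cases a first visible offset of zero, and that fast path must still account for a non-zero `ArrayData` offset. A sketch (illustrative, not PR code) of how a sliced array can hit exactly that combination:

```rust
use arrow_array::builder::{GenericListBuilder, Int32Builder};
use arrow_array::Array;

fn main() {
    // Two leading empty lists keep the early offsets at 0: offsets = [0, 0, 0, 1].
    let mut ls = GenericListBuilder::<i32, _>::new(Int32Builder::new());
    ls.append(true); // []
    ls.append(true); // []
    ls.values().append_value(1);
    ls.append(true); // [1]
    let list = ls.finish();

    // Slice off the first row. The data offset is now 1, yet the first
    // visible offset value is still 0, so the `0 =>` arm of reencode_offsets
    // applies even though the buffer slice must start past byte zero.
    let sliced = list.slice(1, 2);
    let data = sliced.to_data();
    assert_eq!(data.offset(), 1); // non-zero data offset...
    assert_eq!(data.buffers()[0].typed_data::<i32>()[data.offset()], 0); // ...zero start offset
}
```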
```diff
@@ -2615,6 +2732,40 @@ mod tests {
         builder.finish()
     }
 
+    #[test]
+    fn reencode_offsets_when_first_offset_is_not_zero() {
+        let original_list = generate_list_data::<i32>();
+        let original_data = original_list.into_data();
+        let slice_data = original_data.slice(75, 7);
+        let (new_offsets, original_start, length) =
+            reencode_offsets::<i32>(&slice_data.buffers()[0], &slice_data);
+        assert_eq!(
+            vec![0, 3, 6, 9, 12, 15, 18, 21],
+            new_offsets.typed_data::<i32>()
+        );
+        assert_eq!(225, original_start);
+        assert_eq!(21, length);
+    }
+
+    #[test]
+    fn reencode_offsets_when_first_offset_is_zero() {
+        let mut ls = GenericListBuilder::<i32, _>::new(UInt32Builder::new());
+        // ls = [[], [35, 42]]
+        ls.append(true);
+        ls.values().append_value(35);
+        ls.values().append_value(42);
+        ls.append(true);
+        let original_list = ls.finish();
+        let original_data = original_list.into_data();
+
+        let slice_data = original_data.slice(1, 1);
+        let (new_offsets, original_start, length) =
+            reencode_offsets::<i32>(&slice_data.buffers()[0], &slice_data);
+        assert_eq!(vec![0, 2], new_offsets.typed_data::<i32>());
+        assert_eq!(0, original_start);
+        assert_eq!(2, length);
+    }
+
     /// Ensure when serde full & sliced versions they are equal to original input.
     /// Also ensure serialized sliced version is significantly smaller than serialized full.
     fn roundtrip_ensure_sliced_smaller(in_batch: RecordBatch, expected_size_factor: usize) {
```

Contributor (on `reencode_offsets_when_first_offset_is_not_zero`): I verified that this test fails without the changes in this PR.
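For context on the expected values in the first unit test above: the assertions imply that the pre-existing `generate_list_data::<i32>()` helper (not shown in this diff) builds lists of three elements each, so slicing at list 75 for 7 lists works out as follows:

```rust
fn main() {
    // Assuming 3-element lists, as the assertions imply:
    let original_start = 75 * 3; // first child index covered by slice(75, 7)
    let length = 7 * 3; // child elements spanned by the 7 sliced lists
    assert_eq!(original_start, 225);
    assert_eq!(length, 21);
    // new_offsets is rebased to start at zero: [0, 3, 6, ..., 21] (7 + 1 entries)
}
```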
Contributor: 👍 Ah, Lists of Lists 🤯

Contributor: I verified this fails without the changes in this PR like this:
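(The reviewer's snippet did not survive extraction; a hypothetical minimal reproduction consistent with the PR's list-of-lists test, with illustrative names and sizes, would be:)

```rust
use std::sync::Arc;

use arrow_array::builder::{ListBuilder, StringBuilder};
use arrow_array::{ArrayRef, RecordBatch};
use arrow_ipc::writer::StreamWriter;

fn main() {
    // Leading empty inner lists so the sliced child hits the zero-start-offset
    // path of reencode_offsets with a non-zero data offset.
    let mut ls = ListBuilder::new(ListBuilder::new(StringBuilder::new()));
    for _ in 0..10 {
        ls.values().append(true);
        ls.append(true);
    }
    for i in 0..10 {
        ls.values().values().append_value(format!("value{i}"));
        ls.values().append(true);
        ls.append(true);
    }

    let array: ArrayRef = Arc::new(ls.finish());
    let batch = RecordBatch::try_from_iter(vec![("a", array)]).unwrap();
    let sliced = batch.slice(1, batch.num_rows() - 1);

    let schema = batch.schema();
    let mut writer = StreamWriter::try_new(vec![], &schema).unwrap();
    writer.write(&sliced).unwrap(); // panicked in reencode_offsets before the fix
    writer.finish().unwrap();
}
```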
Contributor: 💯