Skip to content
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 50 additions & 18 deletions parquet-variant/src/variant/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -240,28 +240,23 @@ impl<'m> VariantMetadata<'m> {
let value_buffer =
string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?;

let mut offsets_iter = map_bytes_to_offsets(offset_bytes, self.header.offset_size);
let mut current_offset = offsets_iter.next().unwrap_or(0);
let mut offsets = map_bytes_to_offsets(offset_bytes, self.header.offset_size);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An insignificant point. I named it *_iter, which exists in both metadata and object. If you want to make modifications, they should be consistent.


if self.header.is_sorted {
// Validate the dictionary values are unique and lexicographically sorted
//
// Since we use the offsets to access dictionary values, this also validates
// offsets are in-bounds and monotonically increasing
let mut current_offset = offsets.next().unwrap_or(0);
let mut prev_value: Option<&str> = None;

for next_offset in offsets_iter {
if next_offset <= current_offset {
return Err(ArrowError::InvalidArgumentError(
"offsets not monotonically increasing".to_string(),
));
}

for next_offset in offsets {
let current_value =
value_buffer
.get(current_offset..next_offset)
.ok_or_else(|| {
ArrowError::InvalidArgumentError("offset out of bounds".to_string())
ArrowError::InvalidArgumentError(format!(
"range {current_offset}..{next_offset} is invalid or out of bounds"
))
})?;

if let Some(prev_val) = prev_value {
Expand All @@ -281,13 +276,10 @@ impl<'m> VariantMetadata<'m> {
// Since shallow validation ensures the first and last offsets are in bounds,
// we can also verify all offsets are in-bounds by checking if
// offsets are monotonically increasing
for next_offset in offsets_iter {
if next_offset <= current_offset {
return Err(ArrowError::InvalidArgumentError(
"offsets not monotonically increasing".to_string(),
));
}
current_offset = next_offset;
if !offsets.is_sorted_by(|a, b| a < b) {
return Err(ArrowError::InvalidArgumentError(
"offsets not monotonically increasing".to_string(),
));
}
}

Expand Down Expand Up @@ -531,4 +523,44 @@ mod tests {
"unexpected error: {err:?}"
);
}

#[test]
fn empty_string_is_valid() {
let bytes = &[
0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
1,
0x00,
0x00,
];
let metadata = VariantMetadata::try_new(bytes).unwrap();
assert_eq!(&metadata[0], "");

let bytes = &[
0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
2,
0x00,
0x00,
0x02,
b'h',
b'i',
];
let metadata = VariantMetadata::try_new(bytes).unwrap();
assert_eq!(&metadata[0], "");
assert_eq!(&metadata[1], "hi");

let bytes = &[
0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1
2,
0x00,
0x02,
0x02, // empty string is allowed, but must be first in a sorted dict
b'h',
b'i',
];
let err = VariantMetadata::try_new(bytes).unwrap_err();
assert!(
matches!(err, ArrowError::InvalidArgumentError(_)),
"unexpected error: {err:?}"
);
}
}
Loading