diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index cc7f0cffd4cf..843352d1ff01 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -95,7 +95,7 @@ impl VariantArray { }; // Ensure the StructArray has a metadata field of BinaryView - let Some(metadata_field) = VariantArray::find_metadata_field(&inner) else { + let Some(metadata_field) = VariantArray::find_metadata_field(inner) else { return Err(ArrowError::InvalidArgumentError( "Invalid VariantArray: StructArray must contain a 'metadata' field".to_string(), )); @@ -106,7 +106,7 @@ impl VariantArray { metadata_field.data_type() ))); } - let Some(value_field) = VariantArray::find_value_field(&inner) else { + let Some(value_field) = VariantArray::find_value_field(inner) else { return Err(ArrowError::InvalidArgumentError( "Invalid VariantArray: StructArray must contain a 'value' field".to_string(), )); diff --git a/parquet-variant/src/variant/metadata.rs b/parquet-variant/src/variant/metadata.rs index f957ebb6f15b..3477f5fbfbe4 100644 --- a/parquet-variant/src/variant/metadata.rs +++ b/parquet-variant/src/variant/metadata.rs @@ -240,28 +240,23 @@ impl<'m> VariantMetadata<'m> { let value_buffer = string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?; - let mut offsets_iter = map_bytes_to_offsets(offset_bytes, self.header.offset_size); - let mut current_offset = offsets_iter.next().unwrap_or(0); + let mut offsets = map_bytes_to_offsets(offset_bytes, self.header.offset_size); if self.header.is_sorted { // Validate the dictionary values are unique and lexicographically sorted // // Since we use the offsets to access dictionary values, this also validates // offsets are in-bounds and monotonically increasing + let mut current_offset = offsets.next().unwrap_or(0); let mut prev_value: Option<&str> = None; - - for next_offset in offsets_iter { - if next_offset <= current_offset { - return Err(ArrowError::InvalidArgumentError( - "offsets not monotonically increasing".to_string(), - )); - } - + for next_offset in offsets { let current_value = value_buffer .get(current_offset..next_offset) .ok_or_else(|| { - ArrowError::InvalidArgumentError("offset out of bounds".to_string()) + ArrowError::InvalidArgumentError(format!( + "range {current_offset}..{next_offset} is invalid or out of bounds" + )) })?; if let Some(prev_val) = prev_value { @@ -281,13 +276,10 @@ impl<'m> VariantMetadata<'m> { // Since shallow validation ensures the first and last offsets are in bounds, // we can also verify all offsets are in-bounds by checking if // offsets are monotonically increasing - for next_offset in offsets_iter { - if next_offset <= current_offset { - return Err(ArrowError::InvalidArgumentError( - "offsets not monotonically increasing".to_string(), - )); - } - current_offset = next_offset; + if !offsets.is_sorted_by(|a, b| a < b) { + return Err(ArrowError::InvalidArgumentError( + "offsets not monotonically increasing".to_string(), + )); } } @@ -531,4 +523,44 @@ mod tests { "unexpected error: {err:?}" ); } + + #[test] + fn empty_string_is_valid() { + let bytes = &[ + 0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1 + 1, + 0x00, + 0x00, + ]; + let metadata = VariantMetadata::try_new(bytes).unwrap(); + assert_eq!(&metadata[0], ""); + + let bytes = &[ + 0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1 + 2, + 0x00, + 0x00, + 0x02, + b'h', + b'i', + ]; + let metadata = VariantMetadata::try_new(bytes).unwrap(); + assert_eq!(&metadata[0], ""); + assert_eq!(&metadata[1], "hi"); + + let bytes = &[ + 0b0001_0001, // header: offset_size_minus_one=0, ordered=1, version=1 + 2, + 0x00, + 0x02, + 0x02, // empty string is allowed, but must be first in a sorted dict + b'h', + b'i', + ]; + let err = VariantMetadata::try_new(bytes).unwrap_err(); + assert!( + matches!(err, ArrowError::InvalidArgumentError(_)), + "unexpected error: {err:?}" + ); + } } diff --git a/parquet-variant/src/variant/object.rs b/parquet-variant/src/variant/object.rs index bce2ffc876b5..f730e630cb72 100644 --- a/parquet-variant/src/variant/object.rs +++ b/parquet-variant/src/variant/object.rs @@ -553,6 +553,19 @@ mod tests { assert_eq!(variant_obj.field(2).unwrap().as_string(), Some("hello")); } + #[test] + fn test_variant_object_empty_fields() { + let mut builder = VariantBuilder::new(); + builder.new_object().with_field("", 42).finish().unwrap(); + let (metadata, value) = builder.finish(); + + // Resulting object is valid and has a single empty field + let variant = Variant::try_new(&metadata, &value).unwrap(); + let variant_obj = variant.as_object().unwrap(); + assert_eq!(variant_obj.len(), 1); + assert_eq!(variant_obj.get(""), Some(Variant::from(42))); + } + #[test] fn test_variant_object_empty() { // Create metadata with no fields