Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions parquet/src/arrow/array_reader/byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,8 @@ impl ByteArrayDecoderPlain {
output: &mut OffsetBuffer<I>,
len: usize,
) -> Result<usize> {
let initial_values_length = output.values.len();

let to_read = len.min(self.remaining_values);
output.offsets.reserve(to_read);

Expand Down Expand Up @@ -321,6 +323,10 @@ impl ByteArrayDecoderPlain {
read += 1;
}
self.remaining_values -= to_read;

if self.validate_utf8 {
output.values_as_str(initial_values_length)?;
}
Ok(to_read)
}
}
Expand Down Expand Up @@ -355,6 +361,8 @@ impl ByteArrayDecoderDeltaLength {
output: &mut OffsetBuffer<I>,
len: usize,
) -> Result<usize> {
let initial_values_length = output.values.len();

let to_read = len.min(self.lengths.len() - self.length_offset);
output.offsets.reserve(to_read);

Expand All @@ -381,6 +389,10 @@ impl ByteArrayDecoderDeltaLength {

self.data_offset += start_offset;
self.length_offset += to_read;

if self.validate_utf8 {
output.values_as_str(initial_values_length)?;
}
Ok(to_read)
}
}
Expand Down Expand Up @@ -424,6 +436,7 @@ impl ByteArrayDecoderDelta {
output: &mut OffsetBuffer<I>,
len: usize,
) -> Result<usize> {
let initial_values_length = output.values.len();
assert_eq!(self.prefix_lengths.len(), self.suffix_lengths.len());

let to_read = len.min(self.prefix_lengths.len() - self.length_offset);
Expand Down Expand Up @@ -455,6 +468,10 @@ impl ByteArrayDecoderDelta {
}

self.length_offset += to_read;

if self.validate_utf8 {
output.values_as_str(initial_values_length)?;
}
Ok(to_read)
}
}
Expand Down
31 changes: 25 additions & 6 deletions parquet/src/arrow/array_reader/offset_buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,22 @@ impl<I: OffsetSizeTrait + ScalarValue> OffsetBuffer<I> {
self.offsets.len() - 1
}

/// If `validate_utf8` this verifies that the first character of `data` is
/// the start of a UTF-8 codepoint
///
/// Note: This does not verify that the entirety of `data` is valid
/// UTF-8. This should be done by calling [`Self::values_as_str`] after
/// all data has been written
pub fn try_push(&mut self, data: &[u8], validate_utf8: bool) -> Result<()> {
if validate_utf8 {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for anyone else following along, I double checked the code and validate_utf8 is disabled for DataType::Binary as one would expect. It is always enabled for DataType::Utf8

if let Err(e) = std::str::from_utf8(data) {
return Err(ParquetError::General(format!(
"encountered non UTF-8 data: {}",
e
)));
if let Some(&b) = data.first() {
// A valid code-point iff it does not start with 0b10xxxxxx
// Bit-magic taken from `std::str::is_char_boundary`
if (b as i8) < -0x40 {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

return Err(ParquetError::General(
"encountered non UTF-8 data".to_string(),
));
}
}
}

Expand Down Expand Up @@ -68,6 +77,16 @@ impl<I: OffsetSizeTrait + ScalarValue> OffsetBuffer<I> {
Ok(())
}

/// Interprets the values buffer, starting at byte `start_offset`, as a
/// UTF-8 string slice
///
/// Returns a [`ParquetError::General`] describing the failure if the
/// byte range is not valid UTF-8
pub fn values_as_str(&self, start_offset: usize) -> Result<&str> {
    // Validate only the suffix written since `start_offset`; earlier
    // bytes are assumed to have been checked by a previous call.
    let suffix = &self.values.as_slice()[start_offset..];
    match std::str::from_utf8(suffix) {
        Ok(s) => Ok(s),
        Err(e) => Err(ParquetError::General(format!(
            "encountered non UTF-8 data: {}",
            e
        ))),
    }
}

pub fn into_array(
self,
null_buffer: Option<Buffer>,
Expand All @@ -84,7 +103,7 @@ impl<I: OffsetSizeTrait + ScalarValue> OffsetBuffer<I> {

let data = match cfg!(debug_assertions) {
true => array_data_builder.build().unwrap(),
false => unsafe { array_data_builder.build_unchecked() }
false => unsafe { array_data_builder.build_unchecked() },
};

make_array(data)
Expand Down