-
Notifications
You must be signed in to change notification settings - Fork 1.1k
[Variant] Initial API for reading Variant data and metadata #7535
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 18 commits
81e3847
9075dd4
1ebbad7
b3ecdd9
9688fa4
5705687
d80d1d8
9aa5d3b
696fa9b
798b627
8bea9b4
2e262f1
715081b
b774b38
5c668b4
0ceaedc
130e113
45a71c6
032a9d6
fe2bf45
10fe6bf
992a35d
c2c6bfd
695f49d
fd055ef
be0a001
ed77dbf
6549631
dafc028
88137b0
5b4115b
362341e
6946299
d3426ab
c507efb
7d2b06d
a9354c4
a68828b
f4b8f58
1358eed
1273a6d
3fad3eb
e203359
026774d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,149 @@ | ||
| // NOTE: Largely based on the implementation of @PinkCrow007 in https://github.com/apache/arrow-rs/pull/7452 | ||
| // And the feedback there. | ||
mkarbo marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| use arrow_schema::ArrowError; | ||
| use std::{array::TryFromSliceError, str}; | ||
|
|
||
| use crate::utils::{array_from_slice, invalid_utf8_err, non_empty_slice, slice_from_slice}; | ||
|
|
||
| #[derive(Debug, Clone, Copy)] | ||
| pub enum VariantBasicType { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These seem pretty basic, should they be in the top-level lib.rs instead of here in decoder.rs?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, let's move them in a follow on PR |
||
| Primitive = 0, | ||
| ShortString = 1, | ||
| Object = 2, | ||
| Array = 3, | ||
| } | ||
|
|
||
| #[derive(Debug, Clone, Copy)] | ||
| pub enum VariantPrimitiveType { | ||
| Null = 0, | ||
| BooleanTrue = 1, | ||
| BooleanFalse = 2, | ||
mkarbo marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| Int8 = 3, | ||
| // TODO: Add types for the rest of primitives, once API is agreed upon | ||
| String = 16, | ||
| } | ||
|
|
||
| /// Extracts the basic type from a header byte | ||
| pub(crate) fn get_basic_type(header: u8) -> Result<VariantBasicType, ArrowError> { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Do we ever expect to return Also, out of curiosity, why is one a function and the other an |
||
| // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding | ||
| let basic_type = header & 0x03; // Basic type is encoded in the first 2 bits | ||
| let basic_type = match basic_type { | ||
| 0 => VariantBasicType::Primitive, | ||
| 1 => VariantBasicType::ShortString, | ||
| 2 => VariantBasicType::Object, | ||
| 3 => VariantBasicType::Array, | ||
| _ => { | ||
| return Err(ArrowError::InvalidArgumentError(format!( | ||
| "unknown basic type: {}", | ||
| basic_type | ||
| ))) | ||
| } | ||
mkarbo marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| }; | ||
| Ok(basic_type) | ||
| } | ||
|
|
||
| /// Extracts the primitive type from a header byte | ||
| pub(crate) fn get_primitive_type(header: u8) -> Result<VariantPrimitiveType, ArrowError> { | ||
|
||
| // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding | ||
| //// Primitive type is encoded in the last 6 bits of the header byte | ||
| let primitive_type = header >> 2; | ||
| let primitive_type = match primitive_type { | ||
| 0 => VariantPrimitiveType::Null, | ||
| 1 => VariantPrimitiveType::BooleanTrue, | ||
| 2 => VariantPrimitiveType::BooleanFalse, | ||
| 3 => VariantPrimitiveType::Int8, | ||
mkarbo marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| // TODO: Add types for the rest, once API is agreed upon | ||
| 16 => VariantPrimitiveType::String, | ||
| _ => { | ||
| return Err(ArrowError::InvalidArgumentError(format!( | ||
| "unknown primitive type: {}", | ||
| primitive_type | ||
| ))) | ||
| } | ||
| }; | ||
| Ok(primitive_type) | ||
| } | ||
|
|
||
| /// To be used in `map_err` when unpacking an integer from a slice of bytes. | ||
| fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError { | ||
| ArrowError::InvalidArgumentError(e.to_string()) | ||
| } | ||
|
Comment on lines
+82
to
+85
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it would be better actually to improve this conversion a bit to explain the error comes from trying to parse variants. If we just add a blanket |
||
|
|
||
| /// Decodes an Int8 from the value section of a variant. | ||
| pub(crate) fn decode_int8(value: &[u8]) -> Result<i8, ArrowError> { | ||
| let value = i8::from_le_bytes(array_from_slice(value, 1)?); | ||
| Ok(value) | ||
| } | ||
|
|
||
| /// Decodes a long string from the value section of a variant. | ||
| pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> { | ||
| let len = u32::from_le_bytes( | ||
| slice_from_slice(value, 1..5)? | ||
| .try_into() | ||
| .map_err(map_try_from_slice_error)?, | ||
| ) as usize; | ||
mkarbo marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| let string = | ||
| str::from_utf8(slice_from_slice(value, 5..5 + len)?).map_err(|_| invalid_utf8_err())?; | ||
|
||
| Ok(string) | ||
| } | ||
|
|
||
| /// Decodes a short string from the value section of a variant. | ||
| pub(crate) fn decode_short_string(value: &[u8]) -> Result<&str, ArrowError> { | ||
| let len = (non_empty_slice(value)?[0] >> 2) as usize; | ||
|
|
||
| let string = | ||
| str::from_utf8(slice_from_slice(value, 1..1 + len)?).map_err(|_| invalid_utf8_err())?; | ||
| Ok(string) | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
| mod tests { | ||
| use super::*; | ||
|
|
||
| #[test] | ||
| fn test_i8() -> Result<(), ArrowError> { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To really write tests like this it will be nice / easier if we have the variant builder API that @PinkCrow007 is working on
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, let me write these out a little more. I think having some hand-crafted tests (e.g., also for object) where we see how the bytes look will be helpful for readers as documentation, not just in validating. We can add test variants using both the builder & manual process. I personally learnt a lot by writing out the details with pen and paper for the metadata and value bytes for various variant data such as However, we can also skip them if you guys prefer, lmk. |
||
| let value = [ | ||
| 0 | 3 << 2, // Primitive type for i8 | ||
| 42, | ||
| ]; | ||
| let result = decode_int8(&value)?; | ||
| assert_eq!(result, 42); | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_short_string() -> Result<(), ArrowError> { | ||
| let value = [ | ||
| 1 | 5 << 2, // Basic type for short string | length of short string | ||
| 'H' as u8, | ||
| 'e' as u8, | ||
| 'l' as u8, | ||
| 'l' as u8, | ||
| 'o' as u8, | ||
| 'o' as u8, | ||
| ]; | ||
| let result = decode_short_string(&value)?; | ||
| assert_eq!(result, "Hello"); | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_string() -> Result<(), ArrowError> { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FWIW I don't think these unit tests are necessary given we have code that is reading the vaues from
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, let me write these out a little more. I think having some hand-crafted tests (e.g., also for object) where we see how the bytes look will be helpful for readers as documentation, not just in validating. We can add test variants using both the builder & manual process. I personally learnt a lot by writing out the details with pen and paper for the metadata and value bytes for various variant data such as However, we can also skip them if you guys prefer, lmk.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think they are ok, maybe we just shouldn't go too crazy. The other thing we could do is to make them examples in the documentation -- which would help both users and maintainers. |
||
| let value = [ | ||
| 0 | 16 << 2, // Basic type for short string | length of short string | ||
| 5, | ||
| 0, | ||
| 0, | ||
| 0, // Length of string | ||
| 'H' as u8, | ||
| 'e' as u8, | ||
| 'l' as u8, | ||
| 'l' as u8, | ||
| 'o' as u8, | ||
| 'o' as u8, | ||
| ]; | ||
| let result = decode_long_string(&value)?; | ||
| assert_eq!(result, "Hello"); | ||
| Ok(()) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,72 @@ | ||
| //! End-to-end check: (almost) every sample from apache/parquet-testing/variant | ||
| //! can be parsed into our `Variant`. | ||
| // NOTE: We keep this file separate rather than a test mod inside variant.rs because it should be | ||
mkarbo marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| // moved to the test folder later | ||
| use std::fs; | ||
| use std::path::{Path, PathBuf}; | ||
|
|
||
| use crate::variant::{Variant, VariantMetadata}; | ||
| use arrow_schema::ArrowError; | ||
|
|
||
| fn cases_dir() -> PathBuf { | ||
| Path::new(env!("CARGO_MANIFEST_DIR")) | ||
| .join("..") | ||
| .join("parquet-testing") | ||
| .join("variant") | ||
| } | ||
|
|
||
| fn load_case(name: &str) -> Result<(Vec<u8>, Vec<u8>), ArrowError> { | ||
| let root = cases_dir(); | ||
| let meta = fs::read(root.join(format!("{name}.metadata")))?; | ||
| let val = fs::read(root.join(format!("{name}.value")))?; | ||
| Ok((meta, val)) | ||
| } | ||
|
|
||
| fn get_primitive_cases() -> Vec<(&'static str, Variant<'static, 'static>)> { | ||
| vec![ | ||
| ("primitive_boolean_false", Variant::BooleanFalse), | ||
| ("primitive_boolean_true", Variant::BooleanTrue), | ||
| ("primitive_int8", Variant::Int8(42)), | ||
| // Using the From<String> trait | ||
| ("primitive_string", Variant::from("This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥\u{fe0f}, 🎣 and 🤦!!")), | ||
| // Using the From<String> trait | ||
| ("short_string", Variant::from("Less than 64 bytes (❤\u{fe0f} with utf8)")), | ||
| // TODO Reenable when https://github.com/apache/parquet-testing/issues/81 is fixed | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BTW I made a PR to fix this |
||
| // ("primitive_null", Variant::Null), | ||
| ] | ||
| } | ||
|
|
||
| fn get_non_primitive_cases() -> Vec<&'static str> { | ||
| vec!["object_primitive"] | ||
| } | ||
|
|
||
| #[test] | ||
| fn variant_primitive() -> Result<(), ArrowError> { | ||
| let cases = get_primitive_cases(); | ||
| for (case, want) in cases { | ||
| let (metadata_bytes, value) = load_case(case)?; | ||
| let metadata = VariantMetadata::try_new(&metadata_bytes)?; | ||
| let got = Variant::try_new(&metadata, &value)?; | ||
| assert_eq!(got, want); | ||
| } | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn variant_non_primitive() -> Result<(), ArrowError> { | ||
| let cases = get_non_primitive_cases(); | ||
| for case in cases { | ||
| let (metadata, value) = load_case(case)?; | ||
| let metadata = VariantMetadata::try_new(&metadata)?; | ||
| let variant = Variant::try_new(&metadata, &value)?; | ||
| match case { | ||
| "object_primitive" => { | ||
| assert!(matches!(variant, Variant::Object(_))); | ||
| assert_eq!(metadata.dictionary_size(), 7); | ||
| } | ||
| _ => unreachable!(), | ||
| } | ||
| } | ||
| Ok(()) | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,42 @@ | ||||||||||||||||||||||||||||||||||
| use std::{array::TryFromSliceError, ops::Range}; | ||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||
| use arrow_schema::ArrowError; | ||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||
| #[inline] | ||||||||||||||||||||||||||||||||||
| pub(crate) fn slice_from_slice(bytes: &[u8], range: Range<usize>) -> Result<&[u8], ArrowError> { | ||||||||||||||||||||||||||||||||||
| bytes.get(range.clone()).ok_or_else(|| { | ||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||
| ArrowError::InvalidArgumentError(format!( | ||||||||||||||||||||||||||||||||||
| "Tried to extract {} bytes at offset {} from {}-byte buffer", | ||||||||||||||||||||||||||||||||||
| range.end - range.start, | ||||||||||||||||||||||||||||||||||
| range.start, | ||||||||||||||||||||||||||||||||||
| bytes.len(), | ||||||||||||||||||||||||||||||||||
| )) | ||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||
| bytes.get(range.clone()).ok_or_else(|| { | |
| ArrowError::InvalidArgumentError(format!( | |
| "Tried to extract {} bytes at offset {} from {}-byte buffer", | |
| range.end - range.start, | |
| range.start, | |
| bytes.len(), | |
| )) | |
| bytes.get(range.clone()).ok_or_else(|| match range.end.checked_sub(range.start) { | |
| Some(len) => ArrowError::InvalidArgumentError(format!( | |
| "Tried to extract {len} bytes at offset {} from {}-byte buffer", | |
| range.start, | |
| bytes.len(), | |
| )), | |
| None => ArrowError::InvalidArgumentError(format!( | |
| "Invalid range: {}..{}", range.start, range.end, | |
| )), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Update: If we go for the SliceIndex approach that just prints {index:?} in the error message, we no longer need to worry about overflow.
Uh oh!
There was an error while loading. Please reload this page.