-
Notifications
You must be signed in to change notification settings - Fork 1.1k
[Variant] Initial API for reading Variant data and metadata #7535
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 41 commits
81e3847
9075dd4
1ebbad7
b3ecdd9
9688fa4
5705687
d80d1d8
9aa5d3b
696fa9b
798b627
8bea9b4
2e262f1
715081b
b774b38
5c668b4
0ceaedc
130e113
45a71c6
032a9d6
fe2bf45
10fe6bf
992a35d
c2c6bfd
695f49d
fd055ef
be0a001
ed77dbf
6549631
dafc028
88137b0
5b4115b
362341e
6946299
d3426ab
c507efb
7d2b06d
a9354c4
a68828b
f4b8f58
1358eed
1273a6d
3fad3eb
e203359
026774d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,158 @@ | ||||||||||||
| // Licensed to the Apache Software Foundation (ASF) under one | ||||||||||||
| // or more contributor license agreements. See the NOTICE file | ||||||||||||
| // distributed with this work for additional information | ||||||||||||
| // regarding copyright ownership. The ASF licenses this file | ||||||||||||
| // to you under the Apache License, Version 2.0 (the | ||||||||||||
| // "License"); you may not use this file except in compliance | ||||||||||||
| // with the License. You may obtain a copy of the License at | ||||||||||||
| // | ||||||||||||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||||||||||||
| // | ||||||||||||
| // Unless required by applicable law or agreed to in writing, | ||||||||||||
| // software distributed under the License is distributed on an | ||||||||||||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||||||||||||
| // KIND, either express or implied. See the License for the | ||||||||||||
| // specific language governing permissions and limitations | ||||||||||||
| // under the License. | ||||||||||||
| use arrow_schema::ArrowError; | ||||||||||||
| use std::array::TryFromSliceError; | ||||||||||||
|
|
||||||||||||
| use crate::utils::{array_from_slice, first_byte_from_slice, slice_from_slice, string_from_slice}; | ||||||||||||
|
|
||||||||||||
| #[derive(Debug, Clone, Copy)] | ||||||||||||
| pub enum VariantBasicType { | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These seem pretty basic, should they be in the top-level lib.rs instead of here in decoder.rs?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, let's move them in a follow on PR |
||||||||||||
| Primitive = 0, | ||||||||||||
| ShortString = 1, | ||||||||||||
| Object = 2, | ||||||||||||
| Array = 3, | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| #[derive(Debug, Clone, Copy)] | ||||||||||||
| pub enum VariantPrimitiveType { | ||||||||||||
| Null = 0, | ||||||||||||
| BooleanTrue = 1, | ||||||||||||
| BooleanFalse = 2, | ||||||||||||
mkarbo marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||
| Int8 = 3, | ||||||||||||
| // TODO: Add types for the rest of primitives, once API is agreed upon | ||||||||||||
| String = 16, | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| /// Extracts the basic type from a header byte | ||||||||||||
| pub(crate) fn get_basic_type(header: u8) -> Result<VariantBasicType, ArrowError> { | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Do we ever expect to return Also, out of curiosity, why is one a function and the other an |
||||||||||||
| // See https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-encoding | ||||||||||||
| let basic_type = header & 0x03; // Basic type is encoded in the first 2 bits | ||||||||||||
| let basic_type = match basic_type { | ||||||||||||
| 0 => VariantBasicType::Primitive, | ||||||||||||
| 1 => VariantBasicType::ShortString, | ||||||||||||
| 2 => VariantBasicType::Object, | ||||||||||||
| 3 => VariantBasicType::Array, | ||||||||||||
| _ => { | ||||||||||||
| //NOTE: A 2-bit value has a max of 4 different values (0-3), hence this is unreachable as we | ||||||||||||
| // masked `basic_type` with 0x03 above. | ||||||||||||
| unreachable!(); | ||||||||||||
| } | ||||||||||||
| }; | ||||||||||||
| Ok(basic_type) | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| impl TryFrom<u8> for VariantPrimitiveType { | ||||||||||||
| type Error = ArrowError; | ||||||||||||
|
|
||||||||||||
| fn try_from(value: u8) -> Result<Self, Self::Error> { | ||||||||||||
| match value { | ||||||||||||
| 0 => Ok(VariantPrimitiveType::Null), | ||||||||||||
| 1 => Ok(VariantPrimitiveType::BooleanTrue), | ||||||||||||
| 2 => Ok(VariantPrimitiveType::BooleanFalse), | ||||||||||||
| 3 => Ok(VariantPrimitiveType::Int8), | ||||||||||||
| // TODO: Add types for the rest, once API is agreed upon | ||||||||||||
| 16 => Ok(VariantPrimitiveType::String), | ||||||||||||
| _ => Err(ArrowError::InvalidArgumentError(format!( | ||||||||||||
| "unknown primitive type: {}", | ||||||||||||
| value | ||||||||||||
| ))), | ||||||||||||
| } | ||||||||||||
| } | ||||||||||||
| } | ||||||||||||
| /// Extract the primitive type from a Variant value-header byte | ||||||||||||
| pub(crate) fn get_primitive_type(header: u8) -> Result<VariantPrimitiveType, ArrowError> { | ||||||||||||
| // last 6 bits contain the primitive-type, see spec | ||||||||||||
| VariantPrimitiveType::try_from(header >> 2) | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| /// To be used in `map_err` when unpacking an integer from a slice of bytes. | ||||||||||||
| fn map_try_from_slice_error(e: TryFromSliceError) -> ArrowError { | ||||||||||||
| ArrowError::InvalidArgumentError(e.to_string()) | ||||||||||||
| } | ||||||||||||
|
Comment on lines
+82
to
+85
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it would be better actually to improve this conversion a bit to explain the error comes from trying to parse variants. If we just add a blanket |
||||||||||||
|
|
||||||||||||
| /// Decodes an Int8 from the value section of a variant. | ||||||||||||
| pub(crate) fn decode_int8(value: &[u8]) -> Result<i8, ArrowError> { | ||||||||||||
| let value = i8::from_le_bytes(array_from_slice(value, 1)?); | ||||||||||||
| Ok(value) | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| /// Decodes a long string from the value section of a variant. | ||||||||||||
| pub(crate) fn decode_long_string(value: &[u8]) -> Result<&str, ArrowError> { | ||||||||||||
| let len = u32::from_le_bytes(array_from_slice(value, 1)?) as usize; | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit:
Suggested change
|
||||||||||||
| let string = string_from_slice(value, 5..5 + len)?; | ||||||||||||
| Ok(string) | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| /// Decodes a short string from the value section of a variant. | ||||||||||||
| pub(crate) fn decode_short_string(value: &[u8]) -> Result<&str, ArrowError> { | ||||||||||||
| let len = (first_byte_from_slice(value)? >> 2) as usize; | ||||||||||||
|
|
||||||||||||
| let string = string_from_slice(value, 1..1 + len)?; | ||||||||||||
|
Comment on lines
+102
to
+104
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit:
Suggested change
|
||||||||||||
| Ok(string) | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| #[cfg(test)] | ||||||||||||
| mod tests { | ||||||||||||
| use super::*; | ||||||||||||
|
|
||||||||||||
| #[test] | ||||||||||||
| fn test_i8() -> Result<(), ArrowError> { | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To really write tests like this it will be nice / easier if we have the variant builder API that @PinkCrow007 is working on
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, let me write these out a little more. I think having some hand-crafted tests (e.g., also for object) where we see how the bytes look will be helpful for readers as documentation, not just in validating. We can add test variants using both the builder & manual process. I personally learnt a lot by writing out the details with pen and paper for the metadata and value bytes for various variant data such as However, we can also skip them if you guys prefer, lmk. |
||||||||||||
| let value = [ | ||||||||||||
| 0 | 3 << 2, // Primitive type for i8 | ||||||||||||
| 42, | ||||||||||||
| ]; | ||||||||||||
| let result = decode_int8(&value)?; | ||||||||||||
| assert_eq!(result, 42); | ||||||||||||
| Ok(()) | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| #[test] | ||||||||||||
| fn test_short_string() -> Result<(), ArrowError> { | ||||||||||||
| let value = [ | ||||||||||||
| 1 | 5 << 2, // Basic type for short string | length of short string | ||||||||||||
| 'H' as u8, | ||||||||||||
| 'e' as u8, | ||||||||||||
| 'l' as u8, | ||||||||||||
| 'l' as u8, | ||||||||||||
| 'o' as u8, | ||||||||||||
| 'o' as u8, | ||||||||||||
| ]; | ||||||||||||
| let result = decode_short_string(&value)?; | ||||||||||||
| assert_eq!(result, "Hello"); | ||||||||||||
| Ok(()) | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| #[test] | ||||||||||||
| fn test_string() -> Result<(), ArrowError> { | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FWIW I don't think these unit tests are necessary given we have code that is reading the vaues from
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, let me write these out a little more. I think having some hand-crafted tests (e.g., also for object) where we see how the bytes look will be helpful for readers as documentation, not just in validating. We can add test variants using both the builder & manual process. I personally learnt a lot by writing out the details with pen and paper for the metadata and value bytes for various variant data such as However, we can also skip them if you guys prefer, lmk.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think they are ok, maybe we just shouldn't go too crazy. The other thing we could do is to make them examples in the documentation -- which would help both users and maintainers. |
||||||||||||
| let value = [ | ||||||||||||
| 0 | 16 << 2, // Basic type for short string | length of short string | ||||||||||||
| 5, | ||||||||||||
| 0, | ||||||||||||
| 0, | ||||||||||||
| 0, // Length of string | ||||||||||||
| 'H' as u8, | ||||||||||||
| 'e' as u8, | ||||||||||||
| 'l' as u8, | ||||||||||||
| 'l' as u8, | ||||||||||||
| 'o' as u8, | ||||||||||||
| 'o' as u8, | ||||||||||||
| ]; | ||||||||||||
| let result = decode_long_string(&value)?; | ||||||||||||
| assert_eq!(result, "Hello"); | ||||||||||||
| Ok(()) | ||||||||||||
| } | ||||||||||||
| } | ||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,100 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| //! End-to-end check: (almost) every sample from apache/parquet-testing/variant | ||
| //! can be parsed into our `Variant`. | ||
|
|
||
| // NOTE: We keep this file separate rather than a test mod inside variant.rs because it should be | ||
mkarbo marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| // moved to the test folder later | ||
| use std::fs; | ||
| use std::path::{Path, PathBuf}; | ||
|
|
||
| use crate::variant::{Variant, VariantMetadata}; | ||
| use arrow_schema::ArrowError; | ||
|
|
||
| fn cases_dir() -> PathBuf { | ||
| Path::new(env!("CARGO_MANIFEST_DIR")) | ||
| .join("..") | ||
| .join("parquet-testing") | ||
| .join("variant") | ||
| } | ||
|
|
||
| fn load_case(name: &str) -> Result<(Vec<u8>, Vec<u8>), ArrowError> { | ||
| let root = cases_dir(); | ||
| let meta = fs::read(root.join(format!("{name}.metadata")))?; | ||
| let val = fs::read(root.join(format!("{name}.value")))?; | ||
| Ok((meta, val)) | ||
| } | ||
|
|
||
| fn get_primitive_cases() -> Vec<(&'static str, Variant<'static, 'static>)> { | ||
| vec![ | ||
| ("primitive_boolean_false", Variant::BooleanFalse), | ||
| ("primitive_boolean_true", Variant::BooleanTrue), | ||
| ("primitive_int8", Variant::Int8(42)), | ||
| // Using the From<String> trait | ||
| ("primitive_string", Variant::from("This string is longer than 64 bytes and therefore does not fit in a short_string and it also includes several non ascii characters such as 🐢, 💖, ♥\u{fe0f}, 🎣 and 🤦!!")), | ||
| // Using the From<String> trait | ||
| ("short_string", Variant::from("Less than 64 bytes (❤\u{fe0f} with utf8)")), | ||
| // TODO Reenable when https://github.com/apache/parquet-testing/issues/81 is fixed | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BTW I made a PR to fix this |
||
| // ("primitive_null", Variant::Null), | ||
| ] | ||
| } | ||
|
|
||
| fn get_non_primitive_cases() -> Vec<&'static str> { | ||
| vec!["object_primitive", "array_primitive"] | ||
| } | ||
|
|
||
| #[test] | ||
| fn variant_primitive() -> Result<(), ArrowError> { | ||
| let cases = get_primitive_cases(); | ||
| for (case, want) in cases { | ||
| let (metadata_bytes, value) = load_case(case)?; | ||
| let metadata = VariantMetadata::try_new(&metadata_bytes)?; | ||
| let got = Variant::try_new(&metadata, &value)?; | ||
| assert_eq!(got, want); | ||
| } | ||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn variant_non_primitive() -> Result<(), ArrowError> { | ||
| let cases = get_non_primitive_cases(); | ||
| for case in cases { | ||
| let (metadata, value) = load_case(case)?; | ||
| let metadata = VariantMetadata::try_new(&metadata)?; | ||
| let variant = Variant::try_new(&metadata, &value)?; | ||
| match case { | ||
| "object_primitive" => { | ||
| assert!(matches!(variant, Variant::Object(_))); | ||
| assert_eq!(metadata.dictionary_size(), 7); | ||
| let dict_val = metadata.get_field_by(0)?; | ||
| assert_eq!(dict_val, "int_field"); | ||
| } | ||
| "array_primitive" => match variant { | ||
| Variant::Array(arr) => { | ||
| let v = arr.get(0)?; | ||
| assert!(matches!(v, Variant::Int8(2))); | ||
| let v = arr.get(1)?; | ||
| assert!(matches!(v, Variant::Int8(1))); | ||
| } | ||
| _ => panic!("expected an array"), | ||
| }, | ||
| _ => unreachable!(), | ||
| } | ||
| } | ||
| Ok(()) | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,55 @@ | ||||||||||||||||||||||||||||||||||
| // Licensed to the Apache Software Foundation (ASF) under one | ||||||||||||||||||||||||||||||||||
| // or more contributor license agreements. See the NOTICE file | ||||||||||||||||||||||||||||||||||
| // distributed with this work for additional information | ||||||||||||||||||||||||||||||||||
| // regarding copyright ownership. The ASF licenses this file | ||||||||||||||||||||||||||||||||||
| // to you under the Apache License, Version 2.0 (the | ||||||||||||||||||||||||||||||||||
| // "License"); you may not use this file except in compliance | ||||||||||||||||||||||||||||||||||
| // with the License. You may obtain a copy of the License at | ||||||||||||||||||||||||||||||||||
| // | ||||||||||||||||||||||||||||||||||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||||||||||||||||||||||||||||||||||
| // | ||||||||||||||||||||||||||||||||||
| // Unless required by applicable law or agreed to in writing, | ||||||||||||||||||||||||||||||||||
| // software distributed under the License is distributed on an | ||||||||||||||||||||||||||||||||||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||||||||||||||||||||||||||||||||||
| // KIND, either express or implied. See the License for the | ||||||||||||||||||||||||||||||||||
| // specific language governing permissions and limitations | ||||||||||||||||||||||||||||||||||
| // under the License. | ||||||||||||||||||||||||||||||||||
| use std::{array::TryFromSliceError, ops::Range, str}; | ||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||
| use arrow_schema::ArrowError; | ||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||
| #[inline] | ||||||||||||||||||||||||||||||||||
| pub(crate) fn slice_from_slice(bytes: &[u8], range: Range<usize>) -> Result<&[u8], ArrowError> { | ||||||||||||||||||||||||||||||||||
| bytes.get(range.clone()).ok_or_else(|| { | ||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||
| ArrowError::InvalidArgumentError(format!( | ||||||||||||||||||||||||||||||||||
| "Tried to extract {} bytes at offset {} from {}-byte buffer", | ||||||||||||||||||||||||||||||||||
| range.end - range.start, | ||||||||||||||||||||||||||||||||||
| range.start, | ||||||||||||||||||||||||||||||||||
| bytes.len(), | ||||||||||||||||||||||||||||||||||
| )) | ||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||
| bytes.get(range.clone()).ok_or_else(|| { | |
| ArrowError::InvalidArgumentError(format!( | |
| "Tried to extract {} bytes at offset {} from {}-byte buffer", | |
| range.end - range.start, | |
| range.start, | |
| bytes.len(), | |
| )) | |
| bytes.get(range.clone()).ok_or_else(|| match range.end.checked_sub(range.start) { | |
| Some(len) => ArrowError::InvalidArgumentError(format!( | |
| "Tried to extract {len} bytes at offset {} from {}-byte buffer", | |
| range.start, | |
| bytes.len(), | |
| )), | |
| None => ArrowError::InvalidArgumentError(format!( | |
| "Invalid range: {}..{}", range.start, range.end, | |
| )), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Update: If we go for the SliceIndex approach that just prints {index:?} in the error message, we no longer need to worry about overflow.
Uh oh!
There was an error while loading. Please reload this page.