Skip to content

Commit 42b6c17

Browse files
scovichalamb
andauthored
[Variant] Reduce variant-related struct sizes (#7888)
# Which issue does this PR close? - Closes #7831 # Rationale for this change Variants naturally work with `u32` offsets, field ids, etc. Widening them artificially to `usize` on 64-bit architectures causes several problems: 1. A majority of developers will be using 64-bit architectures, and are unlikely to think about integer overflow issues when working with `usize`. But it's actually quite easy for malicious data or buggy code to overflow the `u32` values that variant actually relies on. Worse, it becomes difficult, if not impossible, to validate the code's resistance to 32-bit integer overflow, when manipulating `usize` values on 64-bit hardware. 2. Related to 1/, casting from `usize` to `u32` can clip the value on 64-bit hardware, which makes it harder to reason about the code's correctness (always wondering whether the value _might_ be larger than 32-bits can hold). In contrast, casting from `u32` to `usize` is safe in spite of being fallible in rust (assumes we do _not_ need to support 16-bit architectures). 3. The variant-related data structures occupy significantly more space than they need to, when storing (64-bit) `usize` offsets instead of `u32`. # What changes are included in this PR? Store all variant-related offsets as `u32` instead of `usize`. The `VariantMetadata`, `VariantObject` and `VariantList` structs shrink to 32/64/64 bytes (previously 40/88/80 bytes). Also, rename `OffsetSizeBytes::unpack_usize[_at_offset]` methods to `unpack_u32[_at_offset]`, to more accurately reflect what they actually do now. # Are these changes tested? Existing unit tests cover the use of these values; new static assertions will catch any future size changes. # Are there any user-facing changes? `VariantMetadata` is no longer `Copy`, reflecting the fact that this PR still leaves it 2x larger than a fat pointer. --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent 7c42a83 commit 42b6c17

File tree

8 files changed

+118
-114
lines changed

8 files changed

+118
-114
lines changed

parquet-variant-json/src/from_json.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ mod test {
165165
expected: Variant<'a, 'a>,
166166
}
167167

168-
impl<'a> JsonToVariantTest<'a> {
168+
impl JsonToVariantTest<'_> {
169169
fn run(self) -> Result<(), ArrowError> {
170170
let mut variant_builder = VariantBuilder::new();
171171
json_to_variant(self.json, &mut variant_builder)?;

parquet-variant/src/builder.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1932,7 +1932,6 @@ mod tests {
19321932
assert!(metadata.is_empty());
19331933

19341934
let variant = Variant::try_new_with_metadata(metadata, &value).unwrap();
1935-
assert!(metadata.is_empty());
19361935
assert_eq!(variant, Variant::Int8(42));
19371936
}
19381937

parquet-variant/src/decoder.rs

Lines changed: 25 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,6 @@ use crate::ShortString;
2222
use arrow_schema::ArrowError;
2323
use chrono::{DateTime, Duration, NaiveDate, NaiveDateTime, Utc};
2424

25-
use std::num::TryFromIntError;
26-
2725
/// The basic type of a [`Variant`] value, encoded in the first two bits of the
2826
/// header byte.
2927
///
@@ -147,11 +145,9 @@ impl OffsetSizeBytes {
147145
/// * `bytes` – the byte buffer to index
148146
/// * `index` – 0-based index into the buffer
149147
///
150-
/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
151-
/// Three-byte values are zero-extended to 32 bits before the final
152-
/// fallible cast to `usize`.
153-
pub(crate) fn unpack_usize(&self, bytes: &[u8], index: usize) -> Result<usize, ArrowError> {
154-
self.unpack_usize_at_offset(bytes, 0, index)
148+
/// Each value is `self as u32` bytes wide (1, 2, 3 or 4), zero-extended to 32 bits as needed.
149+
pub(crate) fn unpack_u32(&self, bytes: &[u8], index: usize) -> Result<u32, ArrowError> {
150+
self.unpack_u32_at_offset(bytes, 0, index)
155151
}
156152

157153
/// Return one unsigned little-endian value from `bytes`.
@@ -162,15 +158,13 @@ impl OffsetSizeBytes {
162158
/// * `offset_index` – 0-based index **after** the skipped bytes
163159
/// (`0` is the first value, `1` the next, …).
164160
///
165-
/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
166-
/// Three-byte values are zero-extended to 32 bits before the final
167-
/// fallible cast to `usize`.
168-
pub(crate) fn unpack_usize_at_offset(
161+
/// Each value is `self as u32` bytes wide (1, 2, 3 or 4), zero-extended to 32 bits as needed.
162+
pub(crate) fn unpack_u32_at_offset(
169163
&self,
170164
bytes: &[u8],
171165
byte_offset: usize, // how many bytes to skip
172166
offset_index: usize, // which offset in an array of offsets
173-
) -> Result<usize, ArrowError> {
167+
) -> Result<u32, ArrowError> {
174168
use OffsetSizeBytes::*;
175169

176170
// Index into the byte array:
@@ -179,7 +173,7 @@ impl OffsetSizeBytes {
179173
.checked_mul(*self as usize)
180174
.and_then(|n| n.checked_add(byte_offset))
181175
.ok_or_else(|| overflow_error("unpacking offset array value"))?;
182-
let result = match self {
176+
let value = match self {
183177
One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
184178
Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
185179
Three => {
@@ -192,11 +186,7 @@ impl OffsetSizeBytes {
192186
}
193187
Four => u32::from_le_bytes(array_from_slice(bytes, offset)?),
194188
};
195-
196-
// Convert the u32 we extracted to usize (should always succeed on 32- and 64-bit arch)
197-
result
198-
.try_into()
199-
.map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))
189+
Ok(value)
200190
}
201191
}
202192

@@ -518,57 +508,51 @@ mod tests {
518508
}
519509

520510
#[test]
521-
fn unpack_usize_all_widths() {
511+
fn unpack_u32_all_widths() {
522512
// One-byte offsets
523513
let buf_one = [0x01u8, 0xAB, 0xCD];
524-
assert_eq!(
525-
OffsetSizeBytes::One.unpack_usize(&buf_one, 0).unwrap(),
526-
0x01
527-
);
528-
assert_eq!(
529-
OffsetSizeBytes::One.unpack_usize(&buf_one, 2).unwrap(),
530-
0xCD
531-
);
514+
assert_eq!(OffsetSizeBytes::One.unpack_u32(&buf_one, 0).unwrap(), 0x01);
515+
assert_eq!(OffsetSizeBytes::One.unpack_u32(&buf_one, 2).unwrap(), 0xCD);
532516

533517
// Two-byte offsets (little-endian 0x1234, 0x5678)
534518
let buf_two = [0x34, 0x12, 0x78, 0x56];
535519
assert_eq!(
536-
OffsetSizeBytes::Two.unpack_usize(&buf_two, 0).unwrap(),
520+
OffsetSizeBytes::Two.unpack_u32(&buf_two, 0).unwrap(),
537521
0x1234
538522
);
539523
assert_eq!(
540-
OffsetSizeBytes::Two.unpack_usize(&buf_two, 1).unwrap(),
524+
OffsetSizeBytes::Two.unpack_u32(&buf_two, 1).unwrap(),
541525
0x5678
542526
);
543527

544528
// Three-byte offsets (0x030201 and 0x0000FF)
545529
let buf_three = [0x01, 0x02, 0x03, 0xFF, 0x00, 0x00];
546530
assert_eq!(
547-
OffsetSizeBytes::Three.unpack_usize(&buf_three, 0).unwrap(),
531+
OffsetSizeBytes::Three.unpack_u32(&buf_three, 0).unwrap(),
548532
0x030201
549533
);
550534
assert_eq!(
551-
OffsetSizeBytes::Three.unpack_usize(&buf_three, 1).unwrap(),
535+
OffsetSizeBytes::Three.unpack_u32(&buf_three, 1).unwrap(),
552536
0x0000FF
553537
);
554538

555539
// Four-byte offsets (0x12345678, 0x90ABCDEF)
556540
let buf_four = [0x78, 0x56, 0x34, 0x12, 0xEF, 0xCD, 0xAB, 0x90];
557541
assert_eq!(
558-
OffsetSizeBytes::Four.unpack_usize(&buf_four, 0).unwrap(),
542+
OffsetSizeBytes::Four.unpack_u32(&buf_four, 0).unwrap(),
559543
0x1234_5678
560544
);
561545
assert_eq!(
562-
OffsetSizeBytes::Four.unpack_usize(&buf_four, 1).unwrap(),
546+
OffsetSizeBytes::Four.unpack_u32(&buf_four, 1).unwrap(),
563547
0x90AB_CDEF
564548
);
565549
}
566550

567551
#[test]
568-
fn unpack_usize_out_of_bounds() {
552+
fn unpack_u32_out_of_bounds() {
569553
let tiny = [0x00u8]; // deliberately too short
570-
assert!(OffsetSizeBytes::Two.unpack_usize(&tiny, 0).is_err());
571-
assert!(OffsetSizeBytes::Three.unpack_usize(&tiny, 0).is_err());
554+
assert!(OffsetSizeBytes::Two.unpack_u32(&tiny, 0).is_err());
555+
assert!(OffsetSizeBytes::Three.unpack_u32(&tiny, 0).is_err());
572556
}
573557

574558
#[test]
@@ -584,20 +568,20 @@ mod tests {
584568
let width = OffsetSizeBytes::Two;
585569

586570
// dictionary_size starts immediately after the header byte
587-
let dict_size = width.unpack_usize_at_offset(&buf, 1, 0).unwrap();
571+
let dict_size = width.unpack_u32_at_offset(&buf, 1, 0).unwrap();
588572
assert_eq!(dict_size, 2);
589573

590574
// offset array immediately follows the dictionary size
591-
let first = width.unpack_usize_at_offset(&buf, 1, 1).unwrap();
575+
let first = width.unpack_u32_at_offset(&buf, 1, 1).unwrap();
592576
assert_eq!(first, 0);
593577

594-
let second = width.unpack_usize_at_offset(&buf, 1, 2).unwrap();
578+
let second = width.unpack_u32_at_offset(&buf, 1, 2).unwrap();
595579
assert_eq!(second, 5);
596580

597-
let third = width.unpack_usize_at_offset(&buf, 1, 3).unwrap();
581+
let third = width.unpack_u32_at_offset(&buf, 1, 3).unwrap();
598582
assert_eq!(third, 9);
599583

600-
let err = width.unpack_usize_at_offset(&buf, 1, 4);
584+
let err = width.unpack_u32_at_offset(&buf, 1, 4);
601585
assert!(err.is_err())
602586
}
603587
}

parquet-variant/src/utils.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,3 +122,12 @@ where
122122

123123
Some(Err(start))
124124
}
125+
126+
/// Verifies the expected size of type T, for a type that should only grow if absolutely necessary.
127+
#[allow(unused)]
128+
pub(crate) const fn expect_size_of<T>(expected: usize) {
129+
let size = std::mem::size_of::<T>();
130+
if size != expected {
131+
let _ = [""; 0][size];
132+
}
133+
}

parquet-variant/src/variant.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,9 @@ pub enum Variant<'m, 'v> {
256256
List(VariantList<'m, 'v>),
257257
}
258258

259+
// We don't want this to grow because it could hurt performance of a frequently-created type.
260+
const _: () = crate::utils::expect_size_of::<Variant>(80);
261+
259262
impl<'m, 'v> Variant<'m, 'v> {
260263
/// Attempts to interpret a metadata and value buffer pair as a new `Variant`.
261264
///

parquet-variant/src/variant/list.rs

Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ use crate::variant::{Variant, VariantMetadata};
2323
use arrow_schema::ArrowError;
2424

2525
// The value header occupies one byte; use a named constant for readability
26-
const NUM_HEADER_BYTES: usize = 1;
26+
const NUM_HEADER_BYTES: u32 = 1;
2727

2828
/// A parsed version of the variant array value header byte.
2929
#[derive(Debug, Clone, PartialEq)]
@@ -34,15 +34,15 @@ pub(crate) struct VariantListHeader {
3434

3535
impl VariantListHeader {
3636
// Hide the ugly casting
37-
const fn num_elements_size(&self) -> usize {
37+
const fn num_elements_size(&self) -> u32 {
3838
self.num_elements_size as _
3939
}
40-
const fn offset_size(&self) -> usize {
40+
const fn offset_size(&self) -> u32 {
4141
self.offset_size as _
4242
}
4343

4444
// Avoid materializing this offset, since it's cheaply and safely computable
45-
const fn first_offset_byte(&self) -> usize {
45+
const fn first_offset_byte(&self) -> u32 {
4646
NUM_HEADER_BYTES + self.num_elements_size()
4747
}
4848

@@ -122,11 +122,14 @@ pub struct VariantList<'m, 'v> {
122122
pub metadata: VariantMetadata<'m>,
123123
pub value: &'v [u8],
124124
header: VariantListHeader,
125-
num_elements: usize,
126-
first_value_byte: usize,
125+
num_elements: u32,
126+
first_value_byte: u32,
127127
validated: bool,
128128
}
129129

130+
// We don't want this to grow because it could increase the size of `Variant` and hurt performance.
131+
const _: () = crate::utils::expect_size_of::<VariantList>(64);
132+
130133
impl<'m, 'v> VariantList<'m, 'v> {
131134
/// Attempts to interpret `value` as a variant array value.
132135
///
@@ -157,7 +160,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
157160
let num_elements =
158161
header
159162
.num_elements_size
160-
.unpack_usize_at_offset(value, NUM_HEADER_BYTES, 0)?;
163+
.unpack_u32_at_offset(value, NUM_HEADER_BYTES as _, 0)?;
161164

162165
// (num_elements + 1) * offset_size + first_offset_byte
163166
let first_value_byte = num_elements
@@ -185,10 +188,10 @@ impl<'m, 'v> VariantList<'m, 'v> {
185188

186189
// Use the last offset to upper-bound the value buffer
187190
let last_offset = new_self
188-
.get_offset(num_elements)?
191+
.get_offset(num_elements as _)?
189192
.checked_add(first_value_byte)
190193
.ok_or_else(|| overflow_error("variant array size"))?;
191-
new_self.value = slice_from_slice(value, ..last_offset)?;
194+
new_self.value = slice_from_slice(value, ..last_offset as _)?;
192195
Ok(new_self)
193196
}
194197

@@ -210,7 +213,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
210213

211214
let offset_buffer = slice_from_slice(
212215
self.value,
213-
self.header.first_offset_byte()..self.first_value_byte,
216+
self.header.first_offset_byte() as _..self.first_value_byte as _,
214217
)?;
215218

216219
let offsets =
@@ -226,15 +229,15 @@ impl<'m, 'v> VariantList<'m, 'v> {
226229
));
227230
}
228231

229-
let value_buffer = slice_from_slice(self.value, self.first_value_byte..)?;
232+
let value_buffer = slice_from_slice(self.value, self.first_value_byte as _..)?;
230233

231234
// Validate whether values are valid variant objects
232235
for i in 1..offsets.len() {
233236
let start_offset = offsets[i - 1];
234237
let end_offset = offsets[i];
235238

236239
let value_bytes = slice_from_slice(value_buffer, start_offset..end_offset)?;
237-
Variant::try_new_with_metadata(self.metadata, value_bytes)?;
240+
Variant::try_new_with_metadata(self.metadata.clone(), value_bytes)?;
238241
}
239242

240243
self.validated = true;
@@ -244,7 +247,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
244247

245248
/// Return the length of this array
246249
pub fn len(&self) -> usize {
247-
self.num_elements
250+
self.num_elements as _
248251
}
249252

250253
/// Is the array of zero length
@@ -256,7 +259,7 @@ impl<'m, 'v> VariantList<'m, 'v> {
256259
///
257260
/// [invalid]: Self#Validation
258261
pub fn get(&self, index: usize) -> Option<Variant<'m, 'v>> {
259-
(index < self.num_elements).then(|| {
262+
(index < self.len()).then(|| {
260263
self.try_get_with_shallow_validation(index)
261264
.expect("Invalid variant array element")
262265
})
@@ -272,10 +275,10 @@ impl<'m, 'v> VariantList<'m, 'v> {
272275
fn try_get_with_shallow_validation(&self, index: usize) -> Result<Variant<'m, 'v>, ArrowError> {
273276
// Fetch the value bytes between the two offsets for this index, from the value array region
274277
// of the byte buffer
275-
let byte_range = self.get_offset(index)?..self.get_offset(index + 1)?;
278+
let byte_range = self.get_offset(index)? as _..self.get_offset(index + 1)? as _;
276279
let value_bytes =
277-
slice_from_slice_at_offset(self.value, self.first_value_byte, byte_range)?;
278-
Variant::try_new_with_metadata_and_shallow_validation(self.metadata, value_bytes)
280+
slice_from_slice_at_offset(self.value, self.first_value_byte as _, byte_range)?;
281+
Variant::try_new_with_metadata_and_shallow_validation(self.metadata.clone(), value_bytes)
279282
}
280283

281284
/// Iterates over the values of this list. When working with [unvalidated] input, consider
@@ -297,14 +300,14 @@ impl<'m, 'v> VariantList<'m, 'v> {
297300
fn iter_try_with_shallow_validation(
298301
&self,
299302
) -> impl Iterator<Item = Result<Variant<'m, 'v>, ArrowError>> + '_ {
300-
(0..self.len()).map(move |i| self.try_get_with_shallow_validation(i))
303+
(0..self.len()).map(|i| self.try_get_with_shallow_validation(i))
301304
}
302305

303306
// Attempts to retrieve the ith offset from the offset array region of the byte buffer.
304-
fn get_offset(&self, index: usize) -> Result<usize, ArrowError> {
305-
let byte_range = self.header.first_offset_byte()..self.first_value_byte;
307+
fn get_offset(&self, index: usize) -> Result<u32, ArrowError> {
308+
let byte_range = self.header.first_offset_byte() as _..self.first_value_byte as _;
306309
let offset_bytes = slice_from_slice(self.value, byte_range)?;
307-
self.header.offset_size.unpack_usize(offset_bytes, index)
310+
self.header.offset_size.unpack_u32(offset_bytes, index)
308311
}
309312
}
310313

@@ -623,7 +626,7 @@ mod tests {
623626
expected_num_element_size,
624627
variant_list.header.num_elements_size
625628
);
626-
assert_eq!(list_size, variant_list.num_elements);
629+
assert_eq!(list_size, variant_list.num_elements as usize);
627630

628631
// verify the data in the variant
629632
assert_eq!(list_size, variant_list.len());

0 commit comments

Comments
 (0)