Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
190 changes: 181 additions & 9 deletions parquet-variant/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ impl ValueBuffer {
self.0.len()
}

fn append_value<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T) {
fn append_non_nested_value<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T) {
let variant = value.into();
match variant {
Variant::Null => self.append_null(),
Expand Down Expand Up @@ -212,7 +212,9 @@ impl ValueBuffer {
Variant::String(s) => self.append_string(s),
Variant::ShortString(s) => self.append_short_string(s),
Variant::Object(_) | Variant::List(_) => {
todo!("How does this work with the redesign?");
unreachable!(
"Nested values are handled specially by ObjectBuilder and ListBuilder"
);
}
}
}
Expand Down Expand Up @@ -414,7 +416,7 @@ impl VariantBuilder {
}

pub fn append_value<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T) {
self.buffer.append_value(value);
self.buffer.append_non_nested_value(value);
}

pub fn finish(self) -> (Vec<u8>, Vec<u8>) {
Expand Down Expand Up @@ -477,6 +479,7 @@ pub struct ListBuilder<'a> {
metadata_builder: &'a mut MetadataBuilder,
offsets: Vec<usize>,
buffer: ValueBuffer,
/// Is there a pending nested object or list that needs to be finalized?
pending: bool,
}

Expand Down Expand Up @@ -523,7 +526,7 @@ impl<'a> ListBuilder<'a> {
pub fn append_value<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T) {
self.check_new_offset();

self.buffer.append_value(value);
self.buffer.append_non_nested_value(value);
let element_end = self.buffer.offset();
self.offsets.push(element_end);
}
Expand Down Expand Up @@ -574,20 +577,23 @@ impl<'a> ListBuilder<'a> {
/// A builder for creating [`Variant::Object`] values.
///
/// See the examples on [`VariantBuilder`] for usage.
pub struct ObjectBuilder<'a> {
pub struct ObjectBuilder<'a, 'b> {
parent_buffer: &'a mut ValueBuffer,
metadata_builder: &'a mut MetadataBuilder,
fields: BTreeMap<u32, usize>, // (field_id, offset)
buffer: ValueBuffer,
/// Is there a pending list or object that needs to be finalized?
pending: Option<(&'b str, usize)>,
}

impl<'a> ObjectBuilder<'a> {
impl<'a, 'b> ObjectBuilder<'a, 'b> {
fn new(parent_buffer: &'a mut ValueBuffer, metadata_builder: &'a mut MetadataBuilder) -> Self {
Self {
parent_buffer,
metadata_builder,
fields: BTreeMap::new(),
buffer: ValueBuffer::default(),
pending: None,
}
}

Expand All @@ -600,11 +606,50 @@ impl<'a> ObjectBuilder<'a> {
let field_start = self.buffer.offset();

self.fields.insert(field_id, field_start);
self.buffer.append_value(value);
self.buffer.append_non_nested_value(value);
}

/// Finalize object with sorted fields
pub fn finish(self) {
fn check_pending_field(&mut self) {
let Some((field_name, field_start)) = self.pending.as_ref() else {
return;
};

let field_id = self.metadata_builder.add_field_name(field_name);
self.fields.insert(field_id, *field_start);

self.pending = None;
}

/// Return a new [`ObjectBuilder`] to add a nested object with the specified
/// key to the object.
pub fn new_object(&mut self, key: &'b str) -> ObjectBuilder {
self.check_pending_field();

let field_start = self.buffer.offset();
let obj_builder = ObjectBuilder::new(&mut self.buffer, self.metadata_builder);
self.pending = Some((key, field_start));

obj_builder
}

/// Return a new [`ListBuilder`] to add a list with the specified key to the
/// object.
pub fn new_list(&mut self, key: &'b str) -> ListBuilder {
self.check_pending_field();

let field_start = self.buffer.offset();
let list_builder = ListBuilder::new(&mut self.buffer, self.metadata_builder);
self.pending = Some((key, field_start));

list_builder
}

/// Finalize object
///
/// This consumes self and writes the object to the parent buffer.
pub fn finish(mut self) {
self.check_pending_field();

let data_size = self.buffer.offset();
let num_fields = self.fields.len();
let is_large = num_fields > u8::MAX as usize;
Expand Down Expand Up @@ -1192,4 +1237,131 @@ mod tests {

assert_eq!(list.get(4).unwrap(), Variant::from(3));
}

#[test]
fn test_nested_object() {
/*
{
"c": {
"b": "a"
}
}

*/

let mut builder = VariantBuilder::new();
{
let mut outer_object_builder = builder.new_object();
{
let mut inner_object_builder = outer_object_builder.new_object("c");
inner_object_builder.insert("b", "a");
inner_object_builder.finish();
}

outer_object_builder.finish();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at these examples, it seems to me like the current API requires callingfinish -- it seems like it might be a nicer API if that just happened on drop() -- so you could use the scope of the example, like you have here, and the dropping would just complete the object

This would be a great follow on PR

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

}

let (metadata, value) = builder.finish();
let variant = Variant::try_new(&metadata, &value).unwrap();
let outer_object = variant.as_object().unwrap();

assert_eq!(outer_object.len(), 1);
assert_eq!(outer_object.field_name(0).unwrap(), "c");

let inner_object_variant = outer_object.field(0).unwrap();
let inner_object = inner_object_variant.as_object().unwrap();

assert_eq!(inner_object.len(), 1);
assert_eq!(inner_object.field_name(0).unwrap(), "b");
assert_eq!(inner_object.field(0).unwrap(), Variant::from("a"));
}

#[test]
fn test_nested_object_with_duplicate_field_names_per_object() {
/*
{
"c": {
"c": "a"
}
}

*/

let mut builder = VariantBuilder::new();
{
let mut outer_object_builder = builder.new_object();
{
let mut inner_object_builder = outer_object_builder.new_object("c");
inner_object_builder.insert("c", "a");
inner_object_builder.finish();
}

outer_object_builder.finish();
}

let (metadata, value) = builder.finish();
let variant = Variant::try_new(&metadata, &value).unwrap();
let outer_object = variant.as_object().unwrap();

assert_eq!(outer_object.len(), 1);
assert_eq!(outer_object.field_name(0).unwrap(), "c");

let inner_object_variant = outer_object.field(0).unwrap();
let inner_object = inner_object_variant.as_object().unwrap();

assert_eq!(inner_object.len(), 1);
assert_eq!(inner_object.field_name(0).unwrap(), "c");
assert_eq!(inner_object.field(0).unwrap(), Variant::from("a"));
}

#[test]
fn test_nested_object_with_lists() {
/*
{
"door 1": {
"items": ["apple", false ]
}
}

*/

let mut builder = VariantBuilder::new();
{
let mut outer_object_builder = builder.new_object();
{
let mut inner_object_builder = outer_object_builder.new_object("door 1");

{
let mut inner_object_list_builder = inner_object_builder.new_list("items");
inner_object_list_builder.append_value("apple");
inner_object_list_builder.append_value(false);
inner_object_list_builder.finish();
}

inner_object_builder.finish();
}

outer_object_builder.finish();
}

let (metadata, value) = builder.finish();
let variant = Variant::try_new(&metadata, &value).unwrap();
let outer_object = variant.as_object().unwrap();

assert_eq!(outer_object.len(), 1);
assert_eq!(outer_object.field_name(0).unwrap(), "door 1");

let inner_object_variant = outer_object.field(0).unwrap();
let inner_object = inner_object_variant.as_object().unwrap();

assert_eq!(inner_object.len(), 1);
assert_eq!(inner_object.field_name(0).unwrap(), "items");

let items_variant = inner_object.field(0).unwrap();
let items_list = items_variant.as_list().unwrap();

assert_eq!(items_list.len(), 2);
assert_eq!(items_list.get(0).unwrap(), Variant::from("apple"));
assert_eq!(items_list.get(1).unwrap(), Variant::from(false));
}
}
Loading