@@ -19,7 +19,7 @@ use std::any::Any;
1919use std:: marker:: PhantomData ;
2020use std:: sync:: Arc ;
2121
22- use arrow_buffer:: { Buffer , BufferBuilder , NullBufferBuilder , ScalarBuffer } ;
22+ use arrow_buffer:: { Buffer , NullBufferBuilder , ScalarBuffer } ;
2323use arrow_data:: ByteView ;
2424use arrow_schema:: ArrowError ;
2525use hashbrown:: hash_table:: Entry ;
@@ -28,7 +28,7 @@ use hashbrown::HashTable;
2828use crate :: builder:: ArrayBuilder ;
2929use crate :: types:: bytes:: ByteArrayNativeType ;
3030use crate :: types:: { BinaryViewType , ByteViewType , StringViewType } ;
31- use crate :: { ArrayRef , GenericByteViewArray } ;
31+ use crate :: { Array , ArrayRef , GenericByteViewArray } ;
3232
3333const STARTING_BLOCK_SIZE : u32 = 8 * 1024 ; // 8KiB
3434const MAX_BLOCK_SIZE : u32 = 2 * 1024 * 1024 ; // 2MiB
@@ -79,7 +79,7 @@ impl BlockSizeGrowthStrategy {
7979/// using [`GenericByteViewBuilder::append_block`] and then views into this block appended
8080/// using [`GenericByteViewBuilder::try_append_view`]
8181pub struct GenericByteViewBuilder < T : ByteViewType + ?Sized > {
82- views_builder : BufferBuilder < u128 > ,
82+ views_buffer : Vec < u128 > ,
8383 null_buffer_builder : NullBufferBuilder ,
8484 completed : Vec < Buffer > ,
8585 in_progress : Vec < u8 > ,
@@ -99,7 +99,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
9999 /// Creates a new [`GenericByteViewBuilder`] with space for `capacity` string values.
100100 pub fn with_capacity ( capacity : usize ) -> Self {
101101 Self {
102- views_builder : BufferBuilder :: new ( capacity) ,
102+ views_buffer : Vec :: with_capacity ( capacity) ,
103103 null_buffer_builder : NullBufferBuilder :: new ( capacity) ,
104104 completed : vec ! [ ] ,
105105 in_progress : vec ! [ ] ,
@@ -148,7 +148,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
148148 pub fn with_deduplicate_strings ( self ) -> Self {
149149 Self {
150150 string_tracker : Some ( (
151- HashTable :: with_capacity ( self . views_builder . capacity ( ) ) ,
151+ HashTable :: with_capacity ( self . views_buffer . capacity ( ) ) ,
152152 Default :: default ( ) ,
153153 ) ) ,
154154 ..self
@@ -201,10 +201,42 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
201201 let b = b. get_unchecked ( start..end) ;
202202
203203 let view = make_view ( b, block, offset) ;
204- self . views_builder . append ( view) ;
204+ self . views_buffer . push ( view) ;
205205 self . null_buffer_builder . append_non_null ( ) ;
206206 }
207207
208+ /// Appends an array to the builder.
209+ /// This will flush any in-progress block and append the data buffers
210+ /// and add the (adapted) views.
211+ pub fn append_array ( & mut self , array : & GenericByteViewArray < T > ) {
212+ self . flush_in_progress ( ) ;
213+ // keep original views if this array is the first to be added or if there are no data buffers (all inline views)
214+ let keep_views = self . completed . is_empty ( ) || array. data_buffers ( ) . is_empty ( ) ;
215+ let starting_buffer = self . completed . len ( ) as u32 ;
216+
217+ self . completed . extend ( array. data_buffers ( ) . iter ( ) . cloned ( ) ) ;
218+
219+ if keep_views {
220+ self . views_buffer . extend_from_slice ( array. views ( ) ) ;
221+ } else {
222+ self . views_buffer . extend ( array. views ( ) . iter ( ) . map ( |v| {
223+ let mut byte_view = ByteView :: from ( * v) ;
224+ if byte_view. length > 12 {
225+ // Small views (<=12 bytes) are inlined, so only need to update large views
226+ byte_view. buffer_index += starting_buffer;
227+ } ;
228+
229+ byte_view. as_u128 ( )
230+ } ) ) ;
231+ }
232+
233+ if let Some ( null_buffer) = array. nulls ( ) {
234+ self . null_buffer_builder . append_buffer ( null_buffer) ;
235+ } else {
236+ self . null_buffer_builder . append_n_non_nulls ( array. len ( ) ) ;
237+ }
238+ }
239+
208240 /// Try to append a view of the given `block`, `offset` and `length`
209241 ///
210242 /// See [`Self::append_block`]
@@ -255,7 +287,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
255287 /// Useful if we want to know what value has been inserted to the builder
256288 /// The index has to be smaller than `self.len()`, otherwise it will panic
257289 pub fn get_value ( & self , index : usize ) -> & [ u8 ] {
258- let view = self . views_builder . as_slice ( ) . get ( index) . unwrap ( ) ;
290+ let view = self . views_buffer . as_slice ( ) . get ( index) . unwrap ( ) ;
259291 let len = * view as u32 ;
260292 if len <= 12 {
261293 // # Safety
@@ -287,7 +319,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
287319 let mut view_buffer = [ 0 ; 16 ] ;
288320 view_buffer[ 0 ..4 ] . copy_from_slice ( & length. to_le_bytes ( ) ) ;
289321 view_buffer[ 4 ..4 + v. len ( ) ] . copy_from_slice ( v) ;
290- self . views_builder . append ( u128:: from_le_bytes ( view_buffer) ) ;
322+ self . views_buffer . push ( u128:: from_le_bytes ( view_buffer) ) ;
291323 self . null_buffer_builder . append_non_null ( ) ;
292324 return ;
293325 }
@@ -311,16 +343,15 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
311343 Entry :: Occupied ( occupied) => {
312344 // If the string already exists, we will directly use the view
313345 let idx = occupied. get ( ) ;
314- self . views_builder
315- . append ( self . views_builder . as_slice ( ) [ * idx] ) ;
346+ self . views_buffer . push ( self . views_buffer [ * idx] ) ;
316347 self . null_buffer_builder . append_non_null ( ) ;
317348 self . string_tracker = Some ( ( ht, hasher) ) ;
318349 return ;
319350 }
320351 Entry :: Vacant ( vacant) => {
321352 // otherwise we insert the (string hash -> view index)
322353 // the idx is the current length of views_buffer, as we are inserting a new view
323- vacant. insert ( self . views_builder . len ( ) ) ;
354+ vacant. insert ( self . views_buffer . len ( ) ) ;
324355 }
325356 }
326357 self . string_tracker = Some ( ( ht, hasher) ) ;
@@ -341,7 +372,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
341372 buffer_index : self . completed . len ( ) as u32 ,
342373 offset,
343374 } ;
344- self . views_builder . append ( view. into ( ) ) ;
375+ self . views_buffer . push ( view. into ( ) ) ;
345376 self . null_buffer_builder . append_non_null ( ) ;
346377 }
347378
@@ -358,21 +389,20 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
358389 #[ inline]
359390 pub fn append_null ( & mut self ) {
360391 self . null_buffer_builder . append_null ( ) ;
361- self . views_builder . append ( 0 ) ;
392+ self . views_buffer . push ( 0 ) ;
362393 }
363394
364395 /// Builds the [`GenericByteViewArray`] and reset this builder
365396 pub fn finish ( & mut self ) -> GenericByteViewArray < T > {
366397 self . flush_in_progress ( ) ;
367398 let completed = std:: mem:: take ( & mut self . completed ) ;
368- let len = self . views_builder . len ( ) ;
369- let views = ScalarBuffer :: new ( self . views_builder . finish ( ) , 0 , len) ;
370399 let nulls = self . null_buffer_builder . finish ( ) ;
371400 if let Some ( ( ref mut ht, _) ) = self . string_tracker . as_mut ( ) {
372401 ht. clear ( ) ;
373402 }
403+ let views = std:: mem:: take ( & mut self . views_buffer ) ;
374404 // SAFETY: valid by construction
375- unsafe { GenericByteViewArray :: new_unchecked ( views, completed, nulls) }
405+ unsafe { GenericByteViewArray :: new_unchecked ( views. into ( ) , completed, nulls) }
376406 }
377407
378408 /// Builds the [`GenericByteViewArray`] without resetting the builder
@@ -381,8 +411,8 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
381411 if !self . in_progress . is_empty ( ) {
382412 completed. push ( Buffer :: from_slice_ref ( & self . in_progress ) ) ;
383413 }
384- let len = self . views_builder . len ( ) ;
385- let views = Buffer :: from_slice_ref ( self . views_builder . as_slice ( ) ) ;
414+ let len = self . views_buffer . len ( ) ;
415+ let views = Buffer :: from_slice_ref ( self . views_buffer . as_slice ( ) ) ;
386416 let views = ScalarBuffer :: new ( views, 0 , len) ;
387417 let nulls = self . null_buffer_builder . finish_cloned ( ) ;
388418 // SAFETY: valid by construction
@@ -396,7 +426,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
396426
397427 /// Return the allocated size of this builder in bytes, useful for memory accounting.
398428 pub fn allocated_size ( & self ) -> usize {
399- let views = self . views_builder . capacity ( ) * std:: mem:: size_of :: < u128 > ( ) ;
429+ let views = self . views_buffer . capacity ( ) * std:: mem:: size_of :: < u128 > ( ) ;
400430 let null = self . null_buffer_builder . allocated_size ( ) ;
401431 let buffer_size = self . completed . iter ( ) . map ( |b| b. capacity ( ) ) . sum :: < usize > ( ) ;
402432 let in_progress = self . in_progress . capacity ( ) ;
@@ -418,7 +448,7 @@ impl<T: ByteViewType + ?Sized> std::fmt::Debug for GenericByteViewBuilder<T> {
418448 fn fmt ( & self , f : & mut std:: fmt:: Formatter < ' _ > ) -> std:: fmt:: Result {
419449 write ! ( f, "{}ViewBuilder" , T :: PREFIX ) ?;
420450 f. debug_struct ( "" )
421- . field ( "views_builder " , & self . views_builder )
451+ . field ( "views_buffer " , & self . views_buffer )
422452 . field ( "in_progress" , & self . in_progress )
423453 . field ( "completed" , & self . completed )
424454 . field ( "null_buffer_builder" , & self . null_buffer_builder )
0 commit comments