-
Notifications
You must be signed in to change notification settings - Fork 1.9k
GC StringViewArray in CoalesceBatchesStream
#11587
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
0fb5f7b
1340a63
a55810e
f886d50
b397944
1fac94a
de0c84a
2c882d6
8ab087c
8e0eaa6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,9 +29,11 @@ use crate::{ | |
| DisplayFormatType, ExecutionPlan, RecordBatchStream, SendableRecordBatchStream, | ||
| }; | ||
|
|
||
| use arrow::array::{AsArray, StringViewBuilder}; | ||
| use arrow::datatypes::SchemaRef; | ||
| use arrow::error::Result as ArrowResult; | ||
| use arrow::record_batch::RecordBatch; | ||
| use arrow_array::Array; | ||
| use datafusion_common::Result; | ||
| use datafusion_execution::TaskContext; | ||
|
|
||
|
|
@@ -216,6 +218,41 @@ impl CoalesceBatchesStream { | |
| match input_batch { | ||
| Poll::Ready(x) => match x { | ||
| Some(Ok(batch)) => { | ||
| let new_columns: Vec<Arc<dyn Array>> = batch | ||
| .columns() | ||
| .iter() | ||
| .map(|c| { | ||
| // Try to re-create the `StringViewArray` to prevent holding the underlying buffer too long. | ||
| if let Some(s) = c.as_string_view_opt() { | ||
| let view_cnt = s.views().len(); | ||
| let buffer_size = s.get_buffer_memory_size(); | ||
|
|
||
| // Re-creating the array copies data and can be time consuming. | ||
| // We only do it if the array is sparse, below is a heuristic to determine if the array is sparse. | ||
| if buffer_size > (view_cnt * 32) { | ||
|
||
| // We use a block size of 2MB (instead of 8KB) to reduce the number of buffers to track. | ||
| // See https://github.com/apache/arrow-rs/issues/6094 for more details. | ||
| let mut builder = | ||
|
||
| StringViewBuilder::with_capacity(s.len()) | ||
| .with_block_size(1024 * 1024 * 2); | ||
|
|
||
| for v in s.iter() { | ||
| builder.append_option(v); | ||
| } | ||
|
|
||
| let gc_string = builder.finish(); | ||
|
|
||
| Arc::new(gc_string) | ||
| } else { | ||
| Arc::clone(c) | ||
| } | ||
| } else { | ||
| Arc::clone(c) | ||
| } | ||
| }) | ||
| .collect(); | ||
| let batch = RecordBatch::try_new(batch.schema(), new_columns)?; | ||
|
|
||
| if batch.num_rows() >= self.target_batch_size | ||
| && self.buffer.is_empty() | ||
| { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: How about using
ArrayRefinstread ofArc<dyn Array>eases code readabilityThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done, thank you @dharanad !