Skip to content

Commit 2d7892b

Browse files
authored
Enable reading StringViewArray by default from Parquet (8% improvement for entire ClickBench suite) (#13101)
* Read String data as `StringView` by default from Parquet * fix benchmark
1 parent 63e8e6a commit 2d7892b

File tree

14 files changed

+47
-65
lines changed

14 files changed

+47
-65
lines changed

benchmarks/src/bin/external_aggr.rs

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -193,12 +193,7 @@ impl ExternalAggrConfig {
193193
) -> Result<Vec<QueryResult>> {
194194
let query_name =
195195
format!("Q{query_id}({})", human_readable_size(mem_limit as usize));
196-
let mut config = self.common.config();
197-
config
198-
.options_mut()
199-
.execution
200-
.parquet
201-
.schema_force_view_types = self.common.force_view_types;
196+
let config = self.common.config();
202197
let runtime_config = RuntimeConfig::new()
203198
.with_memory_pool(Arc::new(FairSpillPool::new(mem_limit as usize)))
204199
.build_arc()?;

benchmarks/src/clickbench.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,6 @@ impl RunOpt {
119119
let mut config = self.common.config();
120120
{
121121
let parquet_options = &mut config.options_mut().execution.parquet;
122-
parquet_options.schema_force_view_types = self.common.force_view_types;
123122
// The hits_partitioned dataset specifies string columns
124123
// as binary due to how it was written. Force it to strings
125124
parquet_options.binary_as_string = true;

benchmarks/src/imdb/run.rs

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -305,11 +305,7 @@ impl RunOpt {
305305
.config()
306306
.with_collect_statistics(!self.disable_statistics);
307307
config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
308-
config
309-
.options_mut()
310-
.execution
311-
.parquet
312-
.schema_force_view_types = self.common.force_view_types;
308+
313309
let ctx = SessionContext::new_with_config(config);
314310

315311
// register tables
@@ -517,7 +513,6 @@ mod tests {
517513
partitions: Some(2),
518514
batch_size: 8192,
519515
debug: false,
520-
force_view_types: false,
521516
};
522517
let opt = RunOpt {
523518
query: Some(query),
@@ -551,7 +546,6 @@ mod tests {
551546
partitions: Some(2),
552547
batch_size: 8192,
553548
debug: false,
554-
force_view_types: false,
555549
};
556550
let opt = RunOpt {
557551
query: Some(query),

benchmarks/src/tpch/run.rs

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -120,11 +120,6 @@ impl RunOpt {
120120
.config()
121121
.with_collect_statistics(!self.disable_statistics);
122122
config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
123-
config
124-
.options_mut()
125-
.execution
126-
.parquet
127-
.schema_force_view_types = self.common.force_view_types;
128123
let ctx = SessionContext::new_with_config(config);
129124

130125
// register tables
@@ -345,7 +340,6 @@ mod tests {
345340
partitions: Some(2),
346341
batch_size: 8192,
347342
debug: false,
348-
force_view_types: false,
349343
};
350344
let opt = RunOpt {
351345
query: Some(query),
@@ -379,7 +373,6 @@ mod tests {
379373
partitions: Some(2),
380374
batch_size: 8192,
381375
debug: false,
382-
force_view_types: false,
383376
};
384377
let opt = RunOpt {
385378
query: Some(query),

benchmarks/src/util/options.rs

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,6 @@ pub struct CommonOpt {
3737
/// Activate debug mode to see more details
3838
#[structopt(short, long)]
3939
pub debug: bool,
40-
41-
/// If true, will use StringView/BinaryViewArray instead of String/BinaryArray
42-
/// when reading ParquetFiles
43-
#[structopt(long)]
44-
pub force_view_types: bool,
4540
}
4641

4742
impl CommonOpt {

datafusion/common/src/config.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ config_namespace! {
399399

400400
/// (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`,
401401
/// and `Binary/BinaryLarge` with `BinaryView`.
402-
pub schema_force_view_types: bool, default = false
402+
pub schema_force_view_types: bool, default = true
403403

404404
/// (reading) If true, parquet reader will read columns of
405405
/// `Binary/LargeBinary` with `Utf8`, and `BinaryView` with `Utf8View`.

datafusion/common/src/scalar/mod.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -980,6 +980,11 @@ impl ScalarValue {
980980
ScalarValue::from(val.into())
981981
}
982982

983+
/// Returns a [`ScalarValue::Utf8View`] representing `val`
984+
pub fn new_utf8view(val: impl Into<String>) -> Self {
985+
ScalarValue::Utf8View(Some(val.into()))
986+
}
987+
983988
/// Returns a [`ScalarValue::IntervalYearMonth`] representing
984989
/// `years` years and `months` months
985990
pub fn new_interval_ym(years: i32, months: i32) -> Self {

datafusion/core/tests/parquet/page_pruning.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,9 @@ async fn page_index_filter_one_col() {
149149
let session_ctx = SessionContext::new();
150150
let task_ctx = session_ctx.task_ctx();
151151

152-
// 5.create filter date_string_col == 1;
153-
let filter = col("date_string_col").eq(lit("01/01/09"));
152+
// 5.create filter date_string_col == "01/01/09"`;
153+
// Note this test doesn't apply type coercion so the literal must match the actual view type
154+
let filter = col("date_string_col").eq(lit(ScalarValue::new_utf8view("01/01/09")));
154155
let parquet_exec = get_parquet_exec(&state, filter).await;
155156
let mut results = parquet_exec.execute(0, task_ctx.clone()).unwrap();
156157
let batch = results.next().await.unwrap().unwrap();

datafusion/sqllogictest/test_files/describe.slt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,8 @@ int_col Int32 YES
8181
bigint_col Int64 YES
8282
float_col Float32 YES
8383
double_col Float64 YES
84-
date_string_col Utf8 YES
85-
string_col Utf8 YES
84+
date_string_col Utf8View YES
85+
string_col Utf8View YES
8686
timestamp_col Timestamp(Nanosecond, None) YES
8787
year Int32 YES
8888
month Int32 YES

datafusion/sqllogictest/test_files/explain.slt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -305,8 +305,8 @@ initial_physical_plan
305305
01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
306306
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
307307
initial_physical_plan_with_schema
308-
01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
309-
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
308+
01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
309+
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
310310
physical_plan after OutputRequirements
311311
01)OutputRequirementExec, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
312312
02)--GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
@@ -328,7 +328,7 @@ physical_plan after ProjectionPushdown SAME TEXT AS ABOVE
328328
physical_plan after LimitPushdown ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
329329
physical_plan after SanityCheckPlan SAME TEXT AS ABOVE
330330
physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
331-
physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
331+
physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
332332

333333

334334
statement ok
@@ -345,8 +345,8 @@ initial_physical_plan_with_stats
345345
01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
346346
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
347347
initial_physical_plan_with_schema
348-
01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
349-
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
348+
01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
349+
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
350350
physical_plan after OutputRequirements
351351
01)OutputRequirementExec
352352
02)--GlobalLimitExec: skip=0, fetch=10
@@ -369,7 +369,7 @@ physical_plan after LimitPushdown ParquetExec: file_groups={1 group: [[WORKSPACE
369369
physical_plan after SanityCheckPlan SAME TEXT AS ABOVE
370370
physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10
371371
physical_plan_with_stats ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
372-
physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
372+
physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
373373

374374

375375
statement ok

0 commit comments

Comments
 (0)