From 9585347def64b8877e2440e60c20fc5486a95856 Mon Sep 17 00:00:00 2001 From: Adam Curtis Date: Wed, 24 Apr 2024 16:13:19 -0400 Subject: [PATCH 1/5] fix: preserve more dictionaries when coercing types --- datafusion/expr/src/type_coercion/binary.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 7eec606658f41..307a009830dc0 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -643,9 +643,8 @@ fn dictionary_coercion( Dictionary(_lhs_index_type, lhs_value_type), Dictionary(_rhs_index_type, rhs_value_type), ) => comparison_coercion(lhs_value_type, rhs_value_type), - (d @ Dictionary(_, value_type), other_type) - | (other_type, d @ Dictionary(_, value_type)) - if preserve_dictionaries && value_type.as_ref() == other_type => + (d @ Dictionary(_, _), other_type) | (other_type, d @ Dictionary(_, _)) + if preserve_dictionaries && can_cast_types(other_type, d) => { Some(d.clone()) } From 2051b537a0e1c7e93df04a6e1345fda9c2031882 Mon Sep 17 00:00:00 2001 From: Adam Curtis Date: Wed, 24 Apr 2024 23:33:40 -0400 Subject: [PATCH 2/5] use coercion_comparison instead of can_cast_types --- datafusion/expr/src/type_coercion/binary.rs | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 307a009830dc0..81e198f404176 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -638,21 +638,26 @@ fn dictionary_coercion( preserve_dictionaries: bool, ) -> Option { use arrow::datatypes::DataType::*; + let maybe_preserve_dictionaries = + |index_type: &DataType, value_type: DataType| -> DataType { + if preserve_dictionaries { + Dictionary(Box::new(index_type.clone()), Box::new(value_type)) + } else { + value_type + } + }; match (lhs_type, rhs_type) { ( Dictionary(_lhs_index_type, lhs_value_type), Dictionary(_rhs_index_type, rhs_value_type), ) => comparison_coercion(lhs_value_type, rhs_value_type), - (d @ Dictionary(_, _), other_type) | (other_type, d @ Dictionary(_, _)) - if preserve_dictionaries && can_cast_types(other_type, d) => - { - Some(d.clone()) - } - (Dictionary(_index_type, value_type), _) => { + (Dictionary(index_type, value_type), _) => { comparison_coercion(value_type, rhs_type) + .map(|new_type| maybe_preserve_dictionaries(index_type, new_type)) } - (_, Dictionary(_index_type, value_type)) => { + (_, Dictionary(index_type, value_type)) => { comparison_coercion(lhs_type, value_type) + .map(|new_type| maybe_preserve_dictionaries(index_type, new_type)) } _ => None, } From 7d49c0f2fef7545aa6bfaaff7b745250b54a412b Mon Sep 17 00:00:00 2001 From: Adam Curtis Date: Thu, 25 Apr 2024 12:35:14 -0400 Subject: [PATCH 3/5] test: update dictionary coercion tests --- datafusion/sqllogictest/test_files/scalar.slt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index 0c3fca4465269..b21e0e10fbc43 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -1790,26 +1790,26 @@ select coalesce(column1, 'none_set') from test1; foo none_set -# test coercion Int -query I +# test coercion Int and Dictionary +query ? select coalesce(34, arrow_cast(123, 'Dictionary(Int32, Int8)')); ---- 34 # test with Int -query I +query ? select coalesce(arrow_cast(123, 'Dictionary(Int32, Int8)'),34); ---- 123 # test with null -query I +query ? select coalesce(null, 34, arrow_cast(123, 'Dictionary(Int32, Int8)')); ---- 34 # test with null -query T +query ? select coalesce(null, column1, 'none_set') from test1; ---- foo From 8db4a8e6475abb73a477095e1545366488afa464 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 25 Apr 2024 13:20:07 -0400 Subject: [PATCH 4/5] Print out types in tests as well --- datafusion/sqllogictest/test_files/scalar.slt | 59 +++++++++++-------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index b21e0e10fbc43..08f79ac0b331e 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -1768,52 +1768,61 @@ SELECT make_array(1, 2, 3); [1, 2, 3] # coalesce static empty value -query T -SELECT COALESCE('', 'test') +query TT +SELECT COALESCE('', 'test'), +arrow_typeof(COALESCE('', 'test')) ---- -(empty) +(empty) Utf8 # coalesce static value with null -query T -SELECT COALESCE(NULL, 'test') +query TT +SELECT COALESCE(NULL, 'test'), +arrow_typeof(COALESCE(NULL, 'test')) ---- -test - +test Utf8 +# Create table with a dictionary value statement ok create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (null); -# test coercion string -query ? -select coalesce(column1, 'none_set') from test1; +# test coercion string (should preserve the dictionary type) +query ?T +select coalesce(column1, 'none_set'), +arrow_typeof(coalesce(column1, 'none_set')) +from test1; ---- -foo -none_set +foo Dictionary(Int32, Utf8) +none_set Dictionary(Int32, Utf8) # test coercion Int and Dictionary -query ? -select coalesce(34, arrow_cast(123, 'Dictionary(Int32, Int8)')); +query ?T +select coalesce(34, arrow_cast(123, 'Dictionary(Int32, Int8)')), +arrow_typeof(coalesce(34, arrow_cast(123, 'Dictionary(Int32, Int8)'))) ---- -34 +34 Dictionary(Int32, Int64) # test with Int -query ? -select coalesce(arrow_cast(123, 'Dictionary(Int32, Int8)'),34); +query ?T +select coalesce(arrow_cast(123, 'Dictionary(Int32, Int8)'),34), +arrow_typeof(coalesce(arrow_cast(123, 'Dictionary(Int32, Int8)'),34)) ---- -123 +123 Dictionary(Int32, Int64) # test with null -query ? -select coalesce(null, 34, arrow_cast(123, 'Dictionary(Int32, Int8)')); +query ?T +select coalesce(null, 34, arrow_cast(123, 'Dictionary(Int32, Int8)')), +arrow_typeof(coalesce(null, 34, arrow_cast(123, 'Dictionary(Int32, Int8)'))) ---- -34 +34 Dictionary(Int32, Int64) # test with null -query ? -select coalesce(null, column1, 'none_set') from test1; +query ?T +select coalesce(null, column1, 'none_set'), +arrow_typeof(coalesce(null, column1, 'none_set')) +from test1; ---- -foo -none_set +foo Dictionary(Int32, Utf8) +none_set Dictionary(Int32, Utf8) statement ok drop table test1 From 7f64b4aad11e0d45d0df2e1db1268f94cc8f9990 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 25 Apr 2024 13:25:48 -0400 Subject: [PATCH 5/5] Add dictionary filter reproducer --- .../sqllogictest/test_files/dictionary.slt | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/datafusion/sqllogictest/test_files/dictionary.slt b/datafusion/sqllogictest/test_files/dictionary.slt index 06b726502664f..8a11b4139dcdd 100644 --- a/datafusion/sqllogictest/test_files/dictionary.slt +++ b/datafusion/sqllogictest/test_files/dictionary.slt @@ -386,3 +386,53 @@ drop table m3; statement ok drop table m3_source; + + +## Test that filtering on dictionary columns coerces the filter value to the dictionary type +statement ok +create table test as values + ('row1', arrow_cast('1', 'Dictionary(Int32, Utf8)')), + ('row2', arrow_cast('2', 'Dictionary(Int32, Utf8)')), + ('row3', arrow_cast('3', 'Dictionary(Int32, Utf8)')) +; + +# query using an string '1' which must be coerced into a dictionary string +query T? +SELECT * from test where column2 = '1'; +---- +row1 1 + +# filter should not have a cast on column2 +query TT +explain SELECT * from test where column2 = '1'; +---- +logical_plan +01)Filter: test.column2 = Dictionary(Int32, Utf8("1")) +02)--TableScan: test projection=[column1, column2] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--FilterExec: column2@1 = 1 +03)----MemoryExec: partitions=1, partition_sizes=[1] + + +# Now query using an integer which must be coerced into a dictionary string +query T? +SELECT * from test where column2 = 1; +---- +row1 1 + +# filter should not have a cast on column2 +query TT +explain SELECT * from test where column2 = 1; +---- +logical_plan +01)Filter: test.column2 = Dictionary(Int32, Utf8("1")) +02)--TableScan: test projection=[column1, column2] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--FilterExec: column2@1 = 1 +03)----MemoryExec: partitions=1, partition_sizes=[1] + + +statement ok +drop table test;