From 00fd5958fa470711fd13fe6a84279537181200b9 Mon Sep 17 00:00:00 2001 From: wiedld Date: Wed, 2 Oct 2024 15:21:49 -0700 Subject: [PATCH 1/6] test: reproducer for missing schema metadata on cross join --- datafusion/sqllogictest/test_files/metadata.slt | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt index f38281abc5ab..9957d293627d 100644 --- a/datafusion/sqllogictest/test_files/metadata.slt +++ b/datafusion/sqllogictest/test_files/metadata.slt @@ -96,5 +96,18 @@ select count(id) cnt from table_with_metadata group by name order by cnt; 1 + +# Regression test: missing schema metadata, when aggregate on cross join +statement error DataFusion error: Internal error: Physical input schema should be the same as the one converted from logical input schema.. +SELECT count("data"."id") +FROM + ( + SELECT "id" FROM "table_with_metadata" + ) as "data", + ( + SELECT "id" FROM "table_with_metadata" + ) as "samples"; + + statement ok drop table table_with_metadata; From 77f8eb68bc4b8b6e7418fd5f2134a98084fae8a2 Mon Sep 17 00:00:00 2001 From: wiedld Date: Wed, 2 Oct 2024 15:24:41 -0700 Subject: [PATCH 2/6] fix: pass thru schema metadata on cross join --- datafusion/physical-plan/src/joins/cross_join.rs | 13 ++++++++++--- datafusion/sqllogictest/test_files/metadata.slt | 4 +++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs index 11153556f253..a70645f3d6c0 100644 --- a/datafusion/physical-plan/src/joins/cross_join.rs +++ b/datafusion/physical-plan/src/joins/cross_join.rs @@ -69,15 +69,22 @@ impl CrossJoinExec { /// Create a new [CrossJoinExec]. pub fn new(left: Arc, right: Arc) -> Self { // left then right - let all_columns: Fields = { + let (all_columns, metadata) = { let left_schema = left.schema(); let right_schema = right.schema(); let left_fields = left_schema.fields().iter(); let right_fields = right_schema.fields().iter(); - left_fields.chain(right_fields).cloned().collect() + + let mut metadata = left_schema.metadata().clone(); + metadata.extend(right_schema.metadata().clone()); + + ( + left_fields.chain(right_fields).cloned().collect::(), + metadata, + ) }; - let schema = Arc::new(Schema::new(all_columns)); + let schema = Arc::new(Schema::new(all_columns).with_metadata(metadata)); let cache = Self::compute_properties(&left, &right, Arc::clone(&schema)); CrossJoinExec { left, diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt index 9957d293627d..a29f4c39a904 100644 --- a/datafusion/sqllogictest/test_files/metadata.slt +++ b/datafusion/sqllogictest/test_files/metadata.slt @@ -98,7 +98,7 @@ select count(id) cnt from table_with_metadata group by name order by cnt; # Regression test: missing schema metadata, when aggregate on cross join -statement error DataFusion error: Internal error: Physical input schema should be the same as the one converted from logical input schema.. +query I SELECT count("data"."id") FROM ( @@ -107,6 +107,8 @@ FROM ( SELECT "id" FROM "table_with_metadata" ) as "samples"; +---- +6 statement ok From 56bb0ac3b51db481f5f271eb2d53db1eb3c5fe4e Mon Sep 17 00:00:00 2001 From: wiedld Date: Wed, 2 Oct 2024 16:05:54 -0700 Subject: [PATCH 3/6] fix: preserve metadata when transforming to view types --- .../core/src/datasource/file_format/mod.rs | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index 60f2b2dcefa9..e16986c660ad 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -241,16 +241,14 @@ pub fn transform_schema_to_view(schema: &Schema) -> Schema { .fields .iter() .map(|field| match field.data_type() { - DataType::Utf8 | DataType::LargeUtf8 => Arc::new(Field::new( - field.name(), - DataType::Utf8View, - field.is_nullable(), - )), - DataType::Binary | DataType::LargeBinary => Arc::new(Field::new( - field.name(), - DataType::BinaryView, - field.is_nullable(), - )), + DataType::Utf8 | DataType::LargeUtf8 => Arc::new( + Field::new(field.name(), DataType::Utf8View, field.is_nullable()) + .with_metadata(field.metadata().to_owned()), + ), + DataType::Binary | DataType::LargeBinary => Arc::new( + Field::new(field.name(), DataType::BinaryView, field.is_nullable()) + .with_metadata(field.metadata().to_owned()), + ), _ => field.clone(), }) .collect(); From 31cbfbdeed55559621b674e90a756928671c28a8 Mon Sep 17 00:00:00 2001 From: wiedld Date: Wed, 2 Oct 2024 17:01:58 -0700 Subject: [PATCH 4/6] test: reproducer for missing field metadata in left hand NULL field of union --- datafusion/sqllogictest/src/test_context.rs | 8 +++++++- datafusion/sqllogictest/test_files/metadata.slt | 10 +++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs index d3ee720467b6..9a0db1c41c71 100644 --- a/datafusion/sqllogictest/src/test_context.rs +++ b/datafusion/sqllogictest/src/test_context.rs @@ -314,8 +314,13 @@ pub async fn register_metadata_tables(ctx: &SessionContext) { String::from("metadata_key"), String::from("the name field"), )])); + let l_name = + Field::new("l_name", DataType::Utf8, true).with_metadata(HashMap::from([( + String::from("metadata_key"), + String::from("the l_name field"), + )])); - let schema = Schema::new(vec![id, name]).with_metadata(HashMap::from([( + let schema = Schema::new(vec![id, name, l_name]).with_metadata(HashMap::from([( String::from("metadata_key"), String::from("the entire schema"), )])); @@ -325,6 +330,7 @@ pub async fn register_metadata_tables(ctx: &SessionContext) { vec![ Arc::new(Int32Array::from(vec![Some(1), None, Some(3)])) as _, Arc::new(StringArray::from(vec![None, Some("bar"), Some("baz")])) as _, + Arc::new(StringArray::from(vec![None, Some("l_bar"), Some("l_baz")])) as _, ], ) .unwrap(); diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt index a29f4c39a904..2cc4ebe76e9b 100644 --- a/datafusion/sqllogictest/test_files/metadata.slt +++ b/datafusion/sqllogictest/test_files/metadata.slt @@ -25,7 +25,7 @@ ## with metadata in SQL. query IT -select * from table_with_metadata; +select id, name from table_with_metadata; ---- 1 NULL NULL bar @@ -110,6 +110,14 @@ FROM ---- 6 +# Regression test: missing field metadata, from the NULL field on the left side of the union +statement error DataFusion error: Internal error: Physical input schema should be the same as the one converted from logical input schema.. +(SELECT id, NULL::string as name, l_name FROM "table_with_metadata") + UNION +(SELECT id, name, NULL::string as l_name FROM "table_with_metadata") +ORDER BY id, name, l_name; + + statement ok drop table table_with_metadata; From 0a18ba483a48e3e120b3f82fc41913375e132ca1 Mon Sep 17 00:00:00 2001 From: wiedld Date: Wed, 2 Oct 2024 17:19:35 -0700 Subject: [PATCH 5/6] fix: preserve field metadata from right side of union --- datafusion/physical-plan/src/union.rs | 7 ++++++- datafusion/sqllogictest/test_files/metadata.slt | 8 +++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index 78b25686054d..2df85655e0b9 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -474,7 +474,12 @@ fn union_schema(inputs: &[Arc]) -> SchemaRef { .iter() .filter_map(|input| { if input.schema().fields().len() > i { - Some(input.schema().field(i).clone()) + let field = input.schema().field(i).clone(); + let right_hand_metdata = + inputs[1].schema().field(i).metadata().clone(); + let mut metadata = field.metadata().clone(); + metadata.extend(right_hand_metdata); + Some(field.with_metadata(metadata)) } else { None } diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt index 2cc4ebe76e9b..d0853b9e4983 100644 --- a/datafusion/sqllogictest/test_files/metadata.slt +++ b/datafusion/sqllogictest/test_files/metadata.slt @@ -111,11 +111,17 @@ FROM 6 # Regression test: missing field metadata, from the NULL field on the left side of the union -statement error DataFusion error: Internal error: Physical input schema should be the same as the one converted from logical input schema.. +query ITT (SELECT id, NULL::string as name, l_name FROM "table_with_metadata") UNION (SELECT id, name, NULL::string as l_name FROM "table_with_metadata") ORDER BY id, name, l_name; +---- +1 NULL NULL +3 baz NULL +3 NULL l_baz +NULL bar NULL +NULL NULL l_bar From d1ffc74ed88f3412b9da6d5a0acc5a057f55efe1 Mon Sep 17 00:00:00 2001 From: wiedld Date: Thu, 3 Oct 2024 20:34:24 -0700 Subject: [PATCH 6/6] chore: safe indexing --- datafusion/physical-plan/src/union.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index 2df85655e0b9..1cf22060b62a 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -475,8 +475,12 @@ fn union_schema(inputs: &[Arc]) -> SchemaRef { .filter_map(|input| { if input.schema().fields().len() > i { let field = input.schema().field(i).clone(); - let right_hand_metdata = - inputs[1].schema().field(i).metadata().clone(); + let right_hand_metdata = inputs + .get(1) + .map(|right_input| { + right_input.schema().field(i).metadata().clone() + }) + .unwrap_or_default(); let mut metadata = field.metadata().clone(); metadata.extend(right_hand_metdata); Some(field.with_metadata(metadata))