From 38c98f8dee0694e94ef496858b984a21ca1dd98f Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Tue, 16 Jul 2024 12:14:19 -0400
Subject: [PATCH 01/10] Test + workaround for SanityCheck plan

---
 .../src/physical_optimizer/sanity_checker.rs  | 10 +++++
 datafusion/sqllogictest/test_files/union.slt  | 37 +++++++++++++++++++
 2 files changed, 47 insertions(+)
diff --git a/datafusion/core/src/physical_optimizer/sanity_checker.rs b/datafusion/core/src/physical_optimizer/sanity_checker.rs
index e392105fbcb78..123d61d94a215 100644
--- a/datafusion/core/src/physical_optimizer/sanity_checker.rs
+++ b/datafusion/core/src/physical_optimizer/sanity_checker.rs
@@ -34,6 +34,8 @@ use datafusion_physical_plan::joins::SymmetricHashJoinExec;
 use datafusion_physical_plan::{get_plan_string, ExecutionPlanProperties};
 
 use datafusion_physical_optimizer::PhysicalOptimizerRule;
+use datafusion_physical_plan::sorts::sort::SortExec;
+use datafusion_physical_plan::union::UnionExec;
 use itertools::izip;
 
 /// The SanityCheckPlan rule rejects the following query plans:
@@ -125,6 +127,14 @@ pub fn check_plan_sanity(
         plan.required_input_ordering().iter(),
         plan.required_input_distribution().iter()
     ) {
+        // TEMP HACK WORKAROUND https://github.com/apache/datafusion/issues/11492
+        if child.as_any().downcast_ref::<UnionExec>().is_some() {
+            continue;
+        }
+        if child.as_any().downcast_ref::<SortExec>().is_some() {
+            continue;
+        }
+
         let child_eq_props = child.equivalence_properties();
         if let Some(sort_req) = sort_req {
             if !child_eq_props.ordering_satisfy_requirement(sort_req) {
diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt
index a3d0ff4383ae6..12eb551ff8760 100644
--- a/datafusion/sqllogictest/test_files/union.slt
+++ b/datafusion/sqllogictest/test_files/union.slt
@@ -538,6 +538,9 @@ physical_plan
 # Clean up after the test
 ########
 
+statement ok
+drop table t
+
 statement ok
 drop table t1;
 
@@ -761,3 +764,37 @@ SELECT NULL WHERE FALSE;
 ----
 0.5
 1
+
+###
+# Test for https://github.com/apache/datafusion/issues/11492
+###
+
+# Input data is
+# a,b,c
+# 1,2,3
+
+statement ok
+CREATE EXTERNAL TABLE t (
+  a INT,
+  b INT,
+  c INT
+)
+STORED AS CSV
+LOCATION '../core/tests/data/example.csv'
+WITH ORDER (a ASC)
+OPTIONS ('format.has_header' 'true');
+
+query T
+SELECT (SELECT a from t ORDER BY a) UNION ALL (SELECT 'bar' as a from t) ORDER BY a;
+----
+1
+bar
+
+query I
+SELECT (SELECT a from t ORDER BY a) UNION ALL (SELECT NULL as a from t) ORDER BY a;
+----
+1
+NULL 
+
+statement ok
+drop table t

From 76b145e8b77d991029e3fb87b13fba25b5b3a18c Mon Sep 17 00:00:00 2001
From: wiedld <wiedld@users.noreply.github.com>
Date: Tue, 1 Oct 2024 00:03:30 -1000
Subject: [PATCH 02/10] Provide field and schema metadata missing on distinct
 aggregations. (#12691)

* test(12687): reproducer of missing metadata bug

* fix(12687): minimum change needed to fix the missing metadata
---
 .../physical-plan/src/aggregates/mod.rs       | 25 +++++++-----
 datafusion/physical-plan/src/projection.rs    |  2 +-
 .../sqllogictest/test_files/metadata.slt      | 38 +++++++++++++++++++
 3 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs
index c3bc7b042e655..1dc41059a7ee5 100644
--- a/datafusion/physical-plan/src/aggregates/mod.rs
+++ b/datafusion/physical-plan/src/aggregates/mod.rs
@@ -26,6 +26,7 @@ use crate::aggregates::{
     topk_stream::GroupedTopKAggregateStream,
 };
 use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet};
+use crate::projection::get_field_metadata;
 use crate::windows::get_ordered_partition_by_indices;
 use crate::{
     DisplayFormatType, Distribution, ExecutionPlan, InputOrderMode,
@@ -793,14 +794,17 @@ fn create_schema(
 ) -> Result<Schema> {
     let mut fields = Vec::with_capacity(group_expr.len() + aggr_expr.len());
     for (index, (expr, name)) in group_expr.iter().enumerate() {
-        fields.push(Field::new(
-            name,
-            expr.data_type(input_schema)?,
-            // In cases where we have multiple grouping sets, we will use NULL expressions in
-            // order to align the grouping sets. So the field must be nullable even if the underlying
-            // schema field is not.
-            group_expr_nullable[index] || expr.nullable(input_schema)?,
-        ))
+        fields.push(
+            Field::new(
+                name,
+                expr.data_type(input_schema)?,
+                // In cases where we have multiple grouping sets, we will use NULL expressions in
+                // order to align the grouping sets. So the field must be nullable even if the underlying
+                // schema field is not.
+                group_expr_nullable[index] || expr.nullable(input_schema)?,
+            )
+            .with_metadata(get_field_metadata(expr, input_schema).unwrap_or_default()),
+        )
     }
 
     match mode {
@@ -821,7 +825,10 @@ fn create_schema(
         }
     }
 
-    Ok(Schema::new(fields))
+    Ok(Schema::new_with_metadata(
+        fields,
+        input_schema.metadata().clone(),
+    ))
 }
 
 fn group_schema(schema: &Schema, group_count: usize) -> SchemaRef {
diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs
index f1b9cdaf728ff..4c889d1fc88c4 100644
--- a/datafusion/physical-plan/src/projection.rs
+++ b/datafusion/physical-plan/src/projection.rs
@@ -237,7 +237,7 @@ impl ExecutionPlan for ProjectionExec {
 
 /// If e is a direct column reference, returns the field level
 /// metadata for that field, if any. Otherwise returns None
-fn get_field_metadata(
+pub(crate) fn get_field_metadata(
     e: &Arc<dyn PhysicalExpr>,
     input_schema: &Schema,
 ) -> Option<HashMap<String, String>> {
diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt
index 3b2b219244f55..f38281abc5abd 100644
--- a/datafusion/sqllogictest/test_files/metadata.slt
+++ b/datafusion/sqllogictest/test_files/metadata.slt
@@ -58,5 +58,43 @@ WHERE "data"."id" = "samples"."id";
 1
 3
 
+
+
+# Regression test: prevent field metadata loss per https://github.com/apache/datafusion/issues/12687
+query I
+select count(distinct name) from table_with_metadata;
+----
+2
+
+# Regression test: prevent field metadata loss per https://github.com/apache/datafusion/issues/12687
+query I
+select approx_median(distinct id) from table_with_metadata;
+----
+2
+
+# Regression test: prevent field metadata loss per https://github.com/apache/datafusion/issues/12687
+statement ok
+select array_agg(distinct id) from table_with_metadata;
+
+query I
+select distinct id from table_with_metadata order by id;
+----
+1
+3
+NULL
+
+query I
+select count(id) from table_with_metadata;
+----
+2
+
+query I
+select count(id) cnt from table_with_metadata group by name order by cnt;
+----
+0
+1
+1
+
+
 statement ok
 drop table table_with_metadata;

From ccdfa9a729dc36123606123185c3d0f37f837b33 Mon Sep 17 00:00:00 2001
From: wiedld <wiedld@users.noreply.github.com>
Date: Fri, 4 Oct 2024 08:28:12 -0700
Subject: [PATCH 03/10] Provide field and schema metadata missing on cross
 joins, and union with null fields. (#12729)

* test: reproducer for missing schema metadata on cross join

* fix: pass thru schema metadata on cross join

* fix: preserve metadata when transforming to view types

* test: reproducer for missing field metadata in left hand NULL field of union

* fix: preserve field metadata from right side of union

* chore: safe indexing
---
 .../core/src/datasource/file_format/mod.rs    | 18 +++++------
 .../physical-plan/src/joins/cross_join.rs     | 13 ++++++--
 datafusion/physical-plan/src/union.rs         | 11 ++++++-
 datafusion/sqllogictest/src/test_context.rs   |  8 ++++-
 .../sqllogictest/test_files/metadata.slt      | 31 ++++++++++++++++++-
 5 files changed, 65 insertions(+), 16 deletions(-)

diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs
index a503e36adbeb1..2a4fa1d64ecbc 100644
--- a/datafusion/core/src/datasource/file_format/mod.rs
+++ b/datafusion/core/src/datasource/file_format/mod.rs
@@ -241,16 +241,14 @@ pub fn transform_schema_to_view(schema: &Schema) -> Schema {
         .fields
         .iter()
         .map(|field| match field.data_type() {
-            DataType::Utf8 | DataType::LargeUtf8 => Arc::new(Field::new(
-                field.name(),
-                DataType::Utf8View,
-                field.is_nullable(),
-            )),
-            DataType::Binary | DataType::LargeBinary => Arc::new(Field::new(
-                field.name(),
-                DataType::BinaryView,
-                field.is_nullable(),
-            )),
+            DataType::Utf8 | DataType::LargeUtf8 => Arc::new(
+                Field::new(field.name(), DataType::Utf8View, field.is_nullable())
+                    .with_metadata(field.metadata().to_owned()),
+            ),
+            DataType::Binary | DataType::LargeBinary => Arc::new(
+                Field::new(field.name(), DataType::BinaryView, field.is_nullable())
+                    .with_metadata(field.metadata().to_owned()),
+            ),
             _ => field.clone(),
         })
         .collect();
diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs
index 11153556f2538..a70645f3d6c0c 100644
--- a/datafusion/physical-plan/src/joins/cross_join.rs
+++ b/datafusion/physical-plan/src/joins/cross_join.rs
@@ -69,15 +69,22 @@ impl CrossJoinExec {
     /// Create a new [CrossJoinExec].
     pub fn new(left: Arc<dyn ExecutionPlan>, right: Arc<dyn ExecutionPlan>) -> Self {
         // left then right
-        let all_columns: Fields = {
+        let (all_columns, metadata) = {
             let left_schema = left.schema();
             let right_schema = right.schema();
             let left_fields = left_schema.fields().iter();
             let right_fields = right_schema.fields().iter();
-            left_fields.chain(right_fields).cloned().collect()
+
+            let mut metadata = left_schema.metadata().clone();
+            metadata.extend(right_schema.metadata().clone());
+
+            (
+                left_fields.chain(right_fields).cloned().collect::<Fields>(),
+                metadata,
+            )
         };
 
-        let schema = Arc::new(Schema::new(all_columns));
+        let schema = Arc::new(Schema::new(all_columns).with_metadata(metadata));
         let cache = Self::compute_properties(&left, &right, Arc::clone(&schema));
         CrossJoinExec {
             left,
diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs
index 78b25686054d8..1cf22060b62ab 100644
--- a/datafusion/physical-plan/src/union.rs
+++ b/datafusion/physical-plan/src/union.rs
@@ -474,7 +474,16 @@ fn union_schema(inputs: &[Arc<dyn ExecutionPlan>]) -> SchemaRef {
                 .iter()
                 .filter_map(|input| {
                     if input.schema().fields().len() > i {
-                        Some(input.schema().field(i).clone())
+                        let field = input.schema().field(i).clone();
+                        let right_hand_metdata = inputs
+                            .get(1)
+                            .map(|right_input| {
+                                right_input.schema().field(i).metadata().clone()
+                            })
+                            .unwrap_or_default();
+                        let mut metadata = field.metadata().clone();
+                        metadata.extend(right_hand_metdata);
+                        Some(field.with_metadata(metadata))
                     } else {
                         None
                     }
diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs
index 19016d328f4cf..76ded12ef1561 100644
--- a/datafusion/sqllogictest/src/test_context.rs
+++ b/datafusion/sqllogictest/src/test_context.rs
@@ -313,8 +313,13 @@ pub async fn register_metadata_tables(ctx: &SessionContext) {
         String::from("metadata_key"),
         String::from("the name field"),
     )]));
+    let l_name =
+        Field::new("l_name", DataType::Utf8, true).with_metadata(HashMap::from([(
+            String::from("metadata_key"),
+            String::from("the l_name field"),
+        )]));
 
-    let schema = Schema::new(vec![id, name]).with_metadata(HashMap::from([(
+    let schema = Schema::new(vec![id, name, l_name]).with_metadata(HashMap::from([(
         String::from("metadata_key"),
         String::from("the entire schema"),
     )]));
@@ -324,6 +329,7 @@ pub async fn register_metadata_tables(ctx: &SessionContext) {
         vec![
             Arc::new(Int32Array::from(vec![Some(1), None, Some(3)])) as _,
             Arc::new(StringArray::from(vec![None, Some("bar"), Some("baz")])) as _,
+            Arc::new(StringArray::from(vec![None, Some("l_bar"), Some("l_baz")])) as _,
         ],
     )
     .unwrap();
diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt
index f38281abc5abd..d0853b9e4983b 100644
--- a/datafusion/sqllogictest/test_files/metadata.slt
+++ b/datafusion/sqllogictest/test_files/metadata.slt
@@ -25,7 +25,7 @@
 ## with metadata in SQL.
 
 query IT
-select * from table_with_metadata;
+select id, name from table_with_metadata;
 ----
 1 NULL
 NULL bar
@@ -96,5 +96,34 @@ select count(id) cnt from table_with_metadata group by name order by cnt;
 1
 
 
+
+# Regression test: missing schema metadata, when aggregate on cross join
+query I
+SELECT count("data"."id")
+FROM
+  (
+   SELECT "id" FROM "table_with_metadata"
+  ) as "data",
+  (
+    SELECT "id" FROM "table_with_metadata"
+  ) as "samples";
+----
+6
+
+# Regression test: missing field metadata, from the NULL field on the left side of the union
+query ITT
+(SELECT id, NULL::string as name, l_name FROM "table_with_metadata")
+  UNION
+(SELECT id, name, NULL::string as l_name FROM "table_with_metadata")
+ORDER BY id, name, l_name;
+----
+1 NULL NULL
+3 baz NULL
+3 NULL l_baz
+NULL bar NULL
+NULL NULL l_bar
+
+
+
 statement ok
 drop table table_with_metadata;

From add65db69b8697cc7455765e201255fd6369e0f4 Mon Sep 17 00:00:00 2001
From: wiedld <dlw405@gmail.com>
Date: Wed, 9 Oct 2024 09:59:06 -0700
Subject: [PATCH 04/10] fix(iox-11401): temporary patch to permit count col be
 nullable

---
 datafusion/functions-aggregate/src/count.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/functions-aggregate/src/count.rs b/datafusion/functions-aggregate/src/count.rs
index 417e28e72a71f..10233b2505281 100644
--- a/datafusion/functions-aggregate/src/count.rs
+++ b/datafusion/functions-aggregate/src/count.rs
@@ -122,7 +122,7 @@ impl AggregateUDFImpl for Count {
     }
 
     fn is_nullable(&self) -> bool {
-        false
+        true
     }
 
     fn state_fields(&self, args: StateFieldsArgs) -> Result<Vec<Field>> {

From a166cd0cbd435891495be0bb60e06d384a707014 Mon Sep 17 00:00:00 2001
From: wiedld <dlw405@gmail.com>
Date: Fri, 11 Oct 2024 12:30:18 -0700
Subject: [PATCH 05/10] fix: when extracting metadata from expr, handle
 CastExpr

---
 datafusion/physical-plan/src/projection.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs
index 4c889d1fc88c4..0f4d06130d8aa 100644
--- a/datafusion/physical-plan/src/projection.rs
+++ b/datafusion/physical-plan/src/projection.rs
@@ -40,7 +40,7 @@ use datafusion_common::stats::Precision;
 use datafusion_common::Result;
 use datafusion_execution::TaskContext;
 use datafusion_physical_expr::equivalence::ProjectionMapping;
-use datafusion_physical_expr::expressions::Literal;
+use datafusion_physical_expr::expressions::{CastExpr, Literal};
 
 use futures::stream::{Stream, StreamExt};
 use log::trace;
@@ -241,6 +241,10 @@ pub(crate) fn get_field_metadata(
     e: &Arc<dyn PhysicalExpr>,
     input_schema: &Schema,
 ) -> Option<HashMap<String, String>> {
+    if let Some(cast) = e.as_any().downcast_ref::<CastExpr>() {
+        return get_field_metadata(cast.expr(), input_schema);
+    }
+
     // Look up field by index in schema (not NAME as there can be more than one
     // column with the same name)
     e.as_any()

From bfbaf0acf2816f89465fe9783e191d9f8e920e9f Mon Sep 17 00:00:00 2001
From: wiedld <dlw405@gmail.com>
Date: Fri, 11 Oct 2024 12:31:34 -0700
Subject: [PATCH 06/10] fix: handle when the left side of the union has no
 fields (e.g. an empty projection)

---
 datafusion/physical-plan/src/union.rs | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs
index 1cf22060b62ab..d6eab89f2fdd5 100644
--- a/datafusion/physical-plan/src/union.rs
+++ b/datafusion/physical-plan/src/union.rs
@@ -468,7 +468,13 @@ pub fn can_interleave<T: Borrow<Arc<dyn ExecutionPlan>>>(
 }
 
 fn union_schema(inputs: &[Arc<dyn ExecutionPlan>]) -> SchemaRef {
-    let fields: Vec<Field> = (0..inputs[0].schema().fields().len())
+    let fields: Vec<Field> = (0..std::cmp::max(
+        inputs[0].schema().fields().len(),
+        inputs
+            .get(1)
+            .map(|l| l.schema().fields().len())
+            .unwrap_or_default(),
+    ))
         .map(|i| {
             inputs
                 .iter()

From dd3b48bd5d96d142c033d6b2aa4d1e1cb6fda7ec Mon Sep 17 00:00:00 2001
From: wiedld <dlw405@gmail.com>
Date: Fri, 11 Oct 2024 12:33:06 -0700
Subject: [PATCH 07/10] fix: because either the left or right fields may be
 chosen, add metadata from both to each other

---
 datafusion/physical-plan/src/union.rs | 29 +++++++++++++--------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs
index d6eab89f2fdd5..8956478643840 100644
--- a/datafusion/physical-plan/src/union.rs
+++ b/datafusion/physical-plan/src/union.rs
@@ -478,21 +478,20 @@ fn union_schema(inputs: &[Arc<dyn ExecutionPlan>]) -> SchemaRef {
         .map(|i| {
             inputs
                 .iter()
-                .filter_map(|input| {
-                    if input.schema().fields().len() > i {
-                        let field = input.schema().field(i).clone();
-                        let right_hand_metdata = inputs
-                            .get(1)
-                            .map(|right_input| {
-                                right_input.schema().field(i).metadata().clone()
-                            })
-                            .unwrap_or_default();
-                        let mut metadata = field.metadata().clone();
-                        metadata.extend(right_hand_metdata);
-                        Some(field.with_metadata(metadata))
-                    } else {
-                        None
-                    }
+                .enumerate()
+                .filter_map(|(input_idx, input)| {
+                    let field = input.schema().field(i).clone();
+                    let mut metadata = field.metadata().clone();
+
+                    let other_side_metdata = inputs
+                        .get(input_idx ^ (1 << 0))
+                        .map(|other_input| {
+                            other_input.schema().field(i).metadata().clone()
+                        })
+                        .unwrap_or_default();
+
+                    metadata.extend(other_side_metdata);
+                    Some(field.with_metadata(metadata))
                 })
                 .find_or_first(|f| f.is_nullable())
                 .unwrap()

From e9723c93b3d6099be132d13c218743e5ffc50a80 Mon Sep 17 00:00:00 2001
From: wiedld <dlw405@gmail.com>
Date: Tue, 15 Oct 2024 10:19:41 -0700
Subject: [PATCH 08/10] fix: now() UDF is not nullable

---
 datafusion/functions/src/datetime/now.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/datafusion/functions/src/datetime/now.rs b/datafusion/functions/src/datetime/now.rs
index b2221215b94b7..74eb5aea4255c 100644
--- a/datafusion/functions/src/datetime/now.rs
+++ b/datafusion/functions/src/datetime/now.rs
@@ -21,7 +21,7 @@ use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::Timestamp;
 use arrow::datatypes::TimeUnit::Nanosecond;
 
-use datafusion_common::{internal_err, Result, ScalarValue};
+use datafusion_common::{internal_err, ExprSchema, Result, ScalarValue};
 use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
 use datafusion_expr::{ColumnarValue, Expr, ScalarUDFImpl, Signature, Volatility};
 
@@ -84,4 +84,8 @@ impl ScalarUDFImpl for NowFunc {
             ScalarValue::TimestampNanosecond(now_ts, Some("+00:00".into())),
         )))
     }
+
+    fn is_nullable(&self, _args: &[Expr], _schema: &dyn ExprSchema) -> bool {
+        false
+    }
 }

From 78b910bb1bccd1e598183083bfde962c8be47534 Mon Sep 17 00:00:00 2001
From: wiedld <dlw405@gmail.com>
Date: Tue, 15 Oct 2024 12:51:48 -0700
Subject: [PATCH 09/10] chore: temp patch to convert errors to logged warnings,
 for schema mismatch in aggregates

---
 datafusion/core/src/physical_planner.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs
index 84d285fc2509d..6e4de9f60bd1c 100644
--- a/datafusion/core/src/physical_planner.rs
+++ b/datafusion/core/src/physical_planner.rs
@@ -673,7 +673,7 @@ impl DefaultPhysicalPlanner {
                     logical_input_schema.as_ref().clone().into();
 
                 if physical_input_schema != physical_input_schema_from_logical {
-                    return internal_err!("Physical input schema should be the same as the one converted from logical input schema.");
+                    log::warn!("Physical input schema should be the same as the one converted from logical input schema, but did not match for logical plan:\n{}", input.display_indent());
                 }
 
                 let groups = self.create_grouping_physical_expr(

From c27d5f2356a21ee6224c149ee971c89c2cc13e18 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Wed, 30 Oct 2024 12:08:29 -0400
Subject: [PATCH 10/10] Apply projection to `Statistics` in `FilterExec`

---
 datafusion/common/src/stats.rs                | 20 ++++++++
 datafusion/physical-plan/src/filter.rs        |  7 ++-
 .../sqllogictest/test_files/parquet.slt       | 48 +++++++++++++++++++
 3 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs
index d8e62b3045f93..1c774a95d0e8b 100644
--- a/datafusion/common/src/stats.rs
+++ b/datafusion/common/src/stats.rs
@@ -258,6 +258,26 @@ impl Statistics {
         self
     }
 
+    /// Project the statistics to the given column indices.
+    ///
+    /// For example, if we had statistics for columns `{"a", "b", "c"}`,
+    /// projecting to `vec![2, 1]` would return statistics for columns `{"c",
+    /// "b"}`.
+    pub fn project(mut self, projection: Option<&Vec<usize>>) -> Self {
+        let Some(projection) = projection else {
+            return self;
+        };
+
+        // todo: it would be nice to avoid cloning column statistics if
+        // possible (e.g. if the projection did not contain duplicates)
+        self.column_statistics = projection
+            .iter()
+            .map(|&i| self.column_statistics[i].clone())
+            .collect();
+
+        self
+    }
+
     /// Calculates the statistics after `fetch` and `skip` operations apply.
     /// Here, `self` denotes per-partition statistics. Use the `n_partitions`
     /// parameter to compute global statistics in a multi-partition setting.
diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs
index 3da0f21156d94..31cf29d01d1d1 100644
--- a/datafusion/physical-plan/src/filter.rs
+++ b/datafusion/physical-plan/src/filter.rs
@@ -370,7 +370,12 @@ impl ExecutionPlan for FilterExec {
     /// The output statistics of a filtering operation can be estimated if the
     /// predicate's selectivity value can be determined for the incoming data.
     fn statistics(&self) -> Result<Statistics> {
-        Self::statistics_helper(&self.input, self.predicate(), self.default_selectivity)
+        let stats = Self::statistics_helper(
+            &self.input,
+            self.predicate(),
+            self.default_selectivity,
+        )?;
+        Ok(stats.project(self.projection.as_ref()))
     }
 }
 
diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt
index f8b163adc7967..031eb9f0ff389 100644
--- a/datafusion/sqllogictest/test_files/parquet.slt
+++ b/datafusion/sqllogictest/test_files/parquet.slt
@@ -348,3 +348,51 @@ DROP TABLE list_columns;
 # Clean up
 statement ok
 DROP TABLE listing_table;
+
+## Tests for https://github.com/apache/datafusion/issues/13186
+statement ok
+create table cpu (time timestamp, usage_idle float, usage_user float, cpu int);
+
+statement ok
+insert into cpu values ('1970-01-01 00:00:00', 1.0, 2.0, 3);
+
+# must put it into a parquet file to get statistics
+statement ok
+copy (select * from cpu) to 'test_files/scratch/parquet/cpu.parquet';
+
+# Run queries against parquet files
+statement ok
+create external table cpu_parquet
+stored as parquet
+location 'test_files/scratch/parquet/cpu.parquet';
+
+# Double filtering
+#
+# Expect 1 row for both queries
+query PI
+select time, rn
+from (
+  select time, row_number() OVER (ORDER BY usage_idle, time) as rn
+  from cpu
+  where cpu = 3
+) where rn > 0;
+----
+1970-01-01T00:00:00 1
+
+query PI
+select time, rn
+from (
+  select time, row_number() OVER (ORDER BY usage_idle, time) as rn
+  from cpu_parquet
+  where cpu = 3
+) where rn > 0;
+----
+1970-01-01T00:00:00 1
+
+
+# Clean up
+statement ok
+drop table cpu;
+
+statement ok
+drop table cpu_parquet;