Skip to content

Commit 5349c73

Browse files
committed
Merge remote-tracking branch 'apache/main' into alamb/test_upstream_coalesce
2 parents ed31ce1 + e6df27c commit 5349c73

File tree

291 files changed

+9073
-550
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

291 files changed

+9073
-550
lines changed

.github/workflows/rust.yml

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,6 @@ on:
3939
workflow_dispatch:
4040

4141
jobs:
42-
# Check license header
43-
license-header-check:
44-
runs-on: ubuntu-latest
45-
name: Check License Header
46-
steps:
47-
- uses: actions/checkout@v4
48-
- uses: korandoru/hawkeye@v6
49-
5042
# Check crate compiles and base cargo check passes
5143
linux-build-lib:
5244
name: linux build test

Cargo.lock

Lines changed: 19 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ recursive = "0.1.1"
167167
regex = "1.8"
168168
rstest = "0.25.0"
169169
serde_json = "1"
170-
sqlparser = { version = "0.55.0", features = ["visitor"] }
170+
sqlparser = { version = "0.55.0", default-features = false, features = ["std", "visitor"] }
171171
tempfile = "3"
172172
tokio = { version = "1.45", features = ["macros", "rt", "sync"] }
173173
url = "2.5.4"

benchmarks/README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -518,7 +518,6 @@ In addition, topk_tpch is available from the bench.sh script:
518518
./bench.sh run topk_tpch
519519
```
520520

521-
522521
## IMDB
523522

524523
Run Join Order Benchmark (JOB) on IMDB dataset.

datafusion/catalog/src/information_schema.rs

Lines changed: 93 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,12 +103,14 @@ impl InformationSchemaConfig {
103103
// schema name may not exist in the catalog, so we need to check
104104
if let Some(schema) = catalog.schema(&schema_name) {
105105
for table_name in schema.table_names() {
106-
if let Some(table) = schema.table(&table_name).await? {
106+
if let Some(table_type) =
107+
schema.table_type(&table_name).await?
108+
{
107109
builder.add_table(
108110
&catalog_name,
109111
&schema_name,
110112
&table_name,
111-
table.table_type(),
113+
table_type,
112114
);
113115
}
114116
}
@@ -1359,3 +1361,92 @@ impl PartitionStream for InformationSchemaParameters {
13591361
))
13601362
}
13611363
}
1364+
1365+
#[cfg(test)]
1366+
mod tests {
1367+
use super::*;
1368+
use crate::CatalogProvider;
1369+
1370+
#[tokio::test]
1371+
async fn make_tables_uses_table_type() {
1372+
let config = InformationSchemaConfig {
1373+
catalog_list: Arc::new(Fixture),
1374+
};
1375+
let mut builder = InformationSchemaTablesBuilder {
1376+
catalog_names: StringBuilder::new(),
1377+
schema_names: StringBuilder::new(),
1378+
table_names: StringBuilder::new(),
1379+
table_types: StringBuilder::new(),
1380+
schema: Arc::new(Schema::empty()),
1381+
};
1382+
1383+
assert!(config.make_tables(&mut builder).await.is_ok());
1384+
1385+
assert_eq!("BASE TABLE", builder.table_types.finish().value(0));
1386+
}
1387+
1388+
#[derive(Debug)]
1389+
struct Fixture;
1390+
1391+
#[async_trait]
1392+
impl SchemaProvider for Fixture {
1393+
// InformationSchemaConfig::make_tables should use this.
1394+
async fn table_type(&self, _: &str) -> Result<Option<TableType>> {
1395+
Ok(Some(TableType::Base))
1396+
}
1397+
1398+
// InformationSchemaConfig::make_tables used this before `table_type`
1399+
// existed but should not, as it may be expensive.
1400+
async fn table(&self, _: &str) -> Result<Option<Arc<dyn TableProvider>>> {
1401+
panic!("InformationSchemaConfig::make_tables called SchemaProvider::table instead of table_type")
1402+
}
1403+
1404+
fn as_any(&self) -> &dyn Any {
1405+
unimplemented!("not required for these tests")
1406+
}
1407+
1408+
fn table_names(&self) -> Vec<String> {
1409+
vec!["atable".to_string()]
1410+
}
1411+
1412+
fn table_exist(&self, _: &str) -> bool {
1413+
unimplemented!("not required for these tests")
1414+
}
1415+
}
1416+
1417+
impl CatalogProviderList for Fixture {
1418+
fn as_any(&self) -> &dyn Any {
1419+
unimplemented!("not required for these tests")
1420+
}
1421+
1422+
fn register_catalog(
1423+
&self,
1424+
_: String,
1425+
_: Arc<dyn CatalogProvider>,
1426+
) -> Option<Arc<dyn CatalogProvider>> {
1427+
unimplemented!("not required for these tests")
1428+
}
1429+
1430+
fn catalog_names(&self) -> Vec<String> {
1431+
vec!["acatalog".to_string()]
1432+
}
1433+
1434+
fn catalog(&self, _: &str) -> Option<Arc<dyn CatalogProvider>> {
1435+
Some(Arc::new(Self))
1436+
}
1437+
}
1438+
1439+
impl CatalogProvider for Fixture {
1440+
fn as_any(&self) -> &dyn Any {
1441+
unimplemented!("not required for these tests")
1442+
}
1443+
1444+
fn schema_names(&self) -> Vec<String> {
1445+
vec!["aschema".to_string()]
1446+
}
1447+
1448+
fn schema(&self, _: &str) -> Option<Arc<dyn SchemaProvider>> {
1449+
Some(Arc::new(Self))
1450+
}
1451+
}
1452+
}

datafusion/catalog/src/schema.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ use std::sync::Arc;
2626

2727
use crate::table::TableProvider;
2828
use datafusion_common::Result;
29+
use datafusion_expr::TableType;
2930

3031
/// Represents a schema, comprising a number of named tables.
3132
///
@@ -54,6 +55,14 @@ pub trait SchemaProvider: Debug + Sync + Send {
5455
name: &str,
5556
) -> Result<Option<Arc<dyn TableProvider>>, DataFusionError>;
5657

58+
/// Retrieves the type of a specific table from the schema by name, if it exists, otherwise
59+
/// returns `None`. Implementations for which this operation is cheap but [Self::table] is
60+
/// expensive can override this to improve operations that only need the type, e.g.
61+
/// `SELECT * FROM information_schema.tables`.
62+
async fn table_type(&self, name: &str) -> Result<Option<TableType>> {
63+
self.table(name).await.map(|o| o.map(|t| t.table_type()))
64+
}
65+
5766
/// If supported by the implementation, adds a new table named `name` to
5867
/// this schema.
5968
///

datafusion/common/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ base64 = "0.22.1"
5858
half = { workspace = true }
5959
hashbrown = { workspace = true }
6060
indexmap = { workspace = true }
61-
libc = "0.2.173"
61+
libc = "0.2.174"
6262
log = { workspace = true }
6363
object_store = { workspace = true, optional = true }
6464
parquet = { workspace = true, optional = true, default-features = true }

datafusion/common/src/config.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -614,6 +614,13 @@ config_namespace! {
614614
/// during aggregations, if possible
615615
pub enable_topk_aggregation: bool, default = true
616616

617+
/// When set to true attempts to push down dynamic filters generated by operators into the file scan phase.
618+
/// For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer
619+
/// will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans.
620+
/// This means that if we already have 10 timestamps in the year 2025
621+
/// any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan.
622+
pub enable_dynamic_filter_pushdown: bool, default = true
623+
617624
/// When set to true, the optimizer will insert filters before a join between
618625
/// a nullable and non-nullable column to filter out nulls on the nullable side. This
619626
/// filter can add additional overhead when the file format does not fully support

datafusion/common/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ pub mod file_options;
4646
pub mod format;
4747
pub mod hash_utils;
4848
pub mod instant;
49+
mod null_equality;
4950
pub mod parsers;
5051
pub mod pruning;
5152
pub mod rounding;
@@ -79,6 +80,7 @@ pub use functional_dependencies::{
7980
};
8081
use hashbrown::hash_map::DefaultHashBuilder;
8182
pub use join_type::{JoinConstraint, JoinSide, JoinType};
83+
pub use null_equality::NullEquality;
8284
pub use param_value::ParamValues;
8385
pub use scalar::{ScalarType, ScalarValue};
8486
pub use schema_reference::SchemaReference;
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
/// Represents the behavior for null values when evaluating equality. Currently, its primary use
19+
/// case is to define the behavior of joins for null values.
20+
///
21+
/// # Examples
22+
///
23+
/// The following table shows the expected equality behavior for `NullEquality`.
24+
///
25+
/// | A | B | NullEqualsNothing | NullEqualsNull |
26+
/// |------|------|-------------------|----------------|
27+
/// | NULL | NULL | false | true |
28+
/// | NULL | 'b' | false | false |
29+
/// | 'a' | NULL | false | false |
30+
/// | 'a' | 'b' | false | false |
31+
///
32+
/// # Order
33+
///
34+
/// The order on this type represents the "restrictiveness" of the behavior. The more restrictive
35+
/// a behavior is, the fewer elements are considered to be equal to null.
36+
/// [NullEquality::NullEqualsNothing] represents the most restrictive behavior.
37+
///
38+
/// This mirrors the old order with `null_equals_null` booleans, as `false` indicated that
39+
/// `null != null`.
40+
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Hash)]
41+
pub enum NullEquality {
42+
/// Null is *not* equal to anything (`null != null`)
43+
NullEqualsNothing,
44+
/// Null is equal to null (`null == null`)
45+
NullEqualsNull,
46+
}

0 commit comments

Comments
 (0)