
Commit a159393

Test DataFusion 45.0.0 with Sail (#365)
1 parent 39a1b5a commit a159393

19 files changed: +489 −336 lines

Cargo.lock

Lines changed: 258 additions & 222 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 35 additions & 10 deletions
@@ -50,7 +50,7 @@ ryu = "1.0.18"
 either = "1.12.0"
 num-bigint = "0.4.6"
 num-traits = "0.2.19"
-serde_arrow = { version = "0.12.3", features = ["arrow-53"] }
+serde_arrow = { version = "0.12.3", features = ["arrow-54"] }
 mimalloc = { version = "0.1.43", default-features = false }
 rand = "0.8.5"
 rand_chacha = "0.3.1"
@@ -86,16 +86,16 @@ chumsky = { version = "=1.0.0-alpha.7", default-features = false, features = ["p
 # The versions of the following dependencies are managed manually.
 ######

-datafusion = { version = "44.0.0", features = ["serde", "pyarrow", "avro"] }
-datafusion-common = { version = "44.0.0", features = ["object_store", "pyarrow", "avro"] }
-datafusion-expr = { version = "44.0.0" }
-datafusion-expr-common = { version = "44.0.0" }
-datafusion-proto = { version = "44.0.0" }
-datafusion-functions-nested = { version = "44.0.0" }
-datafusion-functions-json = { git = "https://github.com/lakehq/datafusion-functions-json.git", rev = "7bcca26" }
+datafusion = { version = "45.0.0", features = ["serde", "pyarrow", "avro"] }
+datafusion-common = { version = "45.0.0", features = ["object_store", "pyarrow", "avro"] }
+datafusion-expr = { version = "45.0.0" }
+datafusion-expr-common = { version = "45.0.0" }
+datafusion-proto = { version = "45.0.0" }
+datafusion-functions-nested = { version = "45.0.0" }
+datafusion-functions-json = { git = "https://github.com/lakehq/datafusion-functions-json.git", rev = "453183d" }
 # auto-initialize: Changes [`Python::with_gil`] to automatically initialize the Python interpreter if needed.
-pyo3 = { version = "0.22.0", features = ["auto-initialize", "serde"] }
-arrow-flight = { version = "53.3.0" }
+pyo3 = { version = "0.23.4", features = ["auto-initialize", "serde"] }
+arrow-flight = { version = "54.1.0" }
 # The `object_store` version must match the one used in DataFusion.
 object_store = { version = "0.11.0", features = ["aws", "gcp", "azure", "http"] }
 # We use a patched latest version of sqlparser. The version may be different from the one used in DataFusion.
@@ -109,6 +109,31 @@ sqlparser = { git = "https://github.com/lakehq/sqlparser-rs.git", rev = "9ade53d
 [patch.crates-io]
 # Override dependencies to use our forked versions.
 # You can use `path = "..."` to temporarily point to your local copy of the crates to speed up local development.
+datafusion = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-catalog = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-common = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-common-runtime = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-doc = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-execution = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-expr = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-expr-common = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+#datafusion-ffi = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-functions = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-functions-aggregate = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-functions-aggregate-common = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-functions-nested = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-functions-table = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-functions-window = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-functions-window-common = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-macros = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-optimizer = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-physical-expr = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-physical-expr-common = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-physical-optimizer = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-physical-plan = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-proto = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-proto-common = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }
+datafusion-sql = { git = "https://github.com/apache/datafusion.git", rev = "a9fb58c" }

 [profile.release]
 # https://doc.rust-lang.org/cargo/reference/profiles.html#release

crates/sail-cli/src/spark/shell.rs

Lines changed: 7 additions & 1 deletion
@@ -1,3 +1,4 @@
+use std::ffi::CString;
 use std::net::Ipv4Addr;

 use pyo3::prelude::PyAnyMethods;
@@ -32,7 +33,12 @@ pub fn run_pyspark_shell() -> Result<(), Box<dyn std::error::Error>> {
     })?;
     runtime.spawn(server_task);
     Python::with_gil(|py| -> PyResult<_> {
-        let shell = PyModule::from_code_bound(py, SHELL_SOURCE_CODE, "shell.py", "shell")?;
+        let shell = PyModule::from_code(
+            py,
+            CString::new(SHELL_SOURCE_CODE)?.as_c_str(),
+            CString::new("shell.py")?.as_c_str(),
+            CString::new("shell")?.as_c_str(),
+        )?;
         shell
             .getattr("run_pyspark_shell")?
             .call((server_port,), None)?;
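
Note: pyo3 0.23 removes the `_bound` constructors, and `PyModule::from_code` now takes `&CStr` for the source, file name, and module name. A minimal standalone sketch of the new calling convention, assuming pyo3 0.23 with the `auto-initialize` feature (the `greet` module here is hypothetical, not part of Sail):

use std::ffi::CString;

use pyo3::prelude::*;
use pyo3::types::PyModule;

fn main() -> PyResult<()> {
    // The &str source is converted through CString first; CString::new
    // fails on interior NUL bytes, and that error converts into a PyErr.
    let code = CString::new("def greet():\n    return 'hello'")?;
    Python::with_gil(|py| {
        let module = PyModule::from_code(
            py,
            code.as_c_str(),
            CString::new("greet.py")?.as_c_str(),
            CString::new("greet")?.as_c_str(),
        )?;
        let out: String = module.getattr("greet")?.call0()?.extract()?;
        assert_eq!(out, "hello");
        Ok(())
    })
}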

crates/sail-execution/src/codec.rs

Lines changed: 16 additions & 1 deletion
@@ -6,16 +6,20 @@ use datafusion::arrow::datatypes::{DataType, Schema, TimeUnit};
 use datafusion::common::parsers::CompressionTypeVariant;
 use datafusion::common::{plan_datafusion_err, plan_err, JoinSide, Result};
 use datafusion::datasource::file_format::file_compression_type::FileCompressionType;
+#[allow(deprecated)]
 use datafusion::datasource::physical_plan::{ArrowExec, NdJsonExec};
+use datafusion::datasource::physical_plan::{ArrowSource, JsonSource};
 use datafusion::execution::FunctionRegistry;
 use datafusion::functions::string::overlay::OverlayFunc;
 use datafusion::logical_expr::{AggregateUDF, AggregateUDFImpl, ScalarUDF, ScalarUDFImpl};
 use datafusion::physical_expr::LexOrdering;
 use datafusion::physical_plan::joins::utils::{ColumnIndex, JoinFilter};
 use datafusion::physical_plan::joins::SortMergeJoinExec;
+#[allow(deprecated)]
 use datafusion::physical_plan::memory::MemoryExec;
 use datafusion::physical_plan::recursive_query::RecursiveQueryExec;
 use datafusion::physical_plan::sorts::partial_sort::PartialSortExec;
+#[allow(deprecated)]
 use datafusion::physical_plan::values::ValuesExec;
 use datafusion::physical_plan::work_table::WorkTableExec;
 use datafusion::physical_plan::{ExecutionPlan, Partitioning};
@@ -226,6 +230,7 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec {
                 let sort_information =
                     self.try_decode_lex_orderings(&sort_information, registry, &schema)?;
                 Ok(Arc::new(
+                    #[allow(deprecated)]
                     MemoryExec::try_new(&partitions, Arc::new(schema), projection)?
                         .with_show_sizes(show_sizes)
                         .try_with_sort_information(sort_information)?,
@@ -234,6 +239,7 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec {
             NodeKind::Values(gen::ValuesExecNode { data, schema }) => {
                 let schema = self.try_decode_schema(&schema)?;
                 let data = read_record_batches(&data)?;
+                #[allow(deprecated)]
                 Ok(Arc::new(ValuesExec::try_new_from_batches(
                     Arc::new(schema),
                     data,
@@ -247,9 +253,11 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec {
                     &self.try_decode_message(&base_config)?,
                     registry,
                     self,
+                    Arc::new(JsonSource::new()), // TODO: Look into configuring this if needed
                 )?;
                 let file_compression_type: FileCompressionType =
                     self.try_decode_file_compression_type(file_compression_type)?;
+                #[allow(deprecated)]
                 Ok(Arc::new(NdJsonExec::new(
                     base_config,
                     file_compression_type,
@@ -260,7 +268,9 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec {
                     &self.try_decode_message(&base_config)?,
                     registry,
                     self,
+                    Arc::new(ArrowSource::default()), // TODO: Look into configuring this if needed
                 )?;
+                #[allow(deprecated)]
                 Ok(Arc::new(ArrowExec::new(base_config)))
             }
             NodeKind::WorkTable(gen::WorkTableExecNode { name, schema }) => {
@@ -334,7 +344,11 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec {
                         })
                     })
                     .collect::<Result<Vec<_>>>()?;
-                Some(JoinFilter::new(expression, column_indices, schema))
+                Some(JoinFilter::new(
+                    expression,
+                    column_indices,
+                    Arc::new(schema),
+                ))
             } else {
                 None
             };
@@ -364,6 +378,7 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec {
     }

     fn try_encode(&self, node: Arc<dyn ExecutionPlan>, buf: &mut Vec<u8>) -> Result<()> {
+        #[allow(deprecated)]
         let node_kind = if let Some(range) = node.as_any().downcast_ref::<RangeExec>() {
             let schema = self.try_encode_schema(range.schema().as_ref())?;
             NodeKind::Range(gen::RangeExecNode {
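
Note: DataFusion 45 deprecates MemoryExec, ValuesExec, NdJsonExec, and ArrowExec (note the new JsonSource/ArrowSource handles threaded through the decoders above), and this commit keeps them alive by scoping #[allow(deprecated)] to each remaining use rather than silencing the lint crate-wide. A minimal sketch of that scoping pattern, with a hypothetical OldApi type standing in for the deprecated operators:

#[deprecated(note = "kept only until the migration to the source-based plans")]
struct OldApi;

#[allow(deprecated)] // the impl block still has to name the old type
impl OldApi {
    fn try_new() -> Result<OldApi, String> {
        Ok(OldApi)
    }
}

fn main() {
    // An attribute on a statement is stable Rust and confines the opt-out
    // to this single deprecated call, as in `try_encode` above.
    #[allow(deprecated)]
    let plan = OldApi::try_new();
    assert!(plan.is_ok());
}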

crates/sail-plan/src/extension/function/spark_array.rs

Lines changed: 7 additions & 4 deletions
@@ -9,9 +9,11 @@ use datafusion::arrow::array::{
 use datafusion::arrow::buffer::OffsetBuffer;
 use datafusion::arrow::datatypes::{DataType, Field};
 use datafusion_common::utils::SingleRowListArrayBuilder;
-use datafusion_common::{internal_err, plan_err, ExprSchema, Result};
+use datafusion_common::{internal_err, plan_err, Result};
 use datafusion_expr::type_coercion::binary::comparison_coercion;
-use datafusion_expr::{ColumnarValue, Expr, ScalarUDFImpl, Signature, TypeSignature, Volatility};
+use datafusion_expr::{
+    ColumnarValue, ReturnInfo, ReturnTypeArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility,
+};

 use crate::extension::function::functions_nested_utils::make_scalar_function;

@@ -75,8 +77,9 @@ impl ScalarUDFImpl for SparkArray {
         }
     }

-    fn is_nullable(&self, _args: &[Expr], _schema: &dyn ExprSchema) -> bool {
-        false
+    fn return_type_from_args(&self, args: ReturnTypeArgs) -> Result<ReturnInfo> {
+        let return_type = self.return_type(args.arg_types)?;
+        Ok(ReturnInfo::new_non_nullable(return_type))
     }

     fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
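
Note: DataFusion 45 drops ScalarUDFImpl::is_nullable; a UDF now reports its return type and nullability together through return_type_from_args. A minimal sketch under that assumption, using a hypothetical always_one UDF rather than Sail's SparkArray:

use std::any::Any;

use datafusion::arrow::datatypes::DataType;
use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{
    ColumnarValue, ReturnInfo, ReturnTypeArgs, ScalarUDF, ScalarUDFImpl, Signature, Volatility,
};

/// Hypothetical UDF that always returns a non-null Int64, used only to
/// illustrate the DataFusion 45 nullability API.
#[derive(Debug)]
struct AlwaysOne {
    signature: Signature,
}

impl AlwaysOne {
    fn new() -> Self {
        Self {
            signature: Signature::variadic_any(Volatility::Immutable),
        }
    }
}

impl ScalarUDFImpl for AlwaysOne {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn name(&self) -> &str {
        "always_one"
    }

    fn signature(&self) -> &Signature {
        &self.signature
    }

    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
        Ok(DataType::Int64)
    }

    // Replaces the removed `is_nullable`: the return type and its
    // nullability are reported together through `ReturnInfo`.
    fn return_type_from_args(&self, args: ReturnTypeArgs) -> Result<ReturnInfo> {
        let return_type = self.return_type(args.arg_types)?;
        Ok(ReturnInfo::new_non_nullable(return_type))
    }

    fn invoke(&self, _args: &[ColumnarValue]) -> Result<ColumnarValue> {
        Ok(ColumnarValue::Scalar(ScalarValue::Int64(Some(1))))
    }
}

fn main() {
    let udf = ScalarUDF::new_from_impl(AlwaysOne::new());
    println!("registered UDF: {}", udf.name());
}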

crates/sail-plan/src/extension/function/spark_concat.rs

Lines changed: 10 additions & 3 deletions
@@ -5,7 +5,6 @@ use datafusion::arrow::datatypes::DataType;
 use datafusion::functions::string::concat::ConcatFunc;
 use datafusion_common::utils::list_ndims;
 use datafusion_common::{plan_err, ExprSchema, Result};
-use datafusion_expr::type_coercion::binary::get_wider_type;
 use datafusion_expr::{ColumnarValue, Expr, ExprSchemable, ScalarUDFImpl, Signature, Volatility};
 use datafusion_functions_nested::concat::ArrayConcat;

@@ -41,7 +40,7 @@ impl ScalarUDFImpl for SparkConcat {
         &self.signature
     }

-    /// [Credit]: <https://github.com/apache/datafusion/blob/7b2284c8a0b49234e9607bfef10d73ef788d9458/datafusion/functions-nested/src/concat.rs#L274-L301>
+    /// [Credit]: <https://github.com/apache/datafusion/blob/7ccc6d7c55ae9dbcb7dee031f394bf11a03000ba/datafusion/functions-nested/src/concat.rs#L276-L310>
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
         if arg_types
             .iter()
@@ -56,7 +55,15 @@
                 let dims = list_ndims(arg_type);
                 expr_type = match max_dims.cmp(&dims) {
                     Ordering::Greater => expr_type,
-                    Ordering::Equal => get_wider_type(&expr_type, arg_type)?,
+                    Ordering::Equal => {
+                        if expr_type == DataType::Null {
+                            arg_type.clone()
+                        } else if !expr_type.equals_datatype(arg_type) {
+                            return plan_err!("It is not possible to concatenate arrays of different types. Expected: {expr_type}, got: {arg_type}");
+                        } else {
+                            expr_type
+                        }
+                    }
                     Ordering::Less => {
                         max_dims = dims;
                         arg_type.clone()
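
Note: with get_wider_type dropped from the imports, equal-dimension element types are now merged by an explicit rule: Null defers to the other side, identical types pass through, and any other pair is a plan error. A sketch of the same rule as a standalone function (merge_equal_dim_types is a hypothetical helper, not Sail API):

use datafusion::arrow::datatypes::DataType;
use datafusion_common::{plan_err, Result};

// Null yields to the concrete type; equal types pass through unchanged;
// any remaining mismatch is rejected at planning time.
fn merge_equal_dim_types(expr_type: DataType, arg_type: &DataType) -> Result<DataType> {
    if expr_type == DataType::Null {
        Ok(arg_type.clone())
    } else if !expr_type.equals_datatype(arg_type) {
        plan_err!("It is not possible to concatenate arrays of different types. Expected: {expr_type}, got: {arg_type}")
    } else {
        Ok(expr_type)
    }
}

fn main() -> Result<()> {
    assert_eq!(
        merge_equal_dim_types(DataType::Null, &DataType::Int32)?,
        DataType::Int32
    );
    assert!(merge_equal_dim_types(DataType::Utf8, &DataType::Int32).is_err());
    Ok(())
}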

crates/sail-plan/src/extension/source/rename.rs

Lines changed: 11 additions & 2 deletions
@@ -49,13 +49,22 @@ impl RenameTableProvider {

     fn to_inner_expr(&self, expr: &Expr) -> Result<Expr> {
         let rewrite = |e: Expr| -> Result<Transformed<Expr>> {
-            if let Expr::Column(Column { name, relation }) = e {
+            if let Expr::Column(Column {
+                name,
+                relation,
+                spans,
+            }) = e
+            {
                 let name = self
                     .names
                     .get(&name)
                     .ok_or_else(|| plan_datafusion_err!("column {name} not found"))?
                     .clone();
-                Ok(Transformed::yes(Expr::Column(Column { name, relation })))
+                Ok(Transformed::yes(Expr::Column(Column {
+                    name,
+                    relation,
+                    spans,
+                })))
             } else {
                 Ok(Transformed::no(e))
             }
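
Note: this rewrite changes only because Column gained a spans field (source-location tracking) in DataFusion 45, so exhaustive patterns must destructure and rebuild every field. A minimal sketch of the same carry-through, with a hypothetical rename_column helper:

use datafusion_common::Column;

// Rebuild the column with a new name while carrying `relation` and the
// new `spans` field through untouched.
fn rename_column(column: Column, new_name: String) -> Column {
    let Column {
        name: _,
        relation,
        spans,
    } = column;
    Column {
        name: new_name,
        relation,
        spans,
    }
}

fn main() {
    let renamed = rename_column(Column::new_unqualified("a"), "b".to_string());
    assert_eq!(renamed.name, "b");
}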

crates/sail-python-udf/src/cereal/pyspark_udf.rs

Lines changed: 9 additions & 6 deletions
@@ -1,7 +1,7 @@
 use pyo3::exceptions::PyValueError;
 use pyo3::prelude::PyAnyMethods;
 use pyo3::types::PyModule;
-use pyo3::{intern, PyObject, Python, ToPyObject};
+use pyo3::{intern, Bound, IntoPyObject, PyAny, Python};
 use sail_common::spec;

 use crate::cereal::{check_python_udf_version, should_write_config};
@@ -11,24 +11,27 @@ use crate::error::{PyUdfError, PyUdfResult};
 pub struct PySparkUdfPayload;

 impl PySparkUdfPayload {
-    pub fn load(py: Python, data: &[u8]) -> PyUdfResult<PyObject> {
+    pub fn load<'py>(py: Python<'py>, data: &[u8]) -> PyUdfResult<Bound<'py, PyAny>> {
         let (eval_type, v) = data
             .split_at_checked(size_of::<i32>())
             .ok_or_else(|| PyUdfError::invalid("missing eval_type"))?;
         let eval_type = eval_type
             .try_into()
             .map_err(|e| PyValueError::new_err(format!("eval_type bytes: {e}")))?;
         let eval_type = i32::from_be_bytes(eval_type);
-        let infile = PyModule::import_bound(py, intern!(py, "io"))?
+        let infile = PyModule::import(py, intern!(py, "io"))?
             .getattr(intern!(py, "BytesIO"))?
             .call1((v,))?;
-        let serializer = PyModule::import_bound(py, intern!(py, "pyspark.serializers"))?
+        let serializer = PyModule::import(py, intern!(py, "pyspark.serializers"))?
             .getattr(intern!(py, "CPickleSerializer"))?
             .call0()?;
-        let tuple = PyModule::import_bound(py, intern!(py, "pyspark.worker"))?
+        let tuple = PyModule::import(py, intern!(py, "pyspark.worker"))?
            .getattr(intern!(py, "read_udfs"))?
            .call1((serializer, infile, eval_type))?;
-        Ok(tuple.get_item(0)?.to_object(py))
+        tuple
+            .get_item(0)?
+            .into_pyobject(py)
+            .map_err(|e| PyUdfError::PythonError(e.into()))
     }

     pub fn build(
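
Note: this is the pyo3 0.22 to 0.23 migration in miniature: PyModule::import replaces import_bound and returns Bound<'py, PyModule>, and the infallible ToPyObject is replaced by the fallible IntoPyObject. A small sketch of both patterns, assuming pyo3 0.23 with an embedded interpreter (e.g. the auto-initialize feature enabled in Cargo.toml above); the math.sqrt call is illustrative only:

use pyo3::prelude::*;
use pyo3::types::PyModule;

fn sqrt_of(py: Python<'_>, x: f64) -> PyResult<f64> {
    // `import` returns a GIL-bound handle instead of a GIL-independent one.
    let math = PyModule::import(py, "math")?;
    // `into_pyobject` is the 0.23 conversion trait; it returns a Result.
    let arg = x.into_pyobject(py)?;
    math.getattr("sqrt")?.call1((arg,))?.extract()
}

fn main() -> PyResult<()> {
    Python::with_gil(|py| {
        assert_eq!(sqrt_of(py, 9.0)?, 3.0);
        Ok(())
    })
}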

crates/sail-python-udf/src/cereal/pyspark_udtf.rs

Lines changed: 10 additions & 7 deletions
@@ -3,7 +3,7 @@ use datafusion::arrow::pyarrow::ToPyArrow;
 use pyo3::exceptions::PyValueError;
 use pyo3::prelude::PyAnyMethods;
 use pyo3::types::PyModule;
-use pyo3::{intern, PyObject, PyResult, Python, ToPyObject};
+use pyo3::{intern, Bound, IntoPyObject, PyAny, PyResult, Python};
 use sail_common::spec;

 use crate::cereal::{check_python_udf_version, should_write_config};
@@ -13,24 +13,27 @@ use crate::error::{PyUdfError, PyUdfResult};
 pub struct PySparkUdtfPayload;

 impl PySparkUdtfPayload {
-    pub fn load(py: Python, v: &[u8]) -> PyUdfResult<PyObject> {
+    pub fn load<'py>(py: Python<'py>, v: &[u8]) -> PyUdfResult<Bound<'py, PyAny>> {
         let (eval_type, v) = v
             .split_at_checked(size_of::<i32>())
             .ok_or_else(|| PyUdfError::invalid("missing eval_type"))?;
         let eval_type = eval_type
             .try_into()
             .map_err(|e| PyValueError::new_err(format!("eval_type bytes: {e}")))?;
         let eval_type = i32::from_be_bytes(eval_type);
-        let infile = PyModule::import_bound(py, intern!(py, "io"))?
+        let infile = PyModule::import(py, intern!(py, "io"))?
             .getattr(intern!(py, "BytesIO"))?
             .call1((v,))?;
-        let serializer = PyModule::import_bound(py, intern!(py, "pyspark.serializers"))?
+        let serializer = PyModule::import(py, intern!(py, "pyspark.serializers"))?
             .getattr(intern!(py, "CPickleSerializer"))?
             .call0()?;
-        let tuple = PyModule::import_bound(py, intern!(py, "pyspark.worker"))?
+        let tuple = PyModule::import(py, intern!(py, "pyspark.worker"))?
             .getattr(intern!(py, "read_udtf"))?
             .call1((serializer, infile, eval_type))?;
-        Ok(tuple.get_item(0)?.to_object(py))
+        tuple
+            .get_item(0)?
+            .into_pyobject(py)
+            .map_err(|e| PyUdfError::PythonError(e.into()))
     }

     pub fn build(
@@ -70,7 +73,7 @@ impl PySparkUdtfPayload {

         let type_string = Python::with_gil(|py| -> PyResult<String> {
             let return_type = return_type.to_pyarrow(py)?.clone_ref(py).into_bound(py);
-            PyModule::import_bound(py, intern!(py, "pyspark.sql.pandas.types"))?
+            PyModule::import(py, intern!(py, "pyspark.sql.pandas.types"))?
                 .getattr(intern!(py, "from_arrow_type"))?
                 .call1((return_type,))?
                 .getattr(intern!(py, "json"))?