Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions dask_sql/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,9 @@ def f(x):
f (:obj:`Callable`): The function to register
name (:obj:`str`): Under which name should the new function be addressable in SQL
parameters (:obj:`List[Tuple[str, type]]`): A list of tuples of parameter name and parameter type.
Use `numpy dtypes <https://numpy.org/doc/stable/reference/arrays.dtypes.html>`_ if possible.
Use `numpy dtypes <https://numpy.org/doc/stable/reference/arrays.dtypes.html>`_ if possible. This
function is sensitive to the order of specified parameters when `row_udf=True`, and it is assumed
that column arguments are specified in order, followed by scalar arguments.
return_type (:obj:`type`): The return type of the function
replace (:obj:`bool`): If `True`, do not raise an error if a function with the same name is already
present; instead, replace the original function. Default is `False`.
Expand Down Expand Up @@ -964,8 +966,7 @@ def _register_callable(
schema = self.schema[schema_name]

if not aggregation:
f = UDF(f, row_udf, return_type)

f = UDF(f, row_udf, parameters, return_type)
lower_name = name.lower()
if lower_name in schema.functions:
if replace:
Expand Down
12 changes: 8 additions & 4 deletions dask_sql/datacontainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def assign(self) -> dd.DataFrame:


class UDF:
def __init__(self, func, row_udf: bool, return_type=None):
def __init__(self, func, row_udf: bool, params, return_type=None):
"""
Helper class that handles different types of UDFs and manages
how they should be mapped to dask operations. Two versions of
Expand All @@ -196,6 +196,8 @@ def __init__(self, func, row_udf: bool, return_type=None):
self.row_udf = row_udf
self.func = func

self.names = [param[0] for param in params]

if return_type is None:
# These UDFs go through apply and without providing
# a return type, dask will attempt to guess it, and
Expand All @@ -212,9 +214,11 @@ def __call__(self, *args, **kwargs):
column_args.append(operand)
else:
scalar_args.append(operand)
df = column_args[0].to_frame()
for col in column_args[1:]:
df[col.name] = col

df = column_args[0].to_frame(self.names[0])
for name, col in zip(self.names[1:], column_args[1:]):
df[name] = col

result = df.apply(
self.func, axis=1, args=tuple(scalar_args), meta=self.meta
).astype(self.meta[1])
Expand Down
15 changes: 15 additions & 0 deletions tests/integration/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,19 @@ def df_simple():
return pd.DataFrame({"a": [1, 2, 3], "b": [1.1, 2.2, 3.3]})


@pytest.fixture()
def df_wide():
    """Fixture: a 3-row frame with five integer columns ``a``..``e``.

    Values run 0..14 column by column, i.e. column *i* holds
    ``[3*i, 3*i + 1, 3*i + 2]`` — same data as spelling each list out.
    """
    data = {name: list(range(3 * idx, 3 * idx + 3)) for idx, name in enumerate("abcde")}
    return pd.DataFrame(data)


@pytest.fixture()
def df():
np.random.seed(42)
Expand Down Expand Up @@ -126,6 +139,7 @@ def gpu_datetime_table(datetime_table):
@pytest.fixture()
def c(
df_simple,
df_wide,
df,
user_table_1,
user_table_2,
Expand All @@ -142,6 +156,7 @@ def c(
):
dfs = {
"df_simple": df_simple,
"df_wide": df_wide,
"df": df,
"user_table_1": user_table_1,
"user_table_2": user_table_2,
Expand Down
25 changes: 23 additions & 2 deletions tests/integration/test_function.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import itertools
import operator

import dask.dataframe as dd
Expand Down Expand Up @@ -25,7 +26,7 @@ def f(x):

def test_custom_function_row(c, df):
def f(row):
return row["a"] ** 2
return row["x"] ** 2

c.register_function(f, "f", [("x", np.float64)], np.float64, row_udf=True)

Expand All @@ -40,13 +41,33 @@ def f(row):
assert_frame_equal(return_df.reset_index(drop=True), df[["a"]] ** 2)


@pytest.mark.parametrize("colnames", list(itertools.combinations(["a", "b", "c"], 2)))
def test_custom_function_any_colnames(colnames, df_wide, c):
    """A row UDF must see its *declared* parameter names (``x``, ``y``)
    no matter which table columns are passed in the SQL call.

    ``df_wide`` has more columns than the UDF consumes, so the pairs
    drawn from a/b/c also exercise selecting the right subset.
    """

    def f(row):
        return row["x"] + row["y"]

    c.register_function(
        f, "f", [("x", np.int64), ("y", np.int64)], np.int64, row_udf=True
    )

    col_x, col_y = colnames
    result = c.sql(f"SELECT F({col_x},{col_y}) FROM df_wide")

    expected = df_wide[col_x] + df_wide[col_y]
    dd.assert_eq(expected, result.iloc[:, 0], check_names=False)


@pytest.mark.parametrize(
"retty",
[None, np.float64, np.float32, np.int64, np.int32, np.int16, np.int8, np.bool_],
)
def test_custom_function_row_return_types(c, df, retty):
def f(row):
return row["a"] ** 2
return row["x"] ** 2

if retty is None:
with pytest.raises(ValueError):
Expand Down
2 changes: 2 additions & 0 deletions tests/integration/test_show.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def test_tables(c):
"Table": [
"df",
"df_simple",
"df_wide",
"user_table_1",
"user_table_2",
"long_table",
Expand All @@ -47,6 +48,7 @@ def test_tables(c):
else [
"df",
"df_simple",
"df_wide",
"user_table_1",
"user_table_2",
"long_table",
Expand Down