Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions dask_sql/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,9 @@ def f(x):
f (:obj:`Callable`): The function to register
name (:obj:`str`): Under which name should the new function be addressable in SQL
parameters (:obj:`List[Tuple[str, type]]`): A list of tuples of parameter name and parameter type.
Use `numpy dtypes <https://numpy.org/doc/stable/reference/arrays.dtypes.html>`_ if possible.
Use `numpy dtypes <https://numpy.org/doc/stable/reference/arrays.dtypes.html>`_ if possible. This
function is sensitive to the order of specified parameters when `row_udf=True`, and it is assumed
that column arguments are specified in order, followed by scalar arguments.
return_type (:obj:`type`): The return type of the function
replace (:obj:`bool`): If `True`, do not raise an error if a function with the same name is already
present; instead, replace the original function. Default is `False`.
Expand Down Expand Up @@ -964,8 +966,7 @@ def _register_callable(
schema = self.schema[schema_name]

if not aggregation:
f = UDF(f, row_udf, return_type)

f = UDF(f, row_udf, parameters, return_type)
lower_name = name.lower()
if lower_name in schema.functions:
if replace:
Expand Down
12 changes: 8 additions & 4 deletions dask_sql/datacontainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def assign(self) -> dd.DataFrame:


class UDF:
def __init__(self, func, row_udf: bool, return_type=None):
def __init__(self, func, row_udf: bool, params, return_type=None):
"""
Helper class that handles different types of UDFs and manages
how they should be mapped to dask operations. Two versions of
Expand All @@ -196,6 +196,8 @@ def __init__(self, func, row_udf: bool, return_type=None):
self.row_udf = row_udf
self.func = func

self.names = [param[0] for param in params]

if return_type is None:
# These UDFs go through apply and without providing
# a return type, dask will attempt to guess it, and
Expand All @@ -212,9 +214,11 @@ def __call__(self, *args, **kwargs):
column_args.append(operand)
else:
scalar_args.append(operand)
df = column_args[0].to_frame()
for col in column_args[1:]:
df[col.name] = col

df = column_args[0].to_frame(self.names[0])
for name, col in zip(self.names[1:], column_args[1:]):
df[name] = col

result = df.apply(
self.func, axis=1, args=tuple(scalar_args), meta=self.meta
).astype(self.meta[1])
Expand Down
15 changes: 15 additions & 0 deletions tests/integration/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,19 @@ def df_simple():
return pd.DataFrame({"a": [1, 2, 3], "b": [1.1, 2.2, 3.3]})


@pytest.fixture()
def df_wide():
    """Fixture: a 3-row frame with five integer columns ``a``..``e``.

    Values run 0..14 column by column, i.e. column *i* holds
    ``[3*i, 3*i + 1, 3*i + 2]`` — same data as spelling each list out.
    """
    data = {name: list(range(3 * idx, 3 * idx + 3)) for idx, name in enumerate("abcde")}
    return pd.DataFrame(data)


@pytest.fixture()
def df():
np.random.seed(42)
Expand Down Expand Up @@ -126,6 +139,7 @@ def gpu_datetime_table(datetime_table):
@pytest.fixture()
def c(
df_simple,
df_wide,
df,
user_table_1,
user_table_2,
Expand All @@ -142,6 +156,7 @@ def c(
):
dfs = {
"df_simple": df_simple,
"df_wide": df_wide,
"df": df,
"user_table_1": user_table_1,
"user_table_2": user_table_2,
Expand Down
25 changes: 23 additions & 2 deletions tests/integration/test_function.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import itertools
import operator

import dask.dataframe as dd
Expand Down Expand Up @@ -25,7 +26,7 @@ def f(x):

def test_custom_function_row(c, df):
def f(row):
return row["a"] ** 2
return row["x"] ** 2

c.register_function(f, "f", [("x", np.float64)], np.float64, row_udf=True)

Expand All @@ -40,13 +41,33 @@ def f(row):
assert_frame_equal(return_df.reset_index(drop=True), df[["a"]] ** 2)


@pytest.mark.parametrize("colnames", list(itertools.combinations(["a", "b", "c"], 2)))
def test_custom_function_any_colnames(colnames, df_wide, c):
    """A row UDF must see its *declared* parameter names (``x``, ``y``)
    no matter which table columns are passed in the SQL call.

    ``df_wide`` has more columns than the UDF consumes, so the pairs
    drawn from a/b/c also exercise selecting the right subset.
    """

    def f(row):
        return row["x"] + row["y"]

    c.register_function(
        f, "f", [("x", np.int64), ("y", np.int64)], np.int64, row_udf=True
    )

    col_x, col_y = colnames
    result = c.sql(f"SELECT F({col_x},{col_y}) FROM df_wide")

    expected = df_wide[col_x] + df_wide[col_y]
    dd.assert_eq(expected, result.iloc[:, 0], check_names=False)


@pytest.mark.parametrize(
"retty",
[None, np.float64, np.float32, np.int64, np.int32, np.int16, np.int8, np.bool_],
)
def test_custom_function_row_return_types(c, df, retty):
def f(row):
return row["a"] ** 2
return row["x"] ** 2

if retty is None:
with pytest.raises(ValueError):
Expand Down
2 changes: 2 additions & 0 deletions tests/integration/test_show.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def test_tables(c):
"Table": [
"df",
"df_simple",
"df_wide",
"user_table_1",
"user_table_2",
"long_table",
Expand All @@ -47,6 +48,7 @@ def test_tables(c):
else [
"df",
"df_simple",
"df_wide",
"user_table_1",
"user_table_2",
"long_table",
Expand Down