Add configuration variable for CPU/GPU decimal support (#1131)

charlesbluca · web-flow · commit ab7340b1e8e3 · 2023-05-09T12:35:26.000-04:00
* Add configuration variable for decimal support

* Make decimal import lazy
diff --git a/conftest.py b/conftest.py
@@ -15,6 +15,8 @@ def pytest_runtest_setup(item):
             pytest.skip("need --rungpu option to run")
         # FIXME: P2P shuffle isn't fully supported on GPU, so we must explicitly disable it
         dask.config.set({"dataframe.shuffle.algorithm": "tasks"})
+        # manually enable cudf decimal support
+        dask.config.set({"sql.mappings.decimal_support": "cudf"})
     else:
         dask.config.set({"dataframe.shuffle.algorithm": None})
     if "queries" in item.keywords and not item.config.getoption("--runqueries"):
diff --git a/dask_sql/mappings.py b/dask_sql/mappings.py
@@ -1,19 +1,14 @@
 import logging
-from decimal import Decimal
 from typing import Any
 
 import dask.array as da
+import dask.config as dask_config
 import dask.dataframe as dd
 import numpy as np
 import pandas as pd
 
 from dask_planner.rust import DaskTypeMap, SqlTypeName
 
-try:
-    import cudf
-except ImportError:
-    cudf = None
-
 logger = logging.getLogger(__name__)
 
 
@@ -54,7 +49,7 @@
 _SQL_TO_PYTHON_SCALARS = {
     "SqlTypeName.DOUBLE": np.float64,
     "SqlTypeName.FLOAT": np.float32,
-    "SqlTypeName.DECIMAL": Decimal,
+    "SqlTypeName.DECIMAL": np.float32,
     "SqlTypeName.BIGINT": np.int64,
     "SqlTypeName.INTEGER": np.int32,
     "SqlTypeName.SMALLINT": np.int16,
@@ -71,8 +66,7 @@
 _SQL_TO_PYTHON_FRAMES = {
     "SqlTypeName.DOUBLE": np.float64,
     "SqlTypeName.FLOAT": np.float32,
-    # a column of Decimals in pandas is `object`, but cuDF has a dedicated dtype
-    "SqlTypeName.DECIMAL": object if not cudf else cudf.Decimal128Dtype(38, 10),
+    "SqlTypeName.DECIMAL": np.float64,  # We use np.float64 always, even though we might be able to use a smaller type
     "SqlTypeName.BIGINT": pd.Int64Dtype(),
     "SqlTypeName.INTEGER": pd.Int32Dtype(),
     "SqlTypeName.SMALLINT": pd.Int16Dtype(),
@@ -151,6 +145,14 @@ def sql_to_python_value(sql_type: "SqlTypeName", literal_value: Any) -> Any:
 
         return literal_value
 
+    elif (
+        sql_type == SqlTypeName.DECIMAL
+        and dask_config.get("sql.mappings.decimal_support") == "cudf"
+    ):
+        from decimal import Decimal
+
+        python_type = Decimal
+
     elif sql_type == SqlTypeName.INTERVAL_DAY:
         return np.timedelta64(literal_value[0], "D") + np.timedelta64(
             literal_value[1], "ms"
@@ -219,7 +221,16 @@ def sql_to_python_value(sql_type: "SqlTypeName", literal_value: Any) -> Any:
 def sql_to_python_type(sql_type: "SqlTypeName", *args) -> type:
     """Turn an SQL type into a dataframe dtype"""
     try:
-        if str(sql_type) == "SqlTypeName.DECIMAL":
+        if (
+            sql_type == SqlTypeName.DECIMAL
+            and dask_config.get("sql.mappings.decimal_support") == "cudf"
+        ):
+            try:
+                import cudf
+            except ImportError:
+                raise ModuleNotFoundError(
+                    "Setting `sql.mappings.decimal_support=cudf` requires cudf"
+                )
             return cudf.Decimal128Dtype(*args)
         return _SQL_TO_PYTHON_FRAMES[str(sql_type)]
     except KeyError:  # pragma: no cover
diff --git a/dask_sql/sql-schema.yaml b/dask_sql/sql-schema.yaml
@@ -75,3 +75,12 @@ properties:
               optimization (when possible). ``nelem`` is defined as the limit or ``k`` value times the
               number of columns. Default is 1000000, corresponding to a LIMIT clause of 1 million in a
               1 column table.
+
+      mappings:
+        type: object
+        properties:
+
+          decimal_support:
+            type: string
+            description:
+              Decides how to handle decimal scalars/columns. ``"pandas"`` handling will treat decimal scalars and columns as floats and float64 columns, respectively, while ``"cudf"`` handling treats decimal scalars as ``decimal.Decimal`` objects and decimal columns as ``cudf.Decimal128Dtype`` columns, handling precision/scale accordingly. Default is ``"pandas"``, but ``"cudf"`` should be used if attempting to work with decimal columns on GPU.
diff --git a/dask_sql/sql.yaml b/dask_sql/sql.yaml
@@ -18,3 +18,6 @@ sql:
 
   sort:
     topk-nelem-limit: 1000000
+
+  mappings:
+    decimal_support: "pandas"
diff --git a/tests/integration/test_filter.py b/tests/integration/test_filter.py
@@ -251,20 +251,16 @@ def test_filtered_csv(tmpdir, c):
     assert_eq(return_df, expected_df)
 
 
-@pytest.mark.gpu
-def test_filter_decimal(c):
-    import cudf
-
-    df = cudf.DataFrame(
+@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)])
+def test_filter_decimal(c, gpu):
+    df = pd.DataFrame(
         {
             "a": [304.5, 35.305, 9.043, 102.424, 53.34],
             "b": [2.2, 82.4, 42, 76.9, 54.4],
             "c": [1, 2, 2, 5, 9],
         }
     )
-    df["a"] = df["a"].astype(cudf.Decimal64Dtype(12, 3))
-    df["b"] = df["b"].astype(cudf.Decimal64Dtype(7, 1))
-    c.create_table("df", df)
+    c.create_table("df", df, gpu=gpu)
 
     result_df = c.sql(
         """
@@ -273,7 +269,7 @@ def test_filter_decimal(c):
         FROM
             df
         WHERE
-            a < b
+            CAST(a AS DECIMAL) < CAST(b AS DECIMAL)
         """
     )
 
@@ -284,16 +280,19 @@ def test_filter_decimal(c):
     result_df = c.sql(
         """
         SELECT
-            b
+            CAST(b AS DECIMAL) as b
         FROM
             df
         WHERE
-            a < decimal '100.2'
+            CAST(a AS DECIMAL) < DECIMAL '100.2'
         """
     )
 
-    expected_df = cudf.DataFrame({"b": [82.4, 42, 54.4]})
-    expected_df["b"] = expected_df["b"].astype(cudf.Decimal64Dtype(7, 1))
+    # decimal precision doesn't match up with pandas floats
+    if gpu:
+        result_df["b"] = result_df["b"].astype("float64")
+
+    expected_df = df.loc[df.a < 100.2][["b"]]
 
-    assert_eq(result_df.reset_index(drop=True), expected_df)
+    assert_eq(result_df, expected_df, check_index=False)
     c.drop_table("df")
diff --git a/tests/integration/test_groupby.py b/tests/integration/test_groupby.py
@@ -598,54 +598,60 @@ def test_groupby_split_every(c, gpu):
     c.drop_table("split_every_input")
 
 
-@pytest.mark.gpu
-def test_agg_decimal(c):
-    import cudf
-
-    df = cudf.DataFrame(
+@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)])
+def test_agg_decimal(c, gpu):
+    df = pd.DataFrame(
         {
             "a": [1.23, 12.65, 134.64, -34.3, 945.19],
             "b": [1, 1, 2, 2, 3],
         }
     )
-    df["a"] = df["a"].astype(cudf.Decimal64Dtype(10, 2))
 
-    c.create_table("df", df, gpu=True)
+    c.create_table("df", df, gpu=gpu)
 
     result_df = c.sql(
         """
         SELECT
-            SUM(a) as s,
-            COUNT(a) as c,
-            SUM(a+a) as s2
+            SUM(CAST(a AS DECIMAL)) as s,
+            COUNT(CAST(a AS DECIMAL)) as c,
+            SUM(CAST(a+a AS DECIMAL)) as s2
         FROM
             df
         GROUP BY
             b
         """
     )
+    # decimal precision doesn't match up with pandas floats
+    if gpu:
+        result_df["s"] = result_df["s"].astype("float64")
+        result_df["s2"] = result_df["s2"].astype("float64")
 
-    expected_df = cudf.DataFrame(
+    expected_df = pd.DataFrame(
         {
             "s": df.groupby("b").sum()["a"],
-            "c": df.groupby("b").count()["a"].astype("int64"),
+            "c": df.groupby("b").count()["a"],
             "s2": df.groupby("b").sum()["a"] + df.groupby("b").sum()["a"],
         }
     )
 
-    assert_eq(result_df, expected_df.reset_index(drop=True))
+    # dtype of count aggregation is float on gpu
+    assert_eq(result_df, expected_df, check_index=False, check_dtype=(not gpu))
 
     result_df = c.sql(
         """
         SELECT
-            MIN(a) as min,
-            MAX(a) as max
+            MIN(CAST(a AS DECIMAL)) as min,
+            MAX(CAST(a AS DECIMAL)) as max
         FROM
             df
         """
     )
+    # decimal precision doesn't match up with pandas floats
+    if gpu:
+        result_df["min"] = result_df["min"].astype("float64")
+        result_df["max"] = result_df["max"].astype("float64")
 
-    expected_df = cudf.DataFrame(
+    expected_df = pd.DataFrame(
         {
             "min": [df.a.min()],
             "max": [df.a.max()],