Merged — changes from all 45 commits
All commits authored by charlesbluca.

0bc0bbf  Error on dask warnings (Nov 8, 2023)
7ddeffe  Fix dask shuffle method in conftest.py (Nov 8, 2023)
6a7bf4b  Nullable column normalization in compat testing (Nov 8, 2023)
1ff8a0a  Error on warnings emitted in dask_sql (Nov 8, 2023)
444e723  Fix deprecated use of np.bool8 (Nov 8, 2023)
a0c09e4  Remove deprecated uses of is_datetime64tz_dtype (Nov 8, 2023)
5954a28  Ignore resource warnings emitted by cross-joins (Nov 8, 2023)
adde090  Drop deprecated uses of logger.warn (Nov 8, 2023)
920c5d7  Filter divide by zero warnings on test_coalesce (Nov 8, 2023)
3dfe8cb  Merge branch 'error-dask-sql-warnings' into error-dask-warnings (Nov 8, 2023)
f6ec22b  Commit list of failures to date (Nov 8, 2023)
6ab547b  Resolve sqlite and rex failures (Nov 9, 2023)
3f32241  Ignore int/float warning in test_intersect (Nov 9, 2023)
920c632  Ignore single-machine scheduler warnings (Nov 9, 2023)
ef1ae39  Add meta to apply in test_describe_model (Nov 9, 2023)
27a4fdd  Resolve remaining errors (Nov 9, 2023)
698f1a6  Fix style checks (Nov 9, 2023)
716e3a0  Always use isocalender().week (Nov 9, 2023)
3cca0c1  Merge remote-tracking branch 'upstream/main' into error-dask-warnings (Nov 14, 2023)
26f6886  Merge remote-tracking branch 'upstream/main' into error-dask-warnings (Nov 15, 2023)
c456202  Pin sklearn to <1.4 (Jan 22, 2024)
0b85c0f  Unpin sqlalchemy<2 (Jan 22, 2024)
a8ec65d  Refactor pyhive input/tests for sqlalchemy 2 (Jan 22, 2024)
ba6bee3  Use astype to normalize dtypes in _assert_query_gives_same_result (Jan 22, 2024)
3f96a5c  Refine pd.NA normalization in _assert_query_gives_same_result (Jan 22, 2024)
bf77069  Explicitly compute pandas result in test_join_reorder (Jan 22, 2024)
d3c2e5d  xfail tpot tests, unpin sklearn (Jan 22, 2024)
d0df6c5  Linting (Jan 22, 2024)
4263075  Merge remote-tracking branch 'upstream/main' into error-dask-warnings (Jan 22, 2024)
88d114b  Merge remote-tracking branch 'origin/upstream-failures' into error-da… (Jan 22, 2024)
ea20390  Merge remote-tracking branch 'upstream/main' into error-dask-warnings (Jan 22, 2024)
81925ff  Explicitly select group columns in window groupby-apply (Jan 23, 2024)
d2d78c0  Replace deprecated pandas unit mappings (Jan 23, 2024)
e24542a  Refactor eq_sqlite to avoid fillna downcasting (Jan 23, 2024)
db9493a  Refactor boolean operations to avoid fillna downcasting (Jan 23, 2024)
8661791  Fix resulting failures in test_is_true|false (Jan 23, 2024)
d133cc5  Linting (Jan 23, 2024)
f0e7e14  Switch to pd.to_datetime for pandas 1.4 compat (Jan 23, 2024)
38107e7  Minor fixes to test assertions (Jan 23, 2024)
d09ba6f  Move pytest ini to pyproject, add ignore for remaining emitted Resour… (Jan 23, 2024)
5f0b2da  Remove normalize_dask_result (Jan 23, 2024)
39c3b62  Merge remote-tracking branch 'upstream/main' into error-dask-warnings (Feb 1, 2024)
33c0939  Introduce convert_nullable_columns to handle some mixed nan/NA cases,… (Feb 1, 2024)
ef70727  Linting (Feb 1, 2024)
f93db93  Use dask columns instead of postgres columns in _assert_query_gives_s… (Feb 1, 2024)
conftest.py (2 changes: 1 addition & 1 deletion)

@@ -14,7 +14,7 @@ def pytest_addoption(parser):
 def pytest_runtest_setup(item):
     # TODO: get pyarrow strings and p2p shuffle working
     dask.config.set({"dataframe.convert-string": False})
-    dask.config.set({"dataframe.shuffle.algorithm": "tasks"})
+    dask.config.set({"dataframe.shuffle.method": "tasks"})
     if "gpu" in item.keywords:
         if not item.config.getoption("--rungpu"):
             pytest.skip("need --rungpu option to run")
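A short sketch of what this rename is about, assuming current Dask config semantics: the shuffle configuration key was renamed upstream, which is why conftest.py now sets `dataframe.shuffle.method` instead of the old `...algorithm` key.

```python
import dask

# Set the renamed shuffle config key; "tasks" mirrors the value used above.
dask.config.set({"dataframe.shuffle.method": "tasks"})
print(dask.config.get("dataframe.shuffle.method"))  # tasks
```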
dask_sql/context.py (2 changes: 1 addition & 1 deletion)

@@ -853,7 +853,7 @@ def _get_ral(self, sql):
         except DFOptimizationException as oe:
             # Use original plan and warn about inability to optimize plan
             rel = nonOptimizedRel
-            logger.warn(str(oe))
+            logger.warning(str(oe))
         else:
             rel = nonOptimizedRel
dask_sql/mappings.py (14 changes: 7 additions & 7 deletions)

@@ -37,7 +37,7 @@
     pd.UInt16Dtype(): SqlTypeName.SMALLINT,
     np.uint8: SqlTypeName.TINYINT,
     pd.UInt8Dtype(): SqlTypeName.TINYINT,
-    np.bool8: SqlTypeName.BOOLEAN,
+    np.bool_: SqlTypeName.BOOLEAN,
     pd.BooleanDtype(): SqlTypeName.BOOLEAN,
     str: SqlTypeName.VARCHAR,
     np.object_: SqlTypeName.VARCHAR,
@@ -55,7 +55,7 @@
     "SqlTypeName.INTEGER": np.int32,
     "SqlTypeName.SMALLINT": np.int16,
     "SqlTypeName.TINYINT": np.int8,
-    "SqlTypeName.BOOLEAN": np.bool8,
+    "SqlTypeName.BOOLEAN": np.bool_,
     "SqlTypeName.VARCHAR": str,
     "SqlTypeName.CHAR": str,
     "SqlTypeName.NULL": type(None),
@@ -100,7 +100,7 @@ def python_to_sql_type(python_type) -> "DaskTypeMap":
     if isinstance(python_type, np.dtype):
         python_type = python_type.type

-    if pd.api.types.is_datetime64tz_dtype(python_type):
+    if isinstance(python_type, pd.DatetimeTZDtype):
         return DaskTypeMap(
             SqlTypeName.TIMESTAMP_WITH_LOCAL_TIME_ZONE,
             unit=str(python_type.unit),
@@ -277,8 +277,8 @@ def similar_type(lhs: type, rhs: type) -> bool:
     is_object = pdt.is_object_dtype
     is_string = pdt.is_string_dtype
     is_dt_ns = pdt.is_datetime64_ns_dtype
-    is_dt_tz = lambda t: is_dt_ns(t) and pdt.is_datetime64tz_dtype(t)
-    is_dt_ntz = lambda t: is_dt_ns(t) and not pdt.is_datetime64tz_dtype(t)
+    is_dt_tz = lambda t: is_dt_ns(t) and isinstance(t, pd.DatetimeTZDtype)
+    is_dt_ntz = lambda t: is_dt_ns(t) and not isinstance(t, pd.DatetimeTZDtype)
     is_td_ns = pdt.is_timedelta64_ns_dtype
     is_bool = pdt.is_bool_dtype

@@ -334,8 +334,8 @@ def cast_column_to_type(col: dd.Series, expected_type: str):
     pdt = pd.api.types

     is_dt_ns = pdt.is_datetime64_ns_dtype
-    is_dt_tz = lambda t: is_dt_ns(t) and pdt.is_datetime64tz_dtype(t)
-    is_dt_ntz = lambda t: is_dt_ns(t) and not pdt.is_datetime64tz_dtype(t)
+    is_dt_tz = lambda t: is_dt_ns(t) and isinstance(t, pd.DatetimeTZDtype)
+    is_dt_ntz = lambda t: is_dt_ns(t) and not isinstance(t, pd.DatetimeTZDtype)

     current_type = col.dtype
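For orientation, a minimal sketch of the replacement checks this diff adopts (illustrative data, not from dask_sql): `pd.api.types.is_datetime64tz_dtype` is deprecated in pandas 2.x in favor of an `isinstance` check against `pd.DatetimeTZDtype`, and `np.bool8` was an alias of `np.bool_` that newer NumPy deprecated and later removed.

```python
import numpy as np
import pandas as pd

tz_aware = pd.Series(pd.to_datetime(["2024-01-01"]).tz_localize("UTC"))

# The isinstance check is the documented replacement for is_datetime64tz_dtype:
print(isinstance(tz_aware.dtype, pd.DatetimeTZDtype))               # True
print(isinstance(np.dtype("datetime64[ns]"), pd.DatetimeTZDtype))   # False

# np.bool_ is the canonical NumPy boolean scalar type:
print(np.bool_ is np.dtype(bool).type)  # True
```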
dask_sql/physical/rel/logical/window.py (4 changes: 3 additions & 1 deletion)

@@ -341,7 +341,9 @@ def _apply_window(
     # TODO: That is a bit of a hack. We should really use the real column dtype
     meta = df._meta.assign(**{col: 0.0 for col in newly_created_columns})

-    df = df.groupby(group_columns, dropna=False).apply(filled_map, meta=meta)
+    df = df.groupby(group_columns, dropna=False)[df.columns.tolist()].apply(
+        filled_map, meta=meta
+    )
     logger.debug(
         f"Having created a dataframe {LoggableDataFrame(df)} after windowing. Will now drop {temporary_columns}."
     )
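A minimal sketch of the pattern adopted above, on an assumed-illustrative frame: pandas 2.2 deprecates `groupby(...).apply()` implicitly operating on the grouping columns, and selecting the full column list explicitly keeps the old behavior without the warning.

```python
import pandas as pd

df = pd.DataFrame({"g": [1, 1, 2], "x": [1.0, 2.0, 3.0]})

# Explicit column selection (including the group column) silences the
# pandas >= 2.2 DeprecationWarning about apply operating on grouping columns:
result = df.groupby("g", dropna=False)[df.columns.tolist()].apply(
    lambda part: part.assign(y=part.x * 2)
)
print(result)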
dask_sql/physical/rex/core/call.py (16 changes: 8 additions & 8 deletions)

@@ -16,7 +16,7 @@
 from dask.highlevelgraph import HighLevelGraph
 from dask.utils import random_state_data

-from dask_sql._compat import DASK_CUDF_TODATETIME_SUPPORT, PANDAS_GT_200
+from dask_sql._compat import DASK_CUDF_TODATETIME_SUPPORT
 from dask_sql._datafusion_lib import SqlTypeName
 from dask_sql.datacontainer import DataContainer
 from dask_sql.mappings import (
@@ -311,7 +311,7 @@ def false_(
     Returns false on nan.
     """
     if is_frame(df):
-        return ~df.fillna(True)
+        return ~df.astype("boolean").fillna(True)

     return not pd.isna(df) and df is not None and not np.isnan(df) and not bool(df)
@@ -331,7 +331,7 @@ def true_(
     Returns false on nan.
     """
     if is_frame(df):
-        return df.fillna(False)
+        return df.astype("boolean").fillna(False)

     return not pd.isna(df) and df is not None and not np.isnan(df) and bool(df)
@@ -794,11 +794,11 @@ def _round_datetime(self, *operands):

     unit_map = {
         "DAY": "D",
-        "HOUR": "H",
-        "MINUTE": "T",
-        "SECOND": "S",
+        "HOUR": "h",
+        "MINUTE": "min",
+        "SECOND": "s",
         "MICROSECOND": "U",
-        "MILLISECOND": "L",
+        "MILLISECOND": "ms",
     }

     try:
@@ -960,7 +960,7 @@ def date_part(self, what, df: SeriesOrScalar):
     elif what in {"SECOND", "SECONDS"}:
         return df.second
     elif what in {"WEEK", "WEEKS"}:
-        return df.isocalendar().week if PANDAS_GT_200 else df.week
+        return df.isocalendar().week
     elif what in {"YEAR", "YEARS"}:
         return df.year
     elif what == "DATE":
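A sketch of the two fixes here, on an illustrative series: `fillna` on an object-dtype boolean column trips pandas' downcasting deprecation once warnings are errors, so casting to the nullable `BooleanDtype` first makes the IS TRUE / IS FALSE semantics well-defined; and the `unit_map` change tracks the pandas 2.2 frequency-alias renames ("H" → "h", "T" → "min", "S" → "s", "L" → "ms").

```python
import pandas as pd

s = pd.Series([True, None, False])  # object dtype with a missing value

print(s.astype("boolean").fillna(False).tolist())    # IS TRUE:  [True, False, False]
print((~s.astype("boolean").fillna(True)).tolist())  # IS FALSE: [False, False, True]

# The new lowercase aliases work with rounding, e.g. SECOND -> "s", MINUTE -> "min":
print(pd.Timestamp("2024-01-23 10:31:05").round("min"))  # 2024-01-23 10:31:00
```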
dask_sql/server/presto_jdbc.py (2 changes: 1 addition & 1 deletion)

@@ -25,7 +25,7 @@ def create_meta_data(c: Context):
     """

     if c is None:
-        logger.warn("Context None: jdbc meta data not created")
+        logger.warning("Context None: jdbc meta data not created")
         return
     catalog = ""
     system_schema = "system_jdbc"
docs/source/sql.rst (2 changes: 1 addition & 1 deletion)

@@ -112,7 +112,7 @@ For this, it uses the following mapping:
 +-----------------------+----------------+
 | From Python Type      | To SQL Type    |
 +=======================+================+
-| ``np.bool8``          | ``BOOLEAN``    |
+| ``np.bool_``          | ``BOOLEAN``    |
 +-----------------------+----------------+
 | ``np.datetime64``     | ``TIMESTAMP``  |
 +-----------------------+----------------+
pyproject.toml (15 changes: 15 additions & 0 deletions)

@@ -89,3 +89,18 @@ locked = true

 [tool.isort]
 profile = "black"
+
+[tool.pytest.ini_options]
+markers = [
+    "gpu: marks tests that require GPUs (skipped by default, run with --rungpu)",
+    "queries: marks tests that run test queries (skipped by default, run with --runqueries)",
+]
+addopts = "-v -rsxfE --color=yes --cov dask_sql --cov-config=.coveragerc --cov-report=term-missing"
+filterwarnings = [
+    "error:::dask_sql[.*]",
+    "error:::dask[.*]",
+    "ignore:Need to do a cross-join:ResourceWarning:dask_sql[.*]",
+    "ignore:Dask doesn't support Dask frames:ResourceWarning:dask_sql[.*]",
+    "ignore:Running on a single-machine scheduler:UserWarning:dask[.*]",
+]
+xfail_strict = true
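Each `filterwarnings` entry follows pytest's `action:message:category:module` form, so `"error:::dask[.*]"` escalates any warning originating from a dask module to an error, while the `"ignore:..."` entries carve out known-noisy warnings. A rough reproduction of the escalation outside pytest, using only the standard library:

```python
import warnings

with warnings.catch_warnings():
    warnings.filterwarnings("error", module=r"dask[.*]")
    try:
        # Stand-in for a warning emitted from inside a dask module:
        warnings.warn_explicit("example", UserWarning, "dask/base.py", 1, module="dask.base")
    except UserWarning:
        print("dask warning escalated to an error")
```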
pytest.ini (11 changes: 0 additions & 11 deletions)

This file was deleted.
tests/integration/fixtures.py (21 changes: 9 additions & 12 deletions)

@@ -8,7 +8,7 @@
 from dask.datasets import timeseries as dd_timeseries
 from dask.distributed import Client

-from tests.utils import assert_eq
+from tests.utils import assert_eq, convert_nullable_columns

 try:
     import cudf
@@ -333,20 +333,17 @@ def _assert_query_gives_same_result(query, sort_columns=None, **kwargs):

     # allow that the names are different
     # as expressions are handled differently
-    dask_result.columns = sql_result.columns
+    sql_result.columns = dask_result.columns

-    # replace all pd.NA scalars, which are resistent to
-    # check_dype=False and .astype()
-    dask_result = dask_result.replace({pd.NA: None})
+    sql_result = sql_result.convert_dtypes()
+    dask_result = dask_result.convert_dtypes()

     if sort_columns:
         sql_result = sql_result.sort_values(sort_columns)
         dask_result = dask_result.sort_values(sort_columns)
+    convert_nullable_columns(sql_result)
+    convert_nullable_columns(dask_result)

-    sql_result = sql_result.reset_index(drop=True)
-    dask_result = dask_result.reset_index(drop=True)
-
-    assert_eq(sql_result, dask_result, check_dtype=False, **kwargs)
+    assert_eq(
+        sql_result, dask_result, check_dtype=False, check_index=False, **kwargs
+    )

     return _assert_query_gives_same_result
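The `convert_nullable_columns` helper lives in `tests/utils.py` and its body is not part of this diff. A purely hypothetical sketch of what such a helper plausibly does, for orientation only (the real implementation may differ):

```python
import pandas as pd

def convert_nullable_columns(df: pd.DataFrame) -> None:
    """Hypothetical sketch: coerce pandas extension (nullable) dtypes to a
    common representation so mixed NaN/pd.NA frames compare equal."""
    for col in df.columns:
        if pd.api.types.is_extension_array_dtype(df[col].dtype):
            # Cast through object so pd.NA and np.nan normalize consistently;
            # mutates in place, matching how the fixture calls it.
            df[col] = df[col].astype(object).where(df[col].notna(), None)
```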
tests/integration/test_compatibility.py (38 changes: 14 additions & 24 deletions)

@@ -19,41 +19,32 @@

 from dask_sql import Context
 from dask_sql.utils import ParsingException
-from tests.utils import assert_eq
+from tests.utils import assert_eq, convert_nullable_columns


-def cast_datetime_to_string(df):
-    cols = df.select_dtypes(include=["datetime64[ns]"]).columns.tolist()
-
-    if not cols:
-        return df
-
-    for col in cols:
-        df[col] = df[col].dt.strftime("%Y-%m-%d %H:%M:%S")
-
-    return df
-
-
-def eq_sqlite(sql, check_index=True, **dfs):
+def eq_sqlite(sql, **dfs):
     c = Context()
     engine = sqlite3.connect(":memory:")

     for name, df in dfs.items():
         c.create_table(name, df)
         df.to_sql(name, engine, index=False)

-    dask_result = c.sql(sql).reset_index(drop=True)
-    sqlite_result = pd.read_sql(sql, engine).reset_index(drop=True)
+    dask_result = c.sql(sql).compute().convert_dtypes()
+    sqlite_result = pd.read_sql(sql, engine).convert_dtypes()

+    convert_nullable_columns(dask_result)
+    convert_nullable_columns(sqlite_result)

-    # casting to object to ensure equality with sql-lite
-    # which returns object dtype for datetime inputs
-    dask_result = cast_datetime_to_string(dask_result)
+    datetime_cols = dask_result.select_dtypes(
+        include=["datetime64[ns]"]
+    ).columns.tolist()
+    for col in datetime_cols:
+        sqlite_result[col] = pd.to_datetime(sqlite_result[col])

-    # Make sure SQL and Dask use the same "NULL" value
-    dask_result = dask_result.fillna(np.NaN)
-    sqlite_result = sqlite_result.fillna(np.NaN)
-    sqlite_result = sqlite_result.astype(dask_result.dtypes)

-    assert_eq(dask_result, sqlite_result, check_dtype=False, check_index=check_index)
+    assert_eq(dask_result, sqlite_result, check_dtype=False, check_index=False)


 def make_rand_df(size: int, **kwargs):
@@ -953,7 +944,6 @@ def test_union():
         UNION ALL SELECT * FROM c
         ORDER BY b NULLS FIRST, c NULLS FIRST
         """,
-        check_index=False,
         a=a,
         b=b,
         c=c,
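An illustrative sketch of the two normalizations `eq_sqlite` now relies on, with assumed example values: SQLite hands datetimes back as strings, so the diff parses the SQLite column instead of stringifying the Dask column as before, and `convert_dtypes()` moves both frames onto pandas' nullable dtypes so missing values are represented consistently.

```python
import pandas as pd

# SQLite-style string datetimes parsed back into datetime64[ns] (None -> NaT):
sqlite_col = pd.Series(["2024-01-23 10:31:05", None])
print(pd.to_datetime(sqlite_col))

# convert_dtypes() picks the best nullable dtype, so a float column of whole
# numbers with a missing value becomes Int64 with pd.NA:
print(pd.Series([1.0, None]).convert_dtypes().dtype)  # Int64
```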
tests/integration/test_groupby.py (4 changes: 2 additions & 2 deletions)

@@ -171,7 +171,7 @@ def test_group_by_case(c):
     )


-def test_group_by_nan(c):
+def test_group_by_nan(c, user_table_nan):
     return_df = c.sql(
         """
         SELECT
@@ -180,7 +180,7 @@ def test_group_by_nan(c, user_table_nan):
             c
         FROM user_table_nan
         GROUP BY c
         """
     )
-    expected_df = pd.DataFrame({"c": [3, float("nan"), 1]})
+    expected_df = user_table_nan.drop_duplicates(subset=["c"])

     # we return nullable int dtype instead of float
     assert_eq(return_df, expected_df, check_dtype=False)
tests/integration/test_join.py (3 changes: 3 additions & 0 deletions)

@@ -364,6 +364,9 @@ def test_conditional_join_with_limit(c):
     assert_eq(actual_df, expected_df, check_index=False)


+@pytest.mark.filterwarnings(
+    "ignore:You are merging on int and float:UserWarning:dask.dataframe.multi"
+)
 def test_intersect(c):

     # Join df_simple against itself
tests/integration/test_model.py (4 changes: 3 additions & 1 deletion)

@@ -502,7 +502,9 @@ def test_describe_model(c):
         .sort_index()
     )
     # test
-    result = c.sql("DESCRIBE MODEL ex_describe_model")["Params"].apply(lambda x: str(x))
+    result = c.sql("DESCRIBE MODEL ex_describe_model")["Params"].apply(
+        lambda x: str(x), meta=("Params", "object")
+    )

     assert_eq(expected_series, result)
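A sketch of why the `meta=` argument matters here, on assumed-illustrative data: without explicit meta, Dask infers the output schema by running the function on a dummy partition and emits a "you did not provide metadata" UserWarning, which this PR's warning filters would turn into an error.

```python
import dask.dataframe as dd
import pandas as pd

params = pd.Series([{"fit_intercept": True}, {"solver": "lbfgs"}], name="Params")
ddf = dd.from_pandas(params, npartitions=1)

# Passing meta tells Dask the output is an object-dtype series named "Params",
# so no schema-inference warning is emitted:
result = ddf.apply(lambda x: str(x), meta=("Params", "object"))
print(result.compute())
```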