diff --git a/continuous_integration/environment-3.10-dev.yaml b/continuous_integration/environment-3.10-dev.yaml
index 6a3a37f0b..a867996d1 100644
--- a/continuous_integration/environment-3.10-dev.yaml
+++ b/continuous_integration/environment-3.10-dev.yaml
@@ -15,7 +15,7 @@ dependencies:
 - mlflow
 - mock
 - numpy>=1.21.6
-- pandas>=1.4.0,<2.0.0
+- pandas>=1.4.0
 - pre-commit
 - prompt_toolkit>=3.0.8
 - psycopg2
diff --git a/continuous_integration/environment-3.9-dev.yaml b/continuous_integration/environment-3.9-dev.yaml
index d7b4700f8..7424529d6 100644
--- a/continuous_integration/environment-3.9-dev.yaml
+++ b/continuous_integration/environment-3.9-dev.yaml
@@ -15,7 +15,7 @@ dependencies:
 - mlflow
 - mock
 - numpy>=1.21.6
-- pandas>=1.4.0,<2.0.0
+- pandas>=1.4.0
 - pre-commit
 - prompt_toolkit>=3.0.8
 - psycopg2
diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml
index f1b76271a..2467e144a 100644
--- a/continuous_integration/gpuci/environment-3.10.yaml
+++ b/continuous_integration/gpuci/environment-3.10.yaml
@@ -18,7 +18,7 @@ dependencies:
 - mlflow
 - mock
 - numpy>=1.21.6
-- pandas>=1.4.0,<2.0.0
+- pandas>=1.4.0
 - pre-commit
 - prompt_toolkit>=3.0.8
 - psycopg2
diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml
index 2ee59f6cb..917892f24 100644
--- a/continuous_integration/gpuci/environment-3.9.yaml
+++ b/continuous_integration/gpuci/environment-3.9.yaml
@@ -18,7 +18,7 @@ dependencies:
 - mlflow
 - mock
 - numpy>=1.21.6
-- pandas>=1.4.0,<2.0.0
+- pandas>=1.4.0
 - pre-commit
 - prompt_toolkit>=3.0.8
 - psycopg2
diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml
index f2ad46249..5152cfc4e 100644
--- a/continuous_integration/recipe/meta.yaml
+++ b/continuous_integration/recipe/meta.yaml
@@ -35,7 +35,7 @@ requirements:
   run:
     - python
     - dask >=2022.3.0
-    - pandas >=1.4.0,<2.0.0
+    - pandas >=1.4.0
     # FIXME: handling is needed for httpx-based fastapi>=0.87.0
     - fastapi >=0.69.0,<0.87.0
     - uvicorn >=0.13.4
diff --git a/dask_sql/_compat.py b/dask_sql/_compat.py
index f76982a4c..be8cfbae5 100644
--- a/dask_sql/_compat.py
+++ b/dask_sql/_compat.py
@@ -8,6 +8,7 @@
 _dask_version = parseVersion(dask.__version__)

 INDEXER_WINDOW_STEP_IMPLEMENTED = _pandas_version >= parseVersion("1.5.0")
+PANDAS_GT_200 = _pandas_version >= parseVersion("2.0.0")

 # TODO: remove if prompt-toolkit min version gets bumped
 PIPE_INPUT_CONTEXT_MANAGER = _prompt_toolkit_version >= parseVersion("3.0.29")
diff --git a/dask_sql/mappings.py b/dask_sql/mappings.py
index 2b66c0504..9ba22f797 100644
--- a/dask_sql/mappings.py
+++ b/dask_sql/mappings.py
@@ -331,13 +331,19 @@ def cast_column_type(

 def cast_column_to_type(col: dd.Series, expected_type: str):
     """Cast the given column to the expected type"""
+    pdt = pd.api.types
+
+    is_dt_ns = pdt.is_datetime64_ns_dtype
+    is_dt_tz = lambda t: is_dt_ns(t) and pdt.is_datetime64tz_dtype(t)
+    is_dt_ntz = lambda t: is_dt_ns(t) and not pdt.is_datetime64tz_dtype(t)
+
     current_type = col.dtype

     if similar_type(current_type, expected_type):
         logger.debug("...not converting.")
         return None

-    if pd.api.types.is_integer_dtype(expected_type):
+    if pdt.is_integer_dtype(expected_type):
         if pd.api.types.is_float_dtype(current_type):
             logger.debug("...truncating...")
             # Currently "trunc" can not be applied to NA (the pandas missing value type),
@@ -345,10 +351,14 @@ def cast_column_to_type(col: dd.Series, expected_type: str):
             # For our use case, that does not matter, as the conversion to integer later
             # will convert both NA and np.NaN to NA.
             col = da.trunc(col.fillna(value=np.NaN))
-        elif pd.api.types.is_timedelta64_dtype(current_type):
+        elif pdt.is_timedelta64_dtype(current_type):
             logger.debug(f"Explicitly casting from {current_type} to np.int64")
             return col.astype(np.int64)

+    if is_dt_tz(current_type) and is_dt_ntz(expected_type):
+        # casting from timezone-aware to timezone-naive datatypes with astype is deprecated in pandas 2
+        return col.dt.tz_localize(None)
+
     logger.debug(f"Need to cast from {current_type} to {expected_type}")
     return col.astype(expected_type)
diff --git a/dask_sql/physical/rel/custom/analyze_table.py b/dask_sql/physical/rel/custom/analyze_table.py
index 14011ccef..69f734a54 100644
--- a/dask_sql/physical/rel/custom/analyze_table.py
+++ b/dask_sql/physical/rel/custom/analyze_table.py
@@ -47,26 +47,22 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai
         df = dc.df

         # Calculate statistics
-        statistics = dd.from_pandas(
-            pd.DataFrame({col: [] for col in columns}), npartitions=1
-        )
-        statistics = statistics.append(df[[mapping(col) for col in columns]].describe())
-
-        # Add additional information
-        statistics = statistics.append(
-            pd.Series(
-                {
-                    col: str(python_to_sql_type(df[mapping(col)].dtype)).lower()
-                    for col in columns
-                },
-                name="data_type",
-            )
-        )
-        statistics = statistics.append(
-            pd.Series(
-                {col: col for col in columns},
-                name="col_name",
-            )
+        statistics = dd.concat(
+            [
+                df[[mapping(col) for col in columns]].describe(),
+                pd.DataFrame(
+                    {
+                        mapping(col): str(
+                            python_to_sql_type(df[mapping(col)].dtype)
+                        ).lower()
+                        for col in columns
+                    },
+                    index=["data_type"],
+                ),
+                pd.DataFrame(
+                    {mapping(col): col for col in columns}, index=["col_name"]
+                ),
+            ]
         )

         cc = ColumnContainer(statistics.columns)
diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py
index db604da3a..712173704 100644
--- a/dask_sql/physical/rex/core/call.py
+++ b/dask_sql/physical/rex/core/call.py
@@ -15,7 +15,7 @@
 from dask.utils import random_state_data

 from dask_planner.rust import SqlTypeName
-from dask_sql._compat import DASK_CUDF_TODATETIME_SUPPORT
+from dask_sql._compat import DASK_CUDF_TODATETIME_SUPPORT, PANDAS_GT_200
 from dask_sql.datacontainer import DataContainer
 from dask_sql.mappings import (
     cast_column_to_type,
@@ -927,7 +927,7 @@ def date_part(self, what, df: SeriesOrScalar):
         elif what in {"SECOND", "SECONDS"}:
             return df.second
         elif what in {"WEEK", "WEEKS"}:
-            return df.week
+            return df.isocalendar().week if PANDAS_GT_200 else df.week
         elif what in {"YEAR", "YEARS"}:
             return df.year
         elif what == "DATE":
diff --git a/dask_sql/server/presto_jdbc.py b/dask_sql/server/presto_jdbc.py
index d3c3880cb..02f77a1b4 100644
--- a/dask_sql/server/presto_jdbc.py
+++ b/dask_sql/server/presto_jdbc.py
@@ -37,15 +37,15 @@ def create_meta_data(c: Context):
     # catalogs = pd.DataFrame().append(create_catalog_row(catalog), ignore_index=True)
     # c.create_table("catalogs", catalogs, schema_name=system_schema)

-    schemas = pd.DataFrame().append(create_schema_row(), ignore_index=True)
+    schemas = pd.DataFrame(create_schema_row(), index=[0])
     c.create_table("schemas", schemas, schema_name=system_schema)
     schema_rows = []

-    tables = pd.DataFrame().append(create_table_row(), ignore_index=True)
+    tables = pd.DataFrame(create_table_row(), index=[0])
     c.create_table("tables", tables, schema_name=system_schema)
     table_rows = []

-    columns = pd.DataFrame().append(create_column_row(), ignore_index=True)
+    columns = pd.DataFrame(create_column_row(), index=[0])
     c.create_table("columns", columns, schema_name=system_schema)
     column_rows = []
diff --git a/docker/conda.txt b/docker/conda.txt
index 0c57cf45d..d24d217aa 100644
--- a/docker/conda.txt
+++ b/docker/conda.txt
@@ -1,6 +1,6 @@
 python>=3.8
 dask>=2022.3.0
-pandas>=1.4.0,<2.0.0
+pandas>=1.4.0
 jpype1>=1.0.2
 openjdk>=8
 maven>=3.6.0
diff --git a/docker/main.dockerfile b/docker/main.dockerfile
index 09d07834f..da965a53c 100644
--- a/docker/main.dockerfile
+++ b/docker/main.dockerfile
@@ -17,7 +17,7 @@ RUN mamba install -y \
     "setuptools-rust>=1.5.2" \
     # core dependencies
     "dask>=2022.3.0" \
-    "pandas>=1.4.0,<2.0.0" \
+    "pandas>=1.4.0" \
     # FIXME: handling is needed for httpx-based fastapi>=0.87.0
     "fastapi>=0.69.0,<0.87.0" \
     "uvicorn>=0.13.4" \
diff --git a/docs/environment.yml b/docs/environment.yml
index ae25c9440..96a727465 100644
--- a/docs/environment.yml
+++ b/docs/environment.yml
@@ -8,7 +8,7 @@ dependencies:
   - sphinx-tabs
   - dask-sphinx-theme>=2.0.3
   - dask>=2022.3.0
-  - pandas>=1.4.0,<2.0.0
+  - pandas>=1.4.0
   - fugue>=0.7.3
   # FIXME: handling is needed for httpx-based fastapi>=0.87.0
   - fastapi>=0.69.0,<0.87.0
diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index 374528d89..c9d8c6b0e 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -2,7 +2,7 @@ sphinx>=4.0.0
 sphinx-tabs
 dask-sphinx-theme>=3.0.0
 dask>=2022.3.0
-pandas>=1.4.0,<2.0.0
+pandas>=1.4.0
 fugue>=0.7.3
 # FIXME: handling is needed for httpx-based fastapi>=0.87.0
 fastapi>=0.69.0,<0.87.0
diff --git a/setup.py b/setup.py
index 085d362fe..d149ac5f0 100644
--- a/setup.py
+++ b/setup.py
@@ -44,7 +44,7 @@
     install_requires=[
         "dask[dataframe]>=2022.3.0",
         "distributed>=2022.3.0",
-        "pandas>=1.4.0,<2.0.0",
+        "pandas>=1.4.0",
         # FIXME: handling is needed for httpx-based fastapi>=0.87.0
         "fastapi>=0.69.0,<0.87.0",
         "uvicorn>=0.13.4",
diff --git a/tests/integration/fixtures.py b/tests/integration/fixtures.py
index 65fc3b156..90b6f3828 100644
--- a/tests/integration/fixtures.py
+++ b/tests/integration/fixtures.py
@@ -169,12 +169,21 @@ def gpu_string_table(string_table):

 @pytest.fixture()
 def gpu_datetime_table(datetime_table):
-    # cudf doesn't have support for timezoned datetime data
-    datetime_table["timezone"] = datetime_table["timezone"].astype("datetime64[ns]")
-    datetime_table["utc_timezone"] = datetime_table["utc_timezone"].astype(
-        "datetime64[ns]"
-    )
-    return cudf.from_pandas(datetime_table) if cudf else None
+    if cudf:
+        # TODO: remove once `from_pandas` has support for timezone-aware data
+        # https://github.com/rapidsai/cudf/issues/13611
+        df = datetime_table.copy()
+        df["timezone"] = df["timezone"].dt.tz_localize(None)
+        df["utc_timezone"] = df["utc_timezone"].dt.tz_localize(None)
+        gdf = cudf.from_pandas(df)
+        gdf["timezone"] = gdf["timezone"].dt.tz_localize(
+            str(datetime_table["timezone"].dt.tz)
+        )
+        gdf["utc_timezone"] = gdf["utc_timezone"].dt.tz_localize(
+            str(datetime_table["utc_timezone"].dt.tz)
+        )
+        return gdf
+    return None


 @pytest.fixture()
diff --git a/tests/integration/test_analyze.py b/tests/integration/test_analyze.py
index 8371476c1..a7ccf65b6 100644
--- a/tests/integration/test_analyze.py
+++ b/tests/integration/test_analyze.py
@@ -1,3 +1,4 @@
+import dask.dataframe as dd
 import pandas as pd

 from dask_sql.mappings import python_to_sql_type
@@ -8,24 +9,21 @@ def test_analyze(c, df):
     result_df = c.sql("ANALYZE TABLE df COMPUTE STATISTICS FOR ALL COLUMNS")

     # extract table and compute stats with Dask manually
-    expected_df = (
-        c.sql("SELECT * FROM df")
-        .describe()
-        .append(
-            pd.Series(
+    expected_df = dd.concat(
+        [
+            c.sql("SELECT * FROM df").describe(),
+            pd.DataFrame(
                 {
                     col: str(python_to_sql_type(df[col].dtype)).lower()
                     for col in df.columns
                 },
-                name="data_type",
-            )
-        )
-        .append(
-            pd.Series(
+                index=["data_type"],
+            ),
+            pd.DataFrame(
                 {col: col for col in df.columns},
-                name="col_name",
-            )
-        )
+                index=["col_name"],
+            ),
+        ]
     )

     assert_eq(result_df, expected_df)
diff --git a/tests/integration/test_filter.py b/tests/integration/test_filter.py
index 9de072d5e..cede43185 100644
--- a/tests/integration/test_filter.py
+++ b/tests/integration/test_filter.py
@@ -92,9 +92,8 @@ def test_filter_cast_date(c, input_table, request):
         CAST(timezone AS DATE) > DATE '2014-08-01'
         """
     )
-    expected_df = datetime_table[
-        datetime_table["timezone"].astype(" pd.Timestamp("2014-08-01")
     ]

     assert_eq(return_df, expected_df)
@@ -110,6 +109,9 @@ def test_filter_cast_date(c, input_table, request):
         ),
     ],
 )
+@pytest.mark.xfail(
+    reason="Need support for non-UTC timezoned literals, see https://github.com/dask-contrib/dask-sql/issues/1193"
+)
 def test_filter_cast_timestamp(c, input_table, request):
     datetime_table = request.getfixturevalue(input_table)
     return_df = c.sql(
diff --git a/tests/integration/test_jdbc.py b/tests/integration/test_jdbc.py
index c4634311f..aa39737ad 100644
--- a/tests/integration/test_jdbc.py
+++ b/tests/integration/test_jdbc.py
@@ -19,8 +19,7 @@ def c():
     c = Context()
     c.create_schema(schema)

-    row = create_table_row()
-    tables = pd.DataFrame().append(row, ignore_index=True)
+    tables = pd.DataFrame(create_table_row(), index=[0])
     tables = tables.astype({"AN_INT": "int64"})
     c.create_table(table, tables, schema_name=schema)

diff --git a/tests/integration/test_join.py b/tests/integration/test_join.py
index 3b131541c..fedabcb6e 100644
--- a/tests/integration/test_join.py
+++ b/tests/integration/test_join.py
@@ -119,7 +119,7 @@ def test_join_cross(c, user_table_1, department_table):

     user_table_1["key"] = 1
     department_table["key"] = 1
-    expected_df = dd.merge(user_table_1, department_table, on="key").drop("key", 1)
+    expected_df = dd.merge(user_table_1, department_table, on="key").drop(columns="key")

     assert_eq(return_df, expected_df, check_index=False)

diff --git a/tests/integration/test_select.py b/tests/integration/test_select.py
index 92ca6b53d..9c4331d77 100644
--- a/tests/integration/test_select.py
+++ b/tests/integration/test_select.py
@@ -4,6 +4,7 @@
 from dask.dataframe.optimize import optimize_dataframe_getitem
 from dask.utils_test import hlg_layer

+from dask_sql._compat import PANDAS_GT_200
 from dask_sql.utils import ParsingException
 from tests.utils import assert_eq

@@ -33,7 +34,10 @@ def test_select_column(c, df):
 def test_select_different_types(c):
     expected_df = pd.DataFrame(
         {
-            "date": pd.to_datetime(["2022-01-21 17:34", "2022-01-21", "17:34", pd.NaT]),
+            "date": pd.to_datetime(
+                ["2022-01-21 17:34", "2022-01-21", "17:34", pd.NaT],
+                format="mixed" if PANDAS_GT_200 else None,
+            ),
             "string": ["this is a test", "another test", "äölüć", ""],
             "integer": [1, 2, -4, 5],
             "float": [-1.1, np.NaN, pd.NA, np.sqrt(2)],
@@ -163,13 +167,13 @@ def test_date_casting(c, input_table, request):
     expected_df = datetime_table
     expected_df["timezone"] = (
         expected_df["timezone"].astype("