From 5d16103de4eacba6c92f7f0489744b7015a30a8a Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Sun, 3 Dec 2023 21:11:36 +1300 Subject: [PATCH 1/6] Convert pyarrow date32/date64 dtypes to np.datetime64 Handle date columns in pandas.DataFrame with pyarrow dtypes like date32[day][pyarrow] or date64[ms][pyarrow] by modifying the vectors_to_arrays conversion function. Added some parametrized unit tests to test_info.py to ensure this works. --- pygmt/clib/conversion.py | 12 +++++++++++- pygmt/tests/test_info.py | 15 ++++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/pygmt/clib/conversion.py b/pygmt/clib/conversion.py index 04264739e8f..38ec245ef63 100644 --- a/pygmt/clib/conversion.py +++ b/pygmt/clib/conversion.py @@ -166,7 +166,17 @@ def vectors_to_arrays(vectors): >>> all(isinstance(i, np.ndarray) for i in vectors_to_arrays(data)) True """ - arrays = [as_c_contiguous(np.asarray(i)) for i in vectors] + arrays = [] + for vector in vectors: + vec_dtype = str(vector.dtype) + if "[pyarrow]" in vec_dtype: # handle pyarrow date32/date64 dtypes + array = vector.to_numpy( + dtype=np.datetime64 if "date" in vec_dtype else None + ) + else: + array = np.asarray(vector) + arrays.append(array) + return arrays diff --git a/pygmt/tests/test_info.py b/pygmt/tests/test_info.py index 999965417a3..81aca18c38d 100644 --- a/pygmt/tests/test_info.py +++ b/pygmt/tests/test_info.py @@ -8,6 +8,7 @@ import numpy as np import numpy.testing as npt import pandas as pd +import pandas.util._test_decorators as td import pytest import xarray as xr from pygmt import info @@ -107,14 +108,22 @@ def test_info_numpy_array_time_column(): assert output == expected_output -def test_info_pandas_dataframe_time_column(): +@pytest.mark.parametrize( + "dtype", + [ + "datetime64[ns]", + pytest.param("date32[day][pyarrow]", marks=td.skip_if_no(package="pyarrow")), + pytest.param("date64[ms][pyarrow]", marks=td.skip_if_no(package="pyarrow")), + ], +) 
+def test_info_pandas_dataframe_date_column(dtype): """ - Make sure info works on pandas.DataFrame inputs with a time column. + Make sure info works on pandas.DataFrame inputs with a date column. """ table = pd.DataFrame( data={ "z": [10, 13, 12, 15, 14], - "time": pd.date_range(start="2020-01-01", periods=5), + "date": pd.date_range(start="2020-01-01", periods=5).astype(dtype=dtype), } ) output = info(data=table) From 19a14d8422be19d533ed3e8cab59291e1c4b75ee Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Mon, 4 Dec 2023 09:02:20 +1300 Subject: [PATCH 2/6] Handle Python lists without dtype attr and use as_c_contiguous Need to handle Python lists that don't have the dtype attribute, unlike pandas.Series objects. Also ensure that we return a C-contiguous array. --- pygmt/clib/conversion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pygmt/clib/conversion.py b/pygmt/clib/conversion.py index 38ec245ef63..7cd3d357ce6 100644 --- a/pygmt/clib/conversion.py +++ b/pygmt/clib/conversion.py @@ -168,14 +168,14 @@ def vectors_to_arrays(vectors): """ arrays = [] for vector in vectors: - vec_dtype = str(vector.dtype) + vec_dtype = str(getattr(vector, "dtype", "")) if "[pyarrow]" in vec_dtype: # handle pyarrow date32/date64 dtypes array = vector.to_numpy( dtype=np.datetime64 if "date" in vec_dtype else None ) else: array = np.asarray(vector) - arrays.append(array) + arrays.append(as_c_contiguous(array)) return arrays From 4f40f65f43cdac9e21ec9fb821a36f46c4ea7542 Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Mon, 4 Dec 2023 09:31:11 +1300 Subject: [PATCH 3/6] Add doctest to check that date32/date64 are converted to datetime64 Ensure that pyarrow date32 and date64 dtypes are converted to numpy.datetime64 dtype. Added pyarrow dependency to ci_doctests.yaml. Also changed from using `"date" in vec_dtype` to `vec_dtype.startswith("date")`. 
--- .github/workflows/ci_doctests.yaml | 1 + pygmt/clib/conversion.py | 24 +++++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci_doctests.yaml b/.github/workflows/ci_doctests.yaml index abd52b08555..b7c87cd733c 100644 --- a/.github/workflows/ci_doctests.yaml +++ b/.github/workflows/ci_doctests.yaml @@ -58,6 +58,7 @@ jobs: contextily geopandas ipython + pyarrow rioxarray build make diff --git a/pygmt/clib/conversion.py b/pygmt/clib/conversion.py index 7cd3d357ce6..97216b935a5 100644 --- a/pygmt/clib/conversion.py +++ b/pygmt/clib/conversion.py @@ -162,16 +162,38 @@ def vectors_to_arrays(vectors): True >>> all(isinstance(i, np.ndarray) for i in arrays) True + >>> data = [[1, 2], (3, 4), range(5, 7)] >>> all(isinstance(i, np.ndarray) for i in vectors_to_arrays(data)) True + + >>> import datetime + >>> import pytest + >>> pa = pytest.importorskip("pyarrow") + >>> vectors = [ + ... pd.Series( + ... data=[datetime.date(2020, 1, 1), datetime.date(2021, 12, 31)], + ... dtype="date32[day][pyarrow]", + ... ), + ... pd.Series( + ... data=[datetime.date(2022, 1, 1), datetime.date(2023, 12, 31)], + ... dtype="date64[ms][pyarrow]", + ... ), + ... 
] + >>> arrays = vectors_to_arrays(vectors) + >>> all(a.flags.c_contiguous for a in arrays) + True + >>> all(isinstance(a, np.ndarray) for a in arrays) + True + >>> all(isinstance(a.dtype, np.dtypes.DateTime64DType) for a in arrays) + True """ arrays = [] for vector in vectors: vec_dtype = str(getattr(vector, "dtype", "")) if "[pyarrow]" in vec_dtype: # handle pyarrow date32/date64 dtypes array = vector.to_numpy( - dtype=np.datetime64 if "date" in vec_dtype else None + dtype=np.datetime64 if vec_dtype.startswith("date") else None ) else: array = np.asarray(vector) From ca0989a36c07c6216cd81300a14fde406e310add Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Sat, 16 Dec 2023 17:08:18 +1300 Subject: [PATCH 4/6] Refactor to use pygmt.helpers.testing.skip_if_no --- pygmt/tests/test_info.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pygmt/tests/test_info.py b/pygmt/tests/test_info.py index 81aca18c38d..129259b93f1 100644 --- a/pygmt/tests/test_info.py +++ b/pygmt/tests/test_info.py @@ -8,11 +8,11 @@ import numpy as np import numpy.testing as npt import pandas as pd -import pandas.util._test_decorators as td import pytest import xarray as xr from pygmt import info from pygmt.exceptions import GMTInvalidInput +from pygmt.helpers.testing import skip_if_no TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "data") POINTS_DATA = os.path.join(TEST_DATA_DIR, "points.txt") @@ -112,8 +112,8 @@ def test_info_numpy_array_time_column(): "dtype", [ "datetime64[ns]", - pytest.param("date32[day][pyarrow]", marks=td.skip_if_no(package="pyarrow")), - pytest.param("date64[ms][pyarrow]", marks=td.skip_if_no(package="pyarrow")), + pytest.param("date32[day][pyarrow]", marks=skip_if_no(package="pyarrow")), + pytest.param("date64[ms][pyarrow]", marks=skip_if_no(package="pyarrow")), ], ) def test_info_pandas_dataframe_date_column(dtype): From d0c1dad321a1c7f8e3381c956e642f46acc77eb9 Mon Sep 17 00:00:00 2001 From: Wei Ji 
<23487320+weiji14@users.noreply.github.com> Date: Sat, 16 Dec 2023 19:38:28 +1300 Subject: [PATCH 5/6] Document that PyArrow date32/date64 dtypes are now supported in PyGMT --- doc/install.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/install.rst b/doc/install.rst index f3594e52521..504eb87a911 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -112,8 +112,8 @@ The following are optional dependencies: If you have `PyArrow <https://arrow.apache.org/docs/python/index.html>`__ installed, PyGMT does have some initial support for ``pandas.Series`` and ``pandas.DataFrame`` objects with Apache Arrow-backed arrays. Specifically, - only uint/int/float dtypes are supported for now. Support for datetime and - string Arrow dtypes are still working in progress. For more details, see + only uint/int/float and date32/date64 dtypes are supported for now. Support + for string Arrow dtypes is still a work in progress. For more details, see `issue #2800 <https://github.com/GenericMappingTools/pygmt/issues/2800>`__. Installing GMT and other dependencies From de08508d82fe2e5e79255542ca65193010320699 Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Sun, 17 Dec 2023 15:52:50 +1300 Subject: [PATCH 6/6] Refactor to use dict mapping instead of if-then Co-Authored-By: Dongdong Tian --- pygmt/clib/conversion.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pygmt/clib/conversion.py b/pygmt/clib/conversion.py index 97216b935a5..2ce0299ef3b 100644 --- a/pygmt/clib/conversion.py +++ b/pygmt/clib/conversion.py @@ -188,15 +188,14 @@ def vectors_to_arrays(vectors): >>> all(isinstance(a.dtype, np.dtypes.DateTime64DType) for a in arrays) True """ + dtypes = { + "date32[day][pyarrow]": np.datetime64, + "date64[ms][pyarrow]": np.datetime64, + } arrays = [] for vector in vectors: vec_dtype = str(getattr(vector, "dtype", "")) - if "[pyarrow]" in vec_dtype: # handle pyarrow date32/date64 dtypes - array = vector.to_numpy( - dtype=np.datetime64 if vec_dtype.startswith("date") else None - ) - else: - array = 
np.asarray(vector) + array = np.asarray(a=vector, dtype=dtypes.get(vec_dtype, None)) arrays.append(as_c_contiguous(array)) return arrays