From 2c963a6280c209fd637a22b3f143b8662e6e6378 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Fri, 1 Dec 2023 17:21:43 +0800 Subject: [PATCH 1/5] geopandas: Convert columns with big 64-bit integers to float type --- pygmt/helpers/tempfile.py | 11 +++++- pygmt/tests/test_geopandas.py | 72 +++++++++++++++++++++++++++-------- 2 files changed, 66 insertions(+), 17 deletions(-) diff --git a/pygmt/helpers/tempfile.py b/pygmt/helpers/tempfile.py index 43cc68b3f63..2642d932807 100644 --- a/pygmt/helpers/tempfile.py +++ b/pygmt/helpers/tempfile.py @@ -131,7 +131,10 @@ def tempfile_from_geojson(geojson): os.remove(tmpfile.name) # ensure file is deleted first ogrgmt_kwargs = {"filename": tmpfile.name, "driver": "OGR_GMT", "mode": "w"} try: - # Map int/int64 to int32 since OGR_GMT only supports 32-bit integer + # OGR_GMT only support 32-bit integers. + # We need to map int/int64 types: + # 1. to int32 type if the column is a 32-bit integer + # 2. to float64 type if the column is big 64-bit integer # https://github.com/geopandas/geopandas/issues/967#issuecomment-842877704 # https://github.com/GenericMappingTools/pygmt/issues/2497 if geojson.index.name is None: @@ -140,7 +143,11 @@ def tempfile_from_geojson(geojson): schema = gpd.io.file.infer_schema(geojson) for col, dtype in schema["properties"].items(): if dtype in ("int", "int64"): - schema["properties"][col] = "int32" + if geojson[col].abs().max() <= 2**31 - 1: + schema["properties"][col] = "int32" + else: + schema["properties"][col] = "float" + geojson[col] = geojson[col].astype(dtype=float) ogrgmt_kwargs["schema"] = schema # Using geopandas.to_file to directly export to OGR_GMT format geojson.to_file(**ogrgmt_kwargs) diff --git a/pygmt/tests/test_geopandas.py b/pygmt/tests/test_geopandas.py index adc7dd7f99c..699034b3727 100644 --- a/pygmt/tests/test_geopandas.py +++ b/pygmt/tests/test_geopandas.py @@ -36,7 +36,27 @@ def fixture_gdf(): index=["multipolygon", "polygon", "linestring"], geometry=[multipolygon, polygon, linestring], ) + return gdf + +@pytest.fixture(scope="module", name="gdf_ridge") +def fixture_gdf_ridge(): + """ + Read a @RidgeTest.shp shapefile with geopandas.GeoDataFrame and reproject + the geometry. + """ + # Read shapefile in geopandas.GeoDataFrame + shapefile = which( + fname="@RidgeTest.shp @RidgeTest.shx @RidgeTest.dbf @RidgeTest.prj", + download="c", + ) + gdf = gpd.read_file(shapefile[0]) + # Reproject geometry + gdf["geometry"] = ( + gdf.to_crs(crs="EPSG:3857") + .buffer(distance=100000) + .to_crs(crs="OGC:CRS84") # convert to lon/lat to prevent @null in PROJ CRS + ) return gdf @@ -144,7 +164,7 @@ def test_geopandas_plot3d_non_default_circle(): ], ) @pytest.mark.mpl_image_compare(filename="test_geopandas_plot_int_dtypes.png") -def test_geopandas_plot_int_dtypes(dtype): +def test_geopandas_plot_int_dtypes(gdf_ridge, dtype): """ Check that plotting a geopandas GeoDataFrame with integer columns works, including int32 and int64 (non-nullable), Int32 and Int64 (nullable). @@ -152,26 +172,46 @@ def test_geopandas_plot_int_dtypes(dtype): This is a regression test for https://github.com/GenericMappingTools/pygmt/issues/2497 """ - # Read shapefile in geopandas.GeoDataFrame - shapefile = which( - fname="@RidgeTest.shp @RidgeTest.shx @RidgeTest.dbf @RidgeTest.prj", - download="c", - ) - gdf = gpd.read_file(shapefile[0]) + # Convert NPOINTS column to integer type + gdf_ridge["NPOINTS"] = gdf_ridge.NPOINTS.astype(dtype=dtype) - # Reproject geometry and change dtype of NPOINTS column - gdf["geometry"] = ( - gdf.to_crs(crs="EPSG:3857") - .buffer(distance=100000) - .to_crs(crs="OGC:CRS84") # convert to lon/lat to prevent @null in PROJ CRS + # Plot figure with three polygons colored based on NPOINTS value + fig = Figure() + makecpt(cmap="lisbon", series=[10, 60, 10], continuous=True) + fig.plot( + data=gdf_ridge, + frame=True, + pen="1p,black", + close=True, + fill="+z", + cmap=True, + aspatial="Z=NPOINTS", ) - gdf["NPOINTS"] = gdf.NPOINTS.astype(dtype=dtype) + fig.colorbar() + return fig + + +@pytest.mark.mpl_image_compare(filename="test_geopandas_plot_int_dtypes.png") +def test_geopandas_plot_int64_as_float(gdf_ridge): + """ + Check that big 64-bit integers are correctly mapped to float type in + geopandas.GeoDataFrame object. + """ + factor = 2**32 + # Convert NPOINTS column to int64 type and make big integers + gdf_ridge["NPOINTS"] = gdf_ridge.NPOINTS.astype(dtype="int64") + gdf_ridge["NPOINTS"] *= factor + + # Make sure the column is bigger than the largest 32-bit integer + assert gdf_ridge["NPOINTS"].abs().max() > 2**31 - 1 # Plot figure with three polygons colored based on NPOINTS value fig = Figure() - makecpt(cmap="lisbon", series=[10, 60, 10], continuous=True) + makecpt( + cmap="lisbon", series=[10 * factor, 60 * factor, 10 * factor], continuous=True + ) fig.plot( - data=gdf, + data=gdf_ridge, frame=True, pen="1p,black", close=True, @@ -179,5 +219,7 @@ def test_geopandas_plot_int_dtypes(dtype): cmap=True, aspatial="Z=NPOINTS", ) + # Generate a CPT for 10-60 range and plot to reuse the baseline image + makecpt(cmap="lisbon", series=[10, 60, 10], continuous=True) fig.colorbar() return fig From 2aa87e45113977ba09c55ddf6d94d9a8ca37254b Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Sat, 2 Dec 2023 16:15:08 +0800 Subject: [PATCH 2/5] No need to convert dtype --- pygmt/helpers/tempfile.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pygmt/helpers/tempfile.py b/pygmt/helpers/tempfile.py index 2642d932807..62f6bbf90be 100644 --- a/pygmt/helpers/tempfile.py +++ b/pygmt/helpers/tempfile.py @@ -143,11 +143,8 @@ def tempfile_from_geojson(geojson): schema = gpd.io.file.infer_schema(geojson) for col, dtype in schema["properties"].items(): if dtype in ("int", "int64"): - if geojson[col].abs().max() <= 2**31 - 1: - schema["properties"][col] = "int32" - else: - schema["properties"][col] = "float" - geojson[col] = geojson[col].astype(dtype=float) + overflow = geojson[col].abs().max() > 2**31 - 1 + schema["properties"][col] = "float" if overflow else "int32" ogrgmt_kwargs["schema"] = schema # Using geopandas.to_file to directly export to OGR_GMT format geojson.to_file(**ogrgmt_kwargs) From bb520305791e57c0913f463150fb21b64e46fafd Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Sun, 3 Dec 2023 18:03:23 +0800 Subject: [PATCH 3/5] Update pygmt/helpers/tempfile.py Co-authored-by: Michael Grund <23025878+michaelgrund@users.noreply.github.com> --- pygmt/helpers/tempfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygmt/helpers/tempfile.py b/pygmt/helpers/tempfile.py index 62f6bbf90be..97bce754d32 100644 --- a/pygmt/helpers/tempfile.py +++ b/pygmt/helpers/tempfile.py @@ -131,7 +131,7 @@ def tempfile_from_geojson(geojson): os.remove(tmpfile.name) # ensure file is deleted first ogrgmt_kwargs = {"filename": tmpfile.name, "driver": "OGR_GMT", "mode": "w"} try: - # OGR_GMT only support 32-bit integers. + # OGR_GMT only supports 32-bit integers. # We need to map int/int64 types: # 1. to int32 type if the column is a 32-bit integer # 2. to float64 type if the column is big 64-bit integer From 320b1ae5a7f988107d18a899ad872365ccf6b4a6 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Sun, 3 Dec 2023 18:15:33 +0800 Subject: [PATCH 4/5] Improve the comments --- pygmt/helpers/tempfile.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pygmt/helpers/tempfile.py b/pygmt/helpers/tempfile.py index 97bce754d32..057384926e7 100644 --- a/pygmt/helpers/tempfile.py +++ b/pygmt/helpers/tempfile.py @@ -131,10 +131,9 @@ def tempfile_from_geojson(geojson): os.remove(tmpfile.name) # ensure file is deleted first ogrgmt_kwargs = {"filename": tmpfile.name, "driver": "OGR_GMT", "mode": "w"} try: - # OGR_GMT only supports 32-bit integers. - # We need to map int/int64 types: - # 1. to int32 type if the column is a 32-bit integer - # 2. to float64 type if the column is big 64-bit integer + # OGR_GMT only supports 32-bit integers. We need to map int/int64 + # types to int32/float type depending on if the column has an + # 32-bit integer overflow issue. Related issues: # https://github.com/geopandas/geopandas/issues/967#issuecomment-842877704 # https://github.com/GenericMappingTools/pygmt/issues/2497 if geojson.index.name is None: From 9ee01449961778e77600be8e58cb3d029b2a99d1 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Sun, 3 Dec 2023 19:47:10 +0800 Subject: [PATCH 5/5] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Yvonne Fröhlich <94163266+yvonnefroehlich@users.noreply.github.com> --- pygmt/helpers/tempfile.py | 2 +- pygmt/tests/test_geopandas.py | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pygmt/helpers/tempfile.py b/pygmt/helpers/tempfile.py index 057384926e7..e09923a97a5 100644 --- a/pygmt/helpers/tempfile.py +++ b/pygmt/helpers/tempfile.py @@ -132,7 +132,7 @@ def tempfile_from_geojson(geojson): ogrgmt_kwargs = {"filename": tmpfile.name, "driver": "OGR_GMT", "mode": "w"} try: # OGR_GMT only supports 32-bit integers. We need to map int/int64 - # types to int32/float type depending on if the column has an + # types to int32/float types depending on if the column has an # 32-bit integer overflow issue. Related issues: # https://github.com/geopandas/geopandas/issues/967#issuecomment-842877704 # https://github.com/GenericMappingTools/pygmt/issues/2497 diff --git a/pygmt/tests/test_geopandas.py b/pygmt/tests/test_geopandas.py index 699034b3727..58e59b374d4 100644 --- a/pygmt/tests/test_geopandas.py +++ b/pygmt/tests/test_geopandas.py @@ -42,16 +42,16 @@ def fixture_gdf(): @pytest.fixture(scope="module", name="gdf_ridge") def fixture_gdf_ridge(): """ - Read a @RidgeTest.shp shapefile with geopandas.GeoDataFrame and reproject + Read a @RidgeTest.shp shapefile into a geopandas.GeoDataFrame and reproject the geometry. """ - # Read shapefile in geopandas.GeoDataFrame + # Read shapefile into a geopandas.GeoDataFrame shapefile = which( fname="@RidgeTest.shp @RidgeTest.shx @RidgeTest.dbf @RidgeTest.prj", download="c", ) gdf = gpd.read_file(shapefile[0]) - # Reproject geometry + # Reproject the geometry gdf["geometry"] = ( gdf.to_crs(crs="EPSG:3857") .buffer(distance=100000) @@ -166,7 +166,7 @@ def test_geopandas_plot3d_non_default_circle(): @pytest.mark.mpl_image_compare(filename="test_geopandas_plot_int_dtypes.png") def test_geopandas_plot_int_dtypes(gdf_ridge, dtype): """ - Check that plotting a geopandas GeoDataFrame with integer columns works, + Check that plotting a geopandas.GeoDataFrame with integer columns works, including int32 and int64 (non-nullable), Int32 and Int64 (nullable). This is a regression test for @@ -182,7 +182,6 @@ def test_geopandas_plot_int_dtypes(gdf_ridge, dtype): data=gdf_ridge, frame=True, pen="1p,black", - close=True, fill="+z", cmap=True, aspatial="Z=NPOINTS", @@ -214,7 +213,6 @@ def test_geopandas_plot_int64_as_float(gdf_ridge): data=gdf_ridge, frame=True, pen="1p,black", - close=True, fill="+z", cmap=True, aspatial="Z=NPOINTS",