diff --git a/pynsee/geodata/_add_insee_dep.py b/pynsee/geodata/_add_insee_dep.py deleted file mode 100644 index d2b63a99..00000000 --- a/pynsee/geodata/_add_insee_dep.py +++ /dev/null @@ -1,164 +0,0 @@ -from tqdm import trange -from pandas.api.types import CategoricalDtype -import numpy as np -import pandas as pd - -from ._get_geodata_with_backup import _get_geodata_with_backup - - -def _add_insee_dep(gdf): - gdf = gdf.reset_index(drop=True) - - # option 1 : get insee_dep from id_com colum - gdf = _add_insee_dep_from_id_com(gdf) - - # option 2 : get insee_dep for regions - if "insee_dep" not in gdf: - gdf = _add_insee_dep_region(gdf) - - # option 3 : get insee_dep from get_geodata and polygon intersection - if "insee_dep" not in gdf: - gdf = _add_insee_dep_from_geodata(gdf) - - if "insee_dep_geometry" not in gdf: - raise ValueError("Could not get department geometries.") - - return gdf - - -def _add_insee_dep_from_id_com(gdf): - # add insee_dep column - - if "cleabs" in gdf.columns: - if all(gdf.cleabs.str.match("^COMMUNE")): - if "code_insee_du_departement" not in gdf.columns: - - dataset_id = "ADMINEXPRESS-COG-CARTO.LATEST:commune" - com = _get_geodata_with_backup(dataset_id).to_crs("EPSG:3857") - - com = com[["cleabs", "code_insee_du_departement"]] - gdf = gdf.merge(com, on="cleabs", how="left") - - if "code_insee_du_departement" in gdf.columns: - # get departments and add the geometry - dataset_id = "ADMINEXPRESS-COG-CARTO.LATEST:departement" - dep = _get_geodata_with_backup(dataset_id).to_crs("EPSG:3857") - - dep = dep[["code_insee", "geometry"]] - dep.rename( - columns={ - "geometry": "insee_dep_geometry", - "code_insee": "code_insee_du_departement", - }, - inplace=True - ) - - gdf = gdf.merge( - dep, on="code_insee_du_departement", how="left" - ).assign( - insee_dep_geometry=lambda x: np.where( - x["code_insee_du_departement"] == "NR", - x["geometry"], - x["insee_dep_geometry"], - ) - ) - - return gdf - - -def _add_insee_dep_region(gdf): - try: - if "cleabs" in gdf.columns and all(gdf.cleabs.str.match("^REGION")): - dataset_id = "ADMINEXPRESS-COG-CARTO.LATEST:departement" - - # get departments, keep only one for each region - dep = ( - _get_geodata_with_backup(dataset_id) - .drop_duplicates( - subset="code_insee_de_la_region", keep="first" - ) - .to_crs("EPSG:3857") - ) - - dep = dep[["code_insee", "code_insee_de_la_region", "geometry"]] - dep.rename( - columns={ - "geometry": "insee_dep_geometry", - }, - inplace=True - ) - - gdf = gdf.merge(dep, on="insee_reg", how="left") - except Exception: - pass - - return gdf - - -def _add_insee_dep_from_geodata(gdf): - try: - if "insee_dep_geometry" not in gdf.columns: - gdf = gdf.reset_index(drop=True) - - list_dep = [] - list_dep_geo = [] - - dataset_id = "ADMINEXPRESS-COG-CARTO.LATEST:departement" - dep_list = _get_geodata_with_backup(dataset_id).to_crs("EPSG:3857") - - com = _get_geodata_with_backup( - "ADMINEXPRESS-COG-CARTO.LATEST:commune" - ).to_crs("EPSG:3857") - stpm = com.loc[lambda x: x["code_insee"].str.contains("^975")] - stpmGeo = stpm.geometry.union_all() - - dep_list = pd.concat( - [ - pd.DataFrame( - {"code_insee": "NR", "geometry": stpmGeo}, index=[0] - ), - dep_list, - ] - ) - - list_ovdep = ["971", "972", "974", "NR", "973", "976"] - list_other_dep = [ - d for d in dep_list.code_insee if d not in list_ovdep - ] - dep_order = list_ovdep + list_other_dep - - dep_list["code_insee"] = dep_list["code_insee"].astype( - CategoricalDtype(categories=dep_order, ordered=True) - ) - - dep_list = dep_list.sort_values(["code_insee"]).reset_index( - drop=True - ) - - for i in trange(len(gdf.index), desc="Finding departement"): - geo = gdf.loc[i, "geometry"] - dep = None - depgeo = None - - try: - for j in dep_list.index: - depgeo = dep_list.loc[j, "geometry"] - if geo.intersects(depgeo): - - dep = dep_list.loc[j, "code_insee"] - - break - else: - depgeo = None - except Exception: - pass - - list_dep += [dep] - list_dep_geo += [depgeo] - - gdf["code_insee_du_departement"] = list_dep - gdf["insee_dep_geometry"] = list_dep_geo - except Exception: - pass - - return gdf diff --git a/pynsee/geodata/translate_and_zoom.py b/pynsee/geodata/translate_and_zoom.py index 4356e90a..14966cc6 100644 --- a/pynsee/geodata/translate_and_zoom.py +++ b/pynsee/geodata/translate_and_zoom.py @@ -1,5 +1,6 @@ import logging import math +import re from typing import Optional import pandas as pd @@ -10,12 +11,55 @@ from ._make_offshore_points import _make_offshore_points from ._rescale_geom import _rescale_geom from ._get_center import _get_center -from ._add_insee_dep import _add_insee_dep - +from ._get_geodata_with_backup import _get_geodata_with_backup +from pynsee.utils.save_df import save_df logger = logging.getLogger(__name__) +@save_df(cls=GeoDataFrame, day_lapse_max=30) +def _deps_with_valid_coverage() -> GeoDataFrame: + """ + Inner function used to create a geodataframe of french departments, safe to + use for a spatial join. + + First (if overlaps detected) a valid coverage is enforced (meaning polygons + do not overlap and are sharing edges) with a simplification of 1 meter. + After that, a negative 10 meters buffer is applied to prevent duplicates + when running a spatial join with intersects predicate. + This function uses a 30 day cache storage. + + Returns + ------- + GeoDataFrame + Departments' geodataset + Only two columns : code_insee_du_departement, insee_dep_geometry + + """ + dataset_id = "ADMINEXPRESS-COG-CARTO.LATEST:departement" + dep = _get_geodata_with_backup(dataset_id).to_crs("EPSG:3857") + dep = dep[["code_insee", "geometry"]].rename( + columns={ + "code_insee": "code_insee_du_departement", + "geometry": "insee_dep_geometry", + } + ) + dep = dep.set_geometry("insee_dep_geometry") + + x = dep.sjoin(dep, how="left", predicate="overlaps").query( + "~code_insee_du_departement_right.isnull()" + ) + if not x.empty: + # Geometries overlap: + # Force coverage validity (non-overlapping, edge-matched polygons) + # (This shouldn't be necessary with modern ADMINEXPRESS datasets and + # is here as a backup safe) + dep["insee_dep_geometry"] = dep.simplify_coverage(1) + dep["insee_dep_geometry"] = dep["insee_dep_geometry"].buffer(-1) + + return dep + + def transform_overseas( gdf: GeoDataFrame, departement: tuple[str, ...] = ("971", "972", "974", "973", "976", "NR"), @@ -74,7 +118,46 @@ def transform_overseas( gdf["code_insee_du_departement"] = gdf["code_insee"] gdf["insee_dep_geometry"] = gdf["geometry"] else: - gdf = _add_insee_dep(gdf.copy()).reset_index(drop=True) + + # retrieve safe deps geometries + dep_cov = _deps_with_valid_coverage() + + # detect INSEE department's codes + # (available in some of IGN's geodatasets, but with different patterns) + pattern = "codes?.*?insee.*?departement" + dep = gdf.columns[gdf.columns.str.match(pattern, flags=re.IGNORECASE)] + if not dep.empty: + dep = dep[0] + deps = gdf[dep].str.extractall("([0-9AB]{2,3})") + deps.index = deps.index.droplevel(-1) + deps = deps.loc[~deps.index.duplicated(keep="first")] + gdf["code_insee_du_departement"] = deps[0] + + else: + + # retrieve department's codes using a spatial join. The negative + # buffer on deps should be safe to be used without any duplication + # of gdf, except when there is a valid overlapping (ie regions + # covering multiple deps, interdep epcis...). + # In case of duplication (for instance, starting from REGION) keep + # only first dep geometry + gdf = gdf.sjoin(dep_cov, how="left") + gdf = gdf.loc[~gdf.index.duplicated(keep="first")] + gdf = gdf.drop("index_right", axis=1) + + gdf["code_insee_du_departement"] = gdf[ + "code_insee_du_departement" + ].fillna("NR") + + # Retrieve simplified geometries for deps. Note that it used a negative + # buffer (10 meters) which should not alter geographic transformations + # given it's range + gdf = gdf.merge(dep_cov, on="code_insee_du_departement", how="left") + + # where dep geom is still missing, use initial geometry + gdf["insee_dep_geometry"] = gdf["insee_dep_geometry"].combine_first( + gdf.geometry + ) offshore_points = _make_offshore_points( center=Point(center), diff --git a/pynsee/localdata/get_population.py b/pynsee/localdata/get_population.py index df2a1090..ff9e4e05 100644 --- a/pynsee/localdata/get_population.py +++ b/pynsee/localdata/get_population.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from functools import lru_cache +import warnings from pynsee.geodata.get_geodata import get_geodata @@ -14,6 +15,13 @@ def get_population(): >>> pop = get_population() """ + warnings.warn( + "get_population is deprecated. Please use " + "`from pynsee import get_geodata;get_geodata('ADMINEXPRESS-COG-CARTO.LATEST:commune')` " + "instead", + category=FutureWarning, + ) + df = get_geodata("ADMINEXPRESS-COG-CARTO.LATEST:commune") return df diff --git a/pyproject.toml b/pyproject.toml index d12680e0..aa6160d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ keywords = [ "INSEE", "IGN", "statistics", "demography", "geospatial", "France", "GIS", "statistique", "démographie", "géospatial", "SIG" ] -requires-python = ">=3.9" +requires-python = ">=3.10" dynamic = ["dependencies", "optional-dependencies", "version"] classifiers = [ "Development Status :: 5 - Production/Stable", diff --git a/requirements.txt b/requirements.txt index 279716c4..1ba2325f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,11 @@ pandas >= 2.2.0, < 4.0 -geopandas >= 1.0, < 2.0 +geopandas >= 1.1.0, < 2.0 +shapely >= 2.1 pyarrow >= 17.0 tqdm >= 4.56 platformdirs >= 4 unidecode >= 1.3 urllib3 >= 2.2 -shapely >= 2 requests >= 2.32 requests-ratelimiter >= 0.7 openpyxl diff --git a/tests/localdata/test_pynsee_localdata.py b/tests/localdata/test_pynsee_localdata.py index 7ea81030..11ef1bca 100644 --- a/tests/localdata/test_pynsee_localdata.py +++ b/tests/localdata/test_pynsee_localdata.py @@ -14,7 +14,6 @@ from pynsee.localdata.get_local_data import get_local_data from pynsee.localdata.get_nivgeo_list import get_nivgeo_list from pynsee.localdata.get_local_metadata import get_local_metadata -from pynsee.localdata.get_population import get_population from pynsee.localdata.get_old_city import get_old_city from pynsee.localdata.get_new_city import get_new_city from pynsee.localdata.get_area_projection import get_area_projection @@ -29,11 +28,6 @@ class TestFunction(TestCase): - def test_get_population(self): - df = get_population() - test = isinstance(df, pd.DataFrame) - self.assertTrue(test) - def test_get_insee_one_area_1(self): def get_insee_one_area_test(area_type="derf", codearea="c"): _get_insee_one_area(area_type=area_type, codearea=codearea)