diff --git a/Run_the_test.ipynb b/Run_the_test.ipynb
new file mode 100644
index 0000000..f0ddebd
--- /dev/null
+++ b/Run_the_test.ipynb
@@ -0,0 +1,471 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "df382f7e-ca94-4898-accc-8819a4e86511",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Found esmf.mk at: /home/malasawedah/.conda/envs/test2/lib/esmf.mk\n"
+     ]
+    }
+   ],
+   "source": [
+    "import datetime\n",
+    "import geopandas as gpd\n",
+    "import pandas as pd\n",
+    "import xarray as xr\n",
+    "import numpy as np\n",
+    "import shapely\n",
+    "from shapely import Point, box\n",
+    "import xagg as xa\n",
+    "import rioxarray\n",
+    "\n",
+    "# suppress warnings\n",
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\", category=RuntimeWarning)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "94c73369-9b46-4b05-9fd7-050402aafcc8",
+   "metadata": {},
+   "source": [
+    "#### Test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "afc09af7-5b96-4cd9-a106-f5f4dccbe6b9",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pytest\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import xarray as xr\n",
+    "import shutil\n",
+    "import geopandas as gpd\n",
+    "from geopandas import testing as gpdt\n",
+    "from unittest import TestCase\n",
+    "from shapely.geometry import Polygon\n",
+    "from functools import reduce\n",
+    "\n",
+    "from xagg.core import (process_weights,create_raster_polygons,get_pixel_overlaps,aggregate,read_wm)\n",
+    "from xagg.wrappers import (pixel_overlaps)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "66c598e0-1bc1-4922-9809-09c7be26db2c",
+   "metadata": {},
+   "source": [
+    "### test_export"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "97f72288-2e3d-4cfc-9efd-2eefeb13bc8a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "ds = xr.Dataset({'test':(['lon','lat','run'],np.array([[[0,1],[2,3]],[[0,1],[2,3]]])),\n",
+    "\t\t\t\t 'lat_bnds':(['lat','bnds'],np.array([[-0.5,0.5],[0.5,1.5]])),\n",
+    "\t\t\t\t 'lon_bnds':(['lon','bnds'],np.array([[-0.5,0.5],[0.5,1.5]]))},\n",
+    "\t\t\t\tcoords={'lat':(['lat'],np.array([0,1])),\n",
+    "\t\t\t\t\t\t'lon':(['lon'],np.array([0,1])),\n",
+    "\t\t\t\t\t\t'run':(['run'],np.array([0,1])),\n",
+    "\t\t\t\t\t\t'bnds':(['bnds'],np.array([0,1]))})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "97157f7a-d871-4e45-a1f5-afeb36a3b191",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "creating polygons for each pixel...\n",
+      "calculating overlaps between pixels and output polygons...\n",
+      "success!\n",
+      "aggregating test by weighted_mean...\n",
+      "all variables aggregated to polygons!\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Create polygon covering multiple pixels\n",
+    "gdf = {'name':['test'],\n",
+    "\t\t\t'geometry':[Polygon([(0,0),(0,1),(1,1),(1,0),(0,0)])]}\n",
+    "gdf = gpd.GeoDataFrame(gdf,crs=\"EPSG:4326\")\n",
+    "\n",
+    "# Get pixel overlaps\n",
+    "wm = pixel_overlaps(ds,gdf)\n",
+    "\n",
+    "# Get aggregate\n",
+    "agg = aggregate(ds,wm, stat=['weighted_mean'])"
+   ]
+  },
+  {
+   "cell_type": "code",
"execution_count": 6, + "id": "c50a4543-a112-42a5-97ee-739d00c4fc1b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "################# Test to data set ################\n", + "ds_out = agg.to_dataset()\n", + "\n", + "# Build reference output dataset\n", + "ds_ref = xr.Dataset({'name':(['poly_idx'],np.array(['test']).astype(object)),\n", + " 'test_weighted_mean':(['poly_idx','run'],np.array([[1.0,2.0]]))},\n", + " coords={'poly_idx':(['poly_idx'],np.array([0])),\n", + " 'run':(['run'],np.array([0,1]))})\n", + "\n", + "# Assert equal within tolerance, again likely due to very slight \n", + "# variation off from actual 1.0, 2.0 due to crs\n", + "xr.testing.assert_allclose(ds_out,ds_ref,atol=0.0001)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fa970853-6809-4577-9aec-7818f7bc38f9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "################# Test to dataframe ################\n", + "\n", + "\n", + "df_out = agg.to_dataframe()\n", + "\n", + "# Build reference output dataframe\n", + "df_ref = pd.DataFrame({'poly_idx':[0,0],'run':[0,1],'name':['test','test'],'test_weighted_mean':[0.9999,1.9999]})\n", + "df_ref = df_ref.set_index(['poly_idx','run'])\n", + "\n", + "# Assert equal within tolerance, again likely due to very slight \n", + "# variation off from actual 1.0, 2.0 due to crs\n", + "pd.testing.assert_frame_equal(df_out,df_ref,atol=0.0001)" + ] + }, + { + "cell_type": "markdown", + "id": "7c5b8d78-63b7-47f4-9842-b56a8ed0dd2b", + "metadata": {}, + "source": [ + "### test_core" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1ceb2cd1-f33c-44ab-90ef-efb4e66b93a0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# build raster polygons from a simple 2x2 grid of lat/lon pixels\n", + "ds = xr.Dataset({'test':(['lon','lat'],np.array([[0,1],[2,3]])),\n", + "\t\t\t\t 'lat_bnds':(['lat','bnds'],np.array([[-0.5,0.5],[0.5,1.5]])),\n", + "\t\t\t\t 'lon_bnds':(['lon','bnds'],np.array([[-0.5,0.5],[0.5,1.5]]))},\n", + "\t\t\t\tcoords={'lat':(['lat'],np.array([0,1])),\n", + "\t\t\t\t\t\t'lon':(['lon'],np.array([0,1])),\n", + "\t\t\t\t\t\t'bnds':(['bnds'],np.array([0,1]))})\n", + "\n", + "# add a simple weights grid\n", + "weights = xr.DataArray(data=np.array([[0,1],[2,3]]),\n", + "\t\t\t\t\t\t\tdims=['lat','lon'],\n", + "\t\t\t\t\t\t\tcoords=[ds.lat,ds.lon])\n", + "\n", + "# calculate the pix_agg variable tested above, to be used in the \n", + "# tests below\n", + "pix_agg = create_raster_polygons(ds,weights=weights)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b4c88425-d20c-48f6-ae1b-7e0fa56eda99", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "aggregating test by weighted_mean...\n", + "all variables aggregated to polygons!\n" + ] + } + ], + "source": [ + "############# test_get_pixel_overlaps_gdf_wpreexisting_index ##################\n", + "ds1 = xr.Dataset({'test':(['lon','lat'],np.array([[np.nan,np.nan],[np.nan,np.nan]])),\n", + " 'lat_bnds':(['lat','bnds'],np.array([[-0.5,0.5],[0.5,1.5]])),\n", + " 'lon_bnds':(['lon','bnds'],np.array([[-0.5,0.5],[0.5,1.5]]))},\n", + " coords={'lat':(['lat'],np.array([0,1])),\n", + " 'lon':(['lon'],np.array([0,1])),\n", + " 'bnds':(['bnds'],np.array([0,1]))})\n", + "\n", + "# get aggregation mapping\n", + "pix_agg = create_raster_polygons(ds1)\n", + "\n", + "# Create polygon covering multiple pixels\n", + "gdf = {'name':['test'],\n", + " 
+    "       'geometry':[Polygon([(0,0),(0,1),(1,1),(1,0),(0,0)])]}\n",
+    "gdf = gpd.GeoDataFrame(gdf,crs=\"EPSG:4326\")\n",
+    "\n",
+    "\n",
+    "# Get pixel overlaps\n",
+    "wm = get_pixel_overlaps(gdf,pix_agg)\n",
+    "\n",
+    "# Get aggregate\n",
+    "agg = aggregate(ds,wm)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "f7b1222d-66dd-44c5-a251-140a1a352a05",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "aggregating test by weighted_mean...\n",
+      "all variables aggregated to polygons!\n"
+     ]
+    }
+   ],
+   "source": [
+    "############# test_aggregate_basic #################\n",
+    "gdf = {'name':['test'],\n",
+    "       'geometry':[Polygon([(0,0),(0,1),(1,1),(1,0),(0,0)])]}\n",
+    "gdf = gpd.GeoDataFrame(gdf,crs=\"EPSG:4326\")\n",
+    "\n",
+    "# calculate the pix_agg variable used in the tests below\n",
+    "pix_agg = create_raster_polygons(ds)\n",
+    "\n",
+    "# Get pixel overlaps\n",
+    "wm = get_pixel_overlaps(gdf,pix_agg)\n",
+    "\n",
+    "# Get aggregate\n",
+    "agg = aggregate(ds,wm)\n",
+    "\n",
+    "# The comparison needs rtol=1e-4: the computed value is 1.499981,\n",
+    "# whereas multiplying out np.sum(agg.agg.rel_area[0]*np.array([0,1,2,3]))\n",
+    "# gives 1.499963... Possibly worth examining more closely later\n",
+    "assert np.allclose([v for v in agg.agg.test_weighted_mean.values],1.4999,rtol=1e-4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "d062db64-b30e-4a3d-9b74-f2a114e3b6e4",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "aggregating test by weighted_mean...\n",
+      "all variables aggregated to polygons!\n"
+     ]
+    }
+   ],
+   "source": [
+    "################# test_aggregate_with_weights #################\n",
+    "\n",
+    "gdf = {'name':['test'],\n",
+    "       'geometry':[Polygon([(0,0),(0,1),(1,1),(1,0),(0,0)])]}\n",
+    "gdf = gpd.GeoDataFrame(gdf,crs=\"EPSG:4326\")\n",
+    "\n",
+    "# add a simple weights grid (pixels on the equator have weight 1,\n",
+    "# pixels at 1 degree N have weight 0)\n",
+    "weights = xr.DataArray(data=np.array([[1,1],[0,0]]),\n",
+    "                       dims=['lat','lon'],\n",
+    "                       coords=[ds.lat,ds.lon])\n",
+    "\n",
+    "# calculate the pix_agg variable used in the tests below\n",
+    "pix_agg = create_raster_polygons(ds,weights=weights)\n",
+    "\n",
+    "\n",
+    "# Get pixel overlaps\n",
+    "wm = get_pixel_overlaps(gdf,pix_agg)\n",
+    "\n",
+    "# Get aggregate\n",
+    "agg = aggregate(ds,wm)\n",
+    "\n",
+    "# Since \"test\" in the input ds has [0,2] for the two\n",
+    "# equatorial pixels, the average should just be 1.0\n",
+    "assert np.allclose([v for v in agg.agg.test_weighted_mean.values],1.0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "a9e207aa-7ba2-4c62-a476-c508b7a6d005",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "aggregating test by weighted_mean...\n",
+      "all variables aggregated to polygons!\n"
+     ]
+    }
+   ],
+   "source": [
+    "################ test_aggregate_with_mismatched_grid ###############\n",
+    "ds = xr.Dataset({'test':(['lon','lat'],np.array([[30,40,50],[10,0,1],[20,2,3]])),\n",
+    "                 'lat_bnds':(['lat','bnds'],np.array([[-1.5,-0.5],[-0.5,0.5],[0.5,1.5]])),\n",
+    "                 'lon_bnds':(['lon','bnds'],np.array([[-1.5,-0.5],[-0.5,0.5],[0.5,1.5]]))},\n",
+    "                coords={'lat':(['lat'],np.array([-1,0,1])),\n",
+    "                        'lon':(['lon'],np.array([-1,0,1])),\n",
+    "                        'bnds':(['bnds'],np.array([0,1]))})\n",
+    "\n",
+    "\n",
+    "# Create polygon covering multiple pixels\n",
+    "gdf = {'name':['test'],\n",
+    "       'geometry':[Polygon([(0,0),(0,1),(1,1),(1,0),(0,0)])]}\n",
+    "gdf = gpd.GeoDataFrame(gdf,crs=\"EPSG:4326\")\n",
+    "\n",
+    "# calculate the pix_agg variable used in the tests below\n",
+    "pix_agg = create_raster_polygons(ds)\n",
+    "\n",
+    "\n",
+    "# Get pixel overlaps\n",
+    "wm = get_pixel_overlaps(gdf,pix_agg)\n",
+    "\n",
+    "# Get aggregate\n",
+    "agg = aggregate(ds,wm)\n",
+    "\n",
+    "# On the change in rtol, see the note in test_aggregate_basic\n",
+    "assert np.allclose([v for v in agg.agg.test_weighted_mean.values],1.4999,rtol=1e-4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "d1cbc317-c633-4656-a062-24cdb3692425",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "aggregating test by weighted_mean...\n",
+      "all variables aggregated to polygons!\n"
+     ]
+    }
+   ],
+   "source": [
+    "################ test_aggregate_with_all_nans ####################\n",
+    "ds = xr.Dataset({'test':(['lon','lat'],np.array([[np.nan,np.nan],[np.nan,np.nan]])),\n",
+    "                 'lat_bnds':(['lat','bnds'],np.array([[-0.5,0.5],[0.5,1.5]])),\n",
+    "                 'lon_bnds':(['lon','bnds'],np.array([[-0.5,0.5],[0.5,1.5]]))},\n",
+    "                coords={'lat':(['lat'],np.array([0,1])),\n",
+    "                        'lon':(['lon'],np.array([0,1])),\n",
+    "                        'bnds':(['bnds'],np.array([0,1]))})\n",
+    "\n",
+    "# get aggregation mapping\n",
+    "pix_agg = create_raster_polygons(ds)\n",
+    "\n",
+    "# Create polygon covering multiple pixels\n",
+    "gdf = {'name':['test'],\n",
+    "       'geometry':[Polygon([(0,0),(0,1),(1,1),(1,0),(0,0)])]}\n",
+    "gdf = gpd.GeoDataFrame(gdf,crs=\"EPSG:4326\")\n",
+    "\n",
+    "\n",
+    "# Get pixel overlaps\n",
+    "wm = get_pixel_overlaps(gdf,pix_agg)\n",
+    "\n",
+    "# Get aggregate\n",
+    "agg = aggregate(ds,wm)\n",
+    "\n",
+    "# Should only return nan\n",
+    "# (this is not a great assert - agg.agg.test[0] comes out as [array(nan)],\n",
+    "# which I'm not entirely sure how to reproduce. It quacks like a single nan,\n",
+    "# but it's unclear to me how to get it to match directly)\n",
+    "assert np.all([np.isnan(k) for k in agg.agg.test])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7f947d43-38fd-4743-b1ee-667a0f65b5b4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "test2",
+   "language": "python",
+   "name": "test2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tests/test_core.py b/tests/test_core.py
index 34fc3bc..f00b0f5 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -323,7 +323,7 @@ def test_get_pixel_overlaps_gdf_wpreexisting_index(pix_agg=pix_agg):
     # the pix_agg variable that this whole section has used. Doesn't really
     # matter, since this is testing an index error that would've
     # happened during aggregate() above.
- assert np.allclose([v for v in agg.agg.test.values],2.1666,rtol=1e-4) + assert np.allclose([v for v in agg.agg.test_weighted_mean.values],2.1666,rtol=1e-4) ##### aggregate() tests ##### @@ -347,7 +347,7 @@ def test_aggregate_basic(ds=ds): # it's actually 1.499981, whereas multiplying out # np.sum(agg.agg.rel_area[0]*np.array([0,1,2,3]))gives 1.499963... # Possibly worth examining more closely later - assert np.allclose([v for v in agg.agg.test.values],1.4999,rtol=1e-4) + assert np.allclose([v for v in agg.agg.test_weighted_mean.values],1.4999,rtol=1e-4) def test_aggregate_basic_wdotproduct(ds=ds): # Create multiple polygons, to double check, since dot product @@ -400,7 +400,7 @@ def test_aggregate_with_weights(ds=ds): # Since the "test" for the input ds has [0,2] for the two # equatorial pixels, the average should just be 1.0 - assert np.allclose([v for v in agg.agg.test.values],1.0) + assert np.allclose([v for v in agg.agg.test_weighted_mean.values],1.0) @@ -432,7 +432,7 @@ def test_aggregate_with_mismatched_grid(): agg = aggregate(ds,wm) # On change in rtol, see note in test_aggregate_basic - assert np.allclose([v for v in agg.agg.test.values],1.4999,rtol=1e-4) + assert np.allclose([v for v in agg.agg.test_weighted_mean.values],1.4999,rtol=1e-4) # Should probably test multiple polygons just to be sure... diff --git a/tests/test_export.py b/tests/test_export.py index e6d85a3..38452a5 100644 --- a/tests/test_export.py +++ b/tests/test_export.py @@ -39,7 +39,7 @@ def test_to_dataset(agg=agg): # Build reference output dataset ds_ref = xr.Dataset({'name':(['poly_idx'],np.array(['test']).astype(object)), - 'test':(['poly_idx','run'],np.array([[1.0,2.0]]))}, + 'test_weighted_mean':(['poly_idx','run'],np.array([[1.0,2.0]]))}, coords={'poly_idx':(['poly_idx'],np.array([0])), 'run':(['run'],np.array([0,1]))}) @@ -52,7 +52,7 @@ def test_to_dataframe(agg=agg): df_out = agg.to_dataframe() # Build reference output dataframe - df_ref = pd.DataFrame({'poly_idx':[0,0],'run':[0,1],'name':['test','test'],'test':[0.9999,1.9999]}) + df_ref = pd.DataFrame({'poly_idx':[0,0],'run':[0,1],'name':['test','test'],'test_weighted_mean':[0.9999,1.9999]}) df_ref = df_ref.set_index(['poly_idx','run']) # Assert equal within tolerance, again likely due to very slight diff --git a/xagg/__init__.py b/xagg/__init__.py index cfff18a..1bec6e5 100644 --- a/xagg/__init__.py +++ b/xagg/__init__.py @@ -2,6 +2,7 @@ # everything else happening behind the scenes (and the exporting # happening as methods to the classes that are exported from those # two functions) +import xagg.esmf_setup from .wrappers import pixel_overlaps from .aux import (normalize,fix_ds,get_bnds,subset_find) from .core import (aggregate,read_wm) \ No newline at end of file diff --git a/xagg/aux.py b/xagg/aux.py index be8a5c9..0213e12 100644 --- a/xagg/aux.py +++ b/xagg/aux.py @@ -333,7 +333,8 @@ def subset_find(ds0,ds1): latlons0 = list(zip(ds1['lat'].values,ds1['lon'].values)) # Find indices of the used grid for aggregation in the input grid - loc_idxs = [latlons.index(i) for i in latlons0] + latlons_dict = {latlon: index for index, latlon in enumerate(latlons)} + loc_idxs = [latlons_dict[i] for i in latlons0] if np.allclose(len(loc_idxs),len(latlons0)): print('grid adjustment successful') diff --git a/xagg/core.py b/xagg/core.py index 16cb32b..a6fd595 100644 --- a/xagg/core.py +++ b/xagg/core.py @@ -11,6 +11,9 @@ from . aux import (find_rel_area,normalize,fix_ds,get_bnds,subset_find,list_or_first) from . 
classes import (weightmap,aggregated)
 
+
+SUPPORTED_STATISTICS_METHODS = ["weighted_mean", "weighted_sum", "weighted_median", "mean", "max", "min", "sum", "median", "count", "std"]
+
 def read_wm(path):
     """ Load temporary weightmap files from wm.to_file()
@@ -423,9 +426,10 @@ def get_pixel_overlaps(gdf_in,pix_agg,impl='for_loop'):
     # corresponding to those areas, and the lat/lon coordinates of
     # those pixels
     overlap_info = ov_groups.agg(rel_area=pd.NamedAgg(column='geometry',aggfunc=lambda ds: [normalize(ds.area)]),
-                                 pix_idxs=pd.NamedAgg(column='pix_idx',aggfunc=lambda ds: [idx for idx in ds]),
-                                 lat=pd.NamedAgg(column='lat',aggfunc=lambda ds: [x for x in ds]),
-                                 lon=pd.NamedAgg(column='lon',aggfunc=lambda ds: [x for x in ds]))
+                                 poly_area=pd.NamedAgg(column='geometry',aggfunc=lambda ds: [ds.area]),
+                                 pix_idxs=pd.NamedAgg(column='pix_idx',aggfunc=lambda ds: [idx for idx in ds]),
+                                 lat=pd.NamedAgg(column='lat',aggfunc=lambda ds: [x for x in ds]),
+                                 lon=pd.NamedAgg(column='lon',aggfunc=lambda ds: [x for x in ds]))
 
     # Zip lat, lon columns into a list of (lat,lon) coordinates
     # (separate from above because as of 12/20, named aggs with
@@ -464,8 +468,34 @@
     return wm_out
 
+def check_supported_statistic(stats_list):
+    # Validate the requested statistics before any aggregation is done
+    for s in stats_list:
+        assert s in SUPPORTED_STATISTICS_METHODS, f"You selected {s}. This statistic is not available. Please choose one of the following: {SUPPORTED_STATISTICS_METHODS}."
 
-def aggregate(ds,wm,impl='for_loop',silent=False):
+
+def comp_weighted_medians(pix_in_poly, weights):
+    """ Weighted median of pixel values: interpolate the value at which
+    the cumulative weight reaches half of the total weight """
+    # for a single timestep
+    if pix_in_poly.ndim == 1:
+        df = pd.DataFrame({'pix_values': pix_in_poly, 'weights': weights})
+        df.sort_values(by='pix_values',inplace=True)
+        cumsum = df.weights.cumsum()
+        cutoff = df.weights.sum() / 2.0
+        return np.array(np.interp(cutoff, cumsum, df.pix_values.values))
+    # for multiple timesteps, calculate the weighted median separately
+    # for each timestep
+    else:
+        weighted_medians = xr.DataArray(np.zeros(pix_in_poly.shape[0]), dims='t')
+        for step in range(pix_in_poly.shape[0]):
+            df = pd.DataFrame({'pix_values': pix_in_poly[step], 'weights': weights})
+            df.sort_values(by='pix_values',inplace=True)
+            cumsum = df.weights.cumsum()
+            cutoff = df.weights.sum() / 2.0
+            weighted_medians[step] = np.interp(cutoff, cumsum, df.pix_values.values)
+        return weighted_medians.values
+
+
+def aggregate(ds,wm,impl='for_loop',stat=['weighted_mean'],skipna=True,interpolate_NaN=False,silent=False):
     """ Aggregate raster variable(s) to polygon(s)
 
     Aggregates (N-D) raster variables in `ds` to the polygons
@@ -517,6 +547,24 @@
         aggregation is calculated using a dot product, requires
         much more memory (due to broadcasting of variables) but
        may be faster in certain circumstances
+
+    stat : :class:`list` (def: ``['weighted_mean']``)
+        which aggregation statistic(s) to calculate (only used if
+        ``impl='for_loop'``); any of:
+        - ``'weighted_mean'``
+        - ``'weighted_sum'``
+        - ``'weighted_median'``
+        - ``'mean'``
+        - ``'max'``
+        - ``'min'``
+        - ``'sum'``
+        - ``'median'``
+        - ``'count'``
+        - ``'std'``
+
+    skipna : bool, default = `True`
+        if True, skip missing values (as marked by NaN). Only
+        skips missing values for float dtypes.
+
+    interpolate_NaN : bool, default = `False`
+        if True, NaN values in the input data are filled by
+        interpolation (via ``interpolate_na()``) before aggregating
 
     silent : bool, default = `False`
         if True, then no status updates are printed to std out
@@ -527,6 +575,11 @@
     agg_out
         an :class:`xagg.classes.aggregated` object with the aggregated variables
     """
+
+    # Check that the requested stats are supported
+    check_supported_statistic(stat)
+
+
     # Make sure pixel_overlaps was correctly run if using dot product
     if (impl=='dot_product') and (wm.overlap_da is None):
         raise ValueError("no 'overlap_da' was found in the `wm` input - since you're using the dot product implementation, "+
@@ -534,16 +587,20 @@
     # Turn into dataset if dataarray
     if type(ds)==xr.core.dataarray.DataArray:
-        if ds.name is None:
-            warnings.warn('An unnamed xr.DataArray was inputted instead of a xr.Dataset; the output variable will be "var"')
-            ds = ds.to_dataset(name='var')
-        else:
-            ds = ds.to_dataset()
-
+        if ds.name is None:
+            warnings.warn('An unnamed xr.DataArray was input instead of an xr.Dataset; the output variable will be "var"')
+            ds = ds.to_dataset(name='var')
+        else:
+            ds = ds.to_dataset()
+
     # Run ds through fix_ds (to fix lat/lon names, lon coords)
     ds = fix_ds(ds)
 
+    # interpolate NaNs (note that xarray's interpolate_na() requires an
+    # explicit dimension to interpolate along; here gaps are filled along
+    # lon, then lat)
+    if interpolate_NaN:
+        ds = ds.interpolate_na(dim='lon').interpolate_na(dim='lat')
+
     # Stack
     ds = ds.stack(loc=('lat','lon'))
 
@@ -621,10 +678,13 @@
             # bound variable
             if ('bnds' not in ds[var].dims) & ('loc' in ds[var].dims):
                 if not silent:
-                    print('aggregating '+var+'...')
-                # Create the column for the relevant variable
-                wm.agg[var] = None
-
+                    all_stat = ', '.join(stat)
+                    print('aggregating '+var+' by '+all_stat+'...')
+
+                # Create a column for each requested statistic
+                for i in stat:
+                    var_stat = var+'_'+i
+                    wm.agg[var_stat] = None
+
                 # Get weighted average of variable based on pixel overlap + other weights
                 for poly_idx in wm.agg.poly_idx:
                     # Get average value of variable over the polygon; weighted by
@@ -659,17 +719,85 @@
                     # Calculate the normalized area+weight of each pixel (taking into account
                     # nans)
                     normed_areaweights = normalize(tmp_areas*weights[wm.agg.iloc[poly_idx,:].pix_idxs],drop_na=True)
-
-                    # Take the weighted average of all the pixel values to calculate the
-                    # aggregated value for the shapefile
-                    wm.agg.loc[poly_idx,var] = [[((ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)*
-                                                   normed_areaweights).
-                                                  sum('loc')).values]]
+
+                    # Values of all the pixels in the current polygon
+                    pix_in_poly = ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)
+                    # Overlap area of each pixel with the polygon
+                    pixel_area = np.atleast_1d(np.squeeze(wm.agg.iloc[poly_idx,:].poly_area))
+
+                    # Apply each requested aggregation statistic to the pixel values
+                    for i in stat:
+                        var_stat = var+'_'+i
+                        if i == "weighted_mean":
+                            # weighted average of all the pixel values
+                            wm.agg.loc[poly_idx,var_stat] = [[(pix_in_poly*normed_areaweights).sum('loc', skipna=skipna).values]]
+                        elif i == "mean":
+                            wm.agg.loc[poly_idx,var_stat] = [[pix_in_poly.mean('loc', skipna=skipna).values]]
+                        elif i == "max":
+                            wm.agg.loc[poly_idx,var_stat] = [[pix_in_poly.max('loc', skipna=skipna).values]]
+                        elif i == "min":
+                            wm.agg.loc[poly_idx,var_stat] = [[pix_in_poly.min('loc', skipna=skipna).values]]
+                        elif i == "weighted_sum":
+                            # sum of the pixel values, weighted by the supplied weights and
+                            # by each pixel's overlap area relative to the largest pixel
+                            wm.agg.loc[poly_idx,var_stat] = [[(pix_in_poly*weights[wm.agg.iloc[poly_idx,:].pix_idxs]*(pixel_area/pixel_area.max())).sum('loc', skipna=skipna).values]]
+                        elif i == "sum":
+                            wm.agg.loc[poly_idx,var_stat] = [[pix_in_poly.sum('loc', skipna=skipna).values]]
+                        elif i == "median":
+                            # unweighted median of the pixel values
+                            wm.agg.loc[poly_idx,var_stat] = [[pix_in_poly.median('loc', skipna=skipna).values]]
+                        elif i == "weighted_median":
+                            wm.agg.loc[poly_idx,var_stat] = [[comp_weighted_medians(pix_in_poly, normed_areaweights)]]
+                        elif i == "count":
+                            # total overlap area divided by the largest pixel area
+                            # approximates the number of pixels covering the polygon
+                            wm.agg.loc[poly_idx,var_stat] = [[pixel_area.sum()/pixel_area.max()]]
+                        elif i == "std":
+                            wm.agg.loc[poly_idx,var_stat] = [[(pix_in_poly*weights[wm.agg.iloc[poly_idx,:].pix_idxs]).std('loc', skipna=skipna).values]]
                 else:
+                    wm.agg[var] = None
                     wm.agg.loc[poly_idx,var] = [[(ds[var].isel(loc=0)*np.nan).values]]
 
         else:
+            wm.agg[var] = None
             wm.agg.loc[poly_idx,var] = [[(ds[var].isel(loc=0)*np.nan).values]]
 
     # Put in class format
@@ -680,4 +808,3 @@
     if not silent:
         print('all variables aggregated to polygons!')
     return agg_out
-
diff --git a/xagg/esmf_setup.py b/xagg/esmf_setup.py
new file mode 100644
index 0000000..d277d52
--- /dev/null
+++ b/xagg/esmf_setup.py
@@ -0,0 +1,28 @@
+import os
+import sys
+
+def get_env_info():
+    # Return the name and root path of the active Python environment
+    env_path = os.path.dirname(sys.executable)
+    env_name = os.path.basename(env_path)
+    return env_name, env_path
+
+
+def set_esmf_mk_path():
+    # Point ESMFMKFILE at the esmf.mk of the active conda environment
+    env_name, env_path = get_env_info()
+    if env_path:
+        if env_path.endswith('/bin'):
+            env_path = env_path[:-4]
+        esmf_mk_path = os.path.join(env_path, 'lib', 'esmf.mk')
+
+        if os.path.isfile(esmf_mk_path):
+            os.environ['ESMFMKFILE'] = esmf_mk_path
+            print(f"Found esmf.mk at: {esmf_mk_path}")
+        else:
+            print('esmf.mk not found in the specified conda environment.')
+    else:
+        print('Unable to determine the conda environment.')
+
+set_esmf_mk_path()
+
+
+
diff --git a/xagg/export.py b/xagg/export.py
index d09e689..f60ecce 100644
--- a/xagg/export.py
+++ b/xagg/export.py
@@ -6,9 +6,12 @@ import re
 import warnings
 import shutil
 
 from . aux import (normalize,fix_ds,get_bnds,subset_find)
 
+SUPPORTED_STATISTICS_METHODS = ["weighted_mean", "weighted_sum", "weighted_median", "mean", "max", "min", "sum", "median", "count", "std"]
+
+def strip_stat_suffix(var):
+    # Recover the name of the original variable in the input dataset by
+    # stripping a trailing '_<statistic>' suffix, if one is present
+    # (the 'weighted_*' names must be checked before their plain
+    # counterparts, since e.g. '_weighted_mean' also ends in '_mean')
+    for s in SUPPORTED_STATISTICS_METHODS:
+        if var.endswith('_' + s):
+            return var[:-len(s) - 1]
+    return var
+
 def export_weightmap(wm_obj,fn,overwrite=False):
     """ Save a copy of the weightmap, to avoid recalculating it
@@ -94,22 +97,28 @@ def prep_for_nc(agg_obj,loc_dim='poly_idx'):
     # Create xarray dataset with the aggregation polygons (poly_idx)
     # there.
     ds_out = xr.Dataset(coords={'poly_idx':(['poly_idx'],agg_obj.agg.poly_idx.values)})
-    
+
     # Add other polygon attributes
-    for var in [c for c in agg_obj.agg.columns if c not in ['poly_idx','rel_area','pix_idxs','coords']]:
-        if var not in agg_obj.ds_in.var():
+    for var in [c for c in agg_obj.agg.columns if c not in ['poly_idx','rel_area','poly_area','pix_idxs','coords']]:
+        original_var = strip_stat_suffix(var)
+        if original_var not in agg_obj.ds_in.var():
+            # For auxiliary variables (from the shapefile), just copy them
+            # wholesale into the dataset
             ds_out[var] = xr.DataArray(data=agg_obj.agg[var],coords=[agg_obj.agg.poly_idx],dims=['poly_idx'])
         else:
             # For data variables (from the input grid), create empty array
             ds_out[var] = xr.DataArray(data=np.zeros((len(agg_obj.agg),
-                                                      *[agg_obj.ds_in[var].sizes[k] for k in agg_obj.ds_in[var].sizes.keys() if k not in ['lat','lon','loc']]))*np.nan,
-                                       dims=['poly_idx',*[k for k in agg_obj.ds_in[var].sizes.keys() if k not in ['lat','lon','loc']]],
-                                       coords=[[k for k in agg_obj.agg.poly_idx],*[agg_obj.ds_in[var][k].values for k in agg_obj.ds_in[var].sizes.keys() if k not in ['lat','lon','loc']]])
-
+                                                      *[agg_obj.ds_in[original_var].sizes[k] for k in agg_obj.ds_in[original_var].sizes.keys() if k not in ['lat','lon','loc']]))*np.nan,
+                                       dims=['poly_idx',*[k for k in agg_obj.ds_in[original_var].sizes.keys() if k not in ['lat','lon','loc']]],
+                                       coords=[[k for k in agg_obj.agg.poly_idx],*[agg_obj.ds_in[original_var][k].values for k in agg_obj.ds_in[original_var].sizes.keys() if k not in ['lat','lon','loc']]])
+
             # Now insert aggregated values
             for poly_idx in agg_obj.agg.poly_idx:
                 ds_out[var].loc[{'poly_idx':poly_idx}] = np.squeeze(agg_obj.agg.loc[poly_idx,var])
+
 
     # Add non-geographic coordinates for the variables to be aggregated
     for crd in [k for k in agg_obj.ds_in.sizes.keys() if (k not in ['lat','lon','loc','bnds'])]:
@@ -163,34 +172,39 @@
     """
 
     # For output into csv, work with existing geopandas data frame
-    csv_out = agg_obj.agg.drop(columns=['rel_area','pix_idxs','coords','poly_idx'])
+    csv_out = agg_obj.agg.drop(columns=['rel_area','poly_area','pix_idxs','coords','poly_idx'])
 
     # Now expand the aggregated variable into multiple columns
-    for var in [c for c in agg_obj.agg.columns if ((c not in ['poly_idx','rel_area','pix_idxs','coords']) & (c in agg_obj.ds_in.var()))]:
-        # NOT YET IMPLEMENTED: dynamic column naming - so if it recognizes
-        # it as a date, then instead of doing var0, var1, var2,... it does
-        # varYYYYMMDD etc.
-        # These are the coordinates of the variable in the original raster
-        dimsteps = [agg_obj.ds_in[var][d].values for d in agg_obj.ds_in[var].sizes.keys() if d not in ['lat','lon','loc']]
-        # ALSO SHOULD check to see if the variables are multi-D - if they are
-        # there are two options:
-        #    - a multi-index column title (var0-0, var0-1)
-        #    - or an error saying csv output is not supported for this
-
-        # Reshape the variable wide and name the columns [var]0, [var]1,...
-        if len(dimsteps) == 0:
-            # (in this case, all it does is move from one list per row to
-            # one value per row)
-            expanded_var = (pd.DataFrame(pd.DataFrame(csv_out[var].to_list())[0].to_list(),
-                                         columns=[var]))
-        else:
-            expanded_var = (pd.DataFrame(pd.DataFrame(csv_out[var].to_list())[0].to_list(),
-                                         columns=[var+str(idx) for idx in np.arange(0,len(csv_out[var][0][0]))]))
-        # Append to existing series
-        csv_out = pd.concat([csv_out.drop(columns=(var)),
-                             expanded_var],
-                            axis=1)
-        del expanded_var
+    for var in [c for c in agg_obj.agg.columns if (c not in ['poly_idx','rel_area','poly_area','pix_idxs','coords'])]:
+        original_var = strip_stat_suffix(var)
+        if original_var in agg_obj.ds_in.var():
+            # NOT YET IMPLEMENTED: dynamic column naming - so if it recognizes
+            # it as a date, then instead of doing var0, var1, var2,... it does
+            # varYYYYMMDD etc.
+            # These are the coordinates of the variable in the original raster
+            dimsteps = [agg_obj.ds_in[original_var][d].values for d in agg_obj.ds_in[original_var].sizes.keys() if d not in ['lat','lon','loc']]
+            # ALSO SHOULD check to see if the variables are multi-D - if they are
+            # there are two options:
+            #    - a multi-index column title (var0-0, var0-1)
+            #    - or an error saying csv output is not supported for this
+
+            # Reshape the variable wide and name the columns [var]0, [var]1,...
+            if len(dimsteps) == 0:
+                # (in this case, all it does is move from one list per row to
+                # one value per row)
+                expanded_var = (pd.DataFrame(pd.DataFrame(csv_out[var].to_list())[0].to_list(),
+                                             columns=[var]))
+            else:
+                expanded_var = (pd.DataFrame(pd.DataFrame(csv_out[var].to_list())[0].to_list(),
+                                             columns=[var+str(idx) for idx in np.arange(0,len(csv_out[var][0][0]))]))
+            # Append to existing series
+            csv_out = pd.concat([csv_out.drop(columns=(var)),
+                                 expanded_var],
+                                axis=1)
+            del expanded_var
 
     if add_geom:
         # Return the geometry from the original geopandas.GeoDataFrame
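A minimal usage sketch of the new `stat` argument (not part of the patch itself; it reuses the 2x2 test grid and polygon from the notebook above):

```python
import numpy as np
import xarray as xr
import geopandas as gpd
from shapely.geometry import Polygon

from xagg.core import aggregate
from xagg.wrappers import pixel_overlaps

# The 2x2 test grid from the notebook, with explicit lat/lon bounds
ds = xr.Dataset({'test': (['lon', 'lat'], np.array([[0, 1], [2, 3]])),
                 'lat_bnds': (['lat', 'bnds'], np.array([[-0.5, 0.5], [0.5, 1.5]])),
                 'lon_bnds': (['lon', 'bnds'], np.array([[-0.5, 0.5], [0.5, 1.5]]))},
                coords={'lat': (['lat'], np.array([0, 1])),
                        'lon': (['lon'], np.array([0, 1])),
                        'bnds': (['bnds'], np.array([0, 1]))})

# One polygon covering all four pixels
gdf = gpd.GeoDataFrame({'name': ['test'],
                        'geometry': [Polygon([(0, 0), (0, 1), (1, 1), (1, 0), (0, 0)])]},
                       crs="EPSG:4326")

wm = pixel_overlaps(ds, gdf)

# Each requested statistic becomes its own '<var>_<stat>' column in the
# output, e.g. 'test_weighted_mean', 'test_max', 'test_count'
agg = aggregate(ds, wm, stat=['weighted_mean', 'max', 'count'])
print(agg.to_dataframe())
```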
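And, for reference, a self-contained sketch of the interpolation scheme that `comp_weighted_medians` applies per timestep (plain numpy/pandas; the input values here are made up):

```python
import numpy as np
import pandas as pd

def weighted_median(values, weights):
    # Sort the values, accumulate the weights, and interpolate the value
    # at which the cumulative weight reaches half of the total weight --
    # the same scheme comp_weighted_medians uses for each timestep.
    df = pd.DataFrame({'pix_values': values, 'weights': weights})
    df = df.sort_values(by='pix_values')
    cumsum = df.weights.cumsum()
    cutoff = df.weights.sum() / 2.0
    return np.interp(cutoff, cumsum, df.pix_values.values)

# Four pixel values with equal overlap weights
print(weighted_median(np.array([0.0, 1.0, 2.0, 3.0]),
                      np.array([0.25, 0.25, 0.25, 0.25])))  # -> 1.0
```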