Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
471 changes: 471 additions & 0 deletions Run_the_test.ipynb

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ def test_get_pixel_overlaps_gdf_wpreexisting_index(pix_agg=pix_agg):
# the pix_agg variable that this whole section has used. Doesn't really
# matter, since this is testing an index error that would've
# happened during aggregate() above.
assert np.allclose([v for v in agg.agg.test.values],2.1666,rtol=1e-4)
assert np.allclose([v for v in agg.agg.test_weighted_mean.values],2.1666,rtol=1e-4)


##### aggregate() tests #####
Expand All @@ -347,7 +347,7 @@ def test_aggregate_basic(ds=ds):
# it's actually 1.499981, whereas multiplying out
# np.sum(agg.agg.rel_area[0]*np.array([0,1,2,3])) gives 1.499963...
# Possibly worth examining more closely later
assert np.allclose([v for v in agg.agg.test.values],1.4999,rtol=1e-4)
assert np.allclose([v for v in agg.agg.test_weighted_mean.values],1.4999,rtol=1e-4)

def test_aggregate_basic_wdotproduct(ds=ds):
# Create multiple polygons, to double check, since dot product
Expand Down Expand Up @@ -400,7 +400,7 @@ def test_aggregate_with_weights(ds=ds):

# Since the "test" for the input ds has [0,2] for the two
# equatorial pixels, the average should just be 1.0
assert np.allclose([v for v in agg.agg.test.values],1.0)
assert np.allclose([v for v in agg.agg.test_weighted_mean.values],1.0)



Expand Down Expand Up @@ -432,7 +432,7 @@ def test_aggregate_with_mismatched_grid():
agg = aggregate(ds,wm)

# On change in rtol, see note in test_aggregate_basic
assert np.allclose([v for v in agg.agg.test.values],1.4999,rtol=1e-4)
assert np.allclose([v for v in agg.agg.test_weighted_mean.values],1.4999,rtol=1e-4)


# Should probably test multiple polygons just to be sure...
Expand Down
4 changes: 2 additions & 2 deletions tests/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def test_to_dataset(agg=agg):

# Build reference output dataset
ds_ref = xr.Dataset({'name':(['poly_idx'],np.array(['test']).astype(object)),
'test':(['poly_idx','run'],np.array([[1.0,2.0]]))},
'test_weighted_mean':(['poly_idx','run'],np.array([[1.0,2.0]]))},
coords={'poly_idx':(['poly_idx'],np.array([0])),
'run':(['run'],np.array([0,1]))})

Expand All @@ -52,7 +52,7 @@ def test_to_dataframe(agg=agg):
df_out = agg.to_dataframe()

# Build reference output dataframe
df_ref = pd.DataFrame({'poly_idx':[0,0],'run':[0,1],'name':['test','test'],'test':[0.9999,1.9999]})
df_ref = pd.DataFrame({'poly_idx':[0,0],'run':[0,1],'name':['test','test'],'test_weighted_mean':[0.9999,1.9999]})
df_ref = df_ref.set_index(['poly_idx','run'])

# Assert equal within tolerance, again likely due to very slight
Expand Down
1 change: 1 addition & 0 deletions xagg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# everything else happening behind the scenes (and the exporting
# happening as methods to the classes that are exported from those
# two functions)
import xagg.esmf_setup
from .wrappers import pixel_overlaps
from .aux import (normalize,fix_ds,get_bnds,subset_find)
from .core import (aggregate,read_wm)
3 changes: 2 additions & 1 deletion xagg/aux.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,8 @@ def subset_find(ds0,ds1):
latlons0 = list(zip(ds1['lat'].values,ds1['lon'].values))

# Find indices of the used grid for aggregation in the input grid
loc_idxs = [latlons.index(i) for i in latlons0]
latlons_dict = {latlon: index for index, latlon in enumerate(latlons)}
loc_idxs = [latlons_dict[i] for i in latlons0]

if np.allclose(len(loc_idxs),len(latlons0)):
print('grid adjustment successful')
Expand Down
169 changes: 148 additions & 21 deletions xagg/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
from . aux import (find_rel_area,normalize,fix_ds,get_bnds,subset_find,list_or_first)
from . classes import (weightmap,aggregated)


SUPPORTED_STATISTICS_METHODS = ["weighted_mean", "weighted_sum","weighted_median","sum", "mean", "max", "min", "median", "count", "std"]

def read_wm(path):
""" Load temporary weightmap files from wm.to_file()

Expand Down Expand Up @@ -423,9 +426,10 @@ def get_pixel_overlaps(gdf_in,pix_agg,impl='for_loop'):
# corresponding to those areas, and the lat/lon coordinates of
# those pixels
overlap_info = ov_groups.agg(rel_area=pd.NamedAgg(column='geometry',aggfunc=lambda ds: [normalize(ds.area)]),
pix_idxs=pd.NamedAgg(column='pix_idx',aggfunc=lambda ds: [idx for idx in ds]),
lat=pd.NamedAgg(column='lat',aggfunc=lambda ds: [x for x in ds]),
lon=pd.NamedAgg(column='lon',aggfunc=lambda ds: [x for x in ds]))
poly_area=pd.NamedAgg(column='geometry',aggfunc=lambda ds: [ds.area]),
pix_idxs=pd.NamedAgg(column='pix_idx',aggfunc=lambda ds: [idx for idx in ds]),
lat=pd.NamedAgg(column='lat',aggfunc=lambda ds: [x for x in ds]),
lon=pd.NamedAgg(column='lon',aggfunc=lambda ds: [x for x in ds]))

# Zip lat, lon columns into a list of (lat,lon) coordinates
# (separate from above because as of 12/20, named aggs with
Expand Down Expand Up @@ -464,8 +468,34 @@ def get_pixel_overlaps(gdf_in,pix_agg,impl='for_loop'):

return wm_out

def check_supported_statistic(stats_list):
    """Validate that every requested aggregation statistic is supported.

    Parameters
    ----------
    stats_list : list
        list of statistic names, e.g. ``['weighted_mean', 'max']``

    Raises
    ------
    ValueError
        if any entry is not in ``SUPPORTED_STATISTICS_METHODS``.
        (A ``ValueError`` is raised instead of using ``assert``, since
        asserts are silently stripped when Python runs with ``-O``.)
    """
    for s in stats_list:
        if s not in SUPPORTED_STATISTICS_METHODS:
            raise ValueError(f"You selected {s}. This statistic is not available. "
                             f"Please choose one of the following: {SUPPORTED_STATISTICS_METHODS}.")

def aggregate(ds,wm,impl='for_loop',silent=False):

def comp_weighted_medians(pix_in_poly, weights):
    """Compute the weighted median of pixel values over a polygon.

    The weighted median is found by sorting the pixel values, taking the
    cumulative sum of their weights, and interpolating the value at which
    the cumulative weight reaches half the total weight.

    Parameters
    ----------
    pix_in_poly : array-like
        pixel values; either 1-D (single timestep, shape ``(npix,)``) or
        2-D with time assumed to be the *first* axis (shape ``(nt, npix)``)
    weights : array-like
        per-pixel weights, shape ``(npix,)``

    Returns
    -------
    :class:`numpy.ndarray`
        0-d array (single timestep) or 1-D array of length ``nt``
        (multiple timesteps) of weighted medians
    """
    def _weighted_median_1d(values, wts):
        # Sort values (carrying weights along), then interpolate the value
        # at which the cumulative weight crosses half the total weight
        df = pd.DataFrame({'pix_values': values, 'weights': wts})
        df.sort_values(by='pix_values', inplace=True)
        cumsum = df.weights.cumsum()
        cutoff = df.weights.sum() / 2.0
        return np.interp(cutoff, cumsum, df.pix_values.values)

    pix_in_poly = np.asarray(pix_in_poly)

    # for single timestep
    if pix_in_poly.ndim == 1:
        return np.array(_weighted_median_1d(pix_in_poly, weights))
    # for multiple timesteps (no xr.DataArray buffer needed; the original
    # allocated one even in the 1-D branch where it was unused)
    else:
        return np.array([_weighted_median_1d(pix_in_poly[step], weights)
                         for step in range(pix_in_poly.shape[0])])


def aggregate(ds,wm,impl='for_loop',stat=['weighted_mean'],skipna=True,interpolate_NaN=False,silent=False):
""" Aggregate raster variable(s) to polygon(s)

Aggregates (N-D) raster variables in `ds` to the polygons
Expand Down Expand Up @@ -517,6 +547,24 @@ def aggregate(ds,wm,impl='for_loop',silent=False):
aggregation is calculated using a dot product,
requires much more memory (due to broadcasting of
variables) but may be faster in certain circumstances

stat : :class:`list` (def: ``['weighted_mean']``)
which aggregation statistic(s) to calculate (only
supported with ``impl='for_loop'``); any of:
- ``'weighted_mean'``
- ``'weighted_sum'``
- ``'weighted_median'``
- ``'mean'``
- ``'max'``
- ``'min'``
- ``'sum'``
- ``'median'``
- ``'count'``
- ``'std'``

skipna : bool, default = `True`
if True, skip missing values (as marked by NaN). Only
skips missing values for float dtypes.

interpolate_NaN : bool, default = `False`
if True, NaNs in the input data are filled by
interpolation (via ``ds.interpolate_na()``) before aggregating


silent : bool, default = `False`
if True, then no status updates are printed to std out
Expand All @@ -527,23 +575,32 @@ def aggregate(ds,wm,impl='for_loop',silent=False):
an :class:`xagg.classes.aggregated` object with the aggregated variables

"""

# Check the supported stats
check_supported_statistic(stat)


# Make sure pixel_overlaps was correctly run if using dot product
if (impl=='dot_product') and (wm.overlap_da is None):
raise ValueError("no 'overlap_da' was found in the `wm` input - since you're using the dot product implementation, "+
"make sure to run `pixel_overlaps()` with `impl='dot_product'` to avoid this error.")

# Turn into dataset if dataarray
if type(ds)==xr.core.dataarray.DataArray:
if ds.name is None:
warnings.warn('An unnamed xr.DataArray was inputted instead of a xr.Dataset; the output variable will be "var"')
ds = ds.to_dataset(name='var')
else:
ds = ds.to_dataset()

if ds.name is None:
warnings.warn('An unnamed xr.DataArray was inputted instead of a xr.Dataset; the output variable will be "var"')
ds = ds.to_dataset(name='var')
else:
ds = ds.to_dataset()


# Run ds through fix_ds (to fix lat/lon names, lon coords)
ds = fix_ds(ds)

# interpolate NaNs
if interpolate_NaN == True:
ds = ds.interpolate_na()

# Stack
ds = ds.stack(loc=('lat','lon'))

Expand Down Expand Up @@ -621,10 +678,13 @@ def aggregate(ds,wm,impl='for_loop',silent=False):
# bound variable
if ('bnds' not in ds[var].dims) & ('loc' in ds[var].dims):
if not silent:
print('aggregating '+var+'...')
# Create the column for the relevant variable
wm.agg[var] = None

all_stat = ', '.join(stat)
print('aggregating '+var+' by '+all_stat+ '...')

for i in stat:
var_stat = var+'_'+ i
wm.agg[var_stat] = None

# Get weighted average of variable based on pixel overlap + other weights
for poly_idx in wm.agg.poly_idx:
# Get average value of variable over the polygon; weighted by
Expand Down Expand Up @@ -659,17 +719,85 @@ def aggregate(ds,wm,impl='for_loop',silent=False):
# Calculate the normalized area+weight of each pixel (taking into account
# nans)
normed_areaweights = normalize(tmp_areas*weights[wm.agg.iloc[poly_idx,:].pix_idxs],drop_na=True)

# Take the weighted average of all the pixel values to calculate the
# aggregated value for the shapefile
wm.agg.loc[poly_idx,var] = [[((ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)*
normed_areaweights).
sum('loc')).values]]
# weights for Sum
areaweights = tmp_areas*weights[wm.agg.iloc[poly_idx,:].pix_idxs]


# Take the values of all pixel values in the current polygon
pix_in_poly = ds[var].isel(loc=wm.agg.iloc[poly_idx,:].pix_idxs)
pixel_area = np.atleast_1d(np.squeeze(wm.agg.iloc[poly_idx,:].poly_area))

# Apply aggregation statistic method to the weighted pixel values

for i in stat:
if i == "weighted_mean": # weighted average of all the pixel values
stat_i = "weighted_mean"
var_stat = var+'_'+stat_i
wm.agg.loc[poly_idx,var_stat] = [[(pix_in_poly*normed_areaweights).sum('loc', skipna=skipna).values]]
continue
elif i == "mean":
stat_i = "mean"
var_stat = var+'_'+stat_i
wm.agg.loc[poly_idx,var_stat] = [[pix_in_poly.mean('loc', skipna=skipna).values]]
continue
elif i == "max":
stat_i = "max"
var_stat = var+'_'+stat_i
wm.agg.loc[poly_idx,var_stat] = [[pix_in_poly.max('loc', skipna=skipna).values]]
continue
elif i == "min":
stat_i = "min"
var_stat = var+'_'+stat_i
wm.agg.loc[poly_idx,var_stat] = [[pix_in_poly.min('loc', skipna=skipna).values]]
continue
elif i == "weighted_sum":
stat_i = "weighted_sum"
var_stat = var+'_'+stat_i
wm.agg.loc[poly_idx,var_stat] = [[(pix_in_poly*weights[wm.agg.iloc[poly_idx,:].pix_idxs]*(pixel_area/pixel_area.max())).sum('loc', skipna=skipna).values]]

continue

elif i == "sum":
stat_i = "sum"
var_stat = var+'_'+stat_i
wm.agg.loc[poly_idx,var_stat] = [[pix_in_poly.sum('loc', skipna=skipna).values]]
continue


elif i == "median":
stat_i = "median"
var_stat = var+'_'+stat_i
df = pd.DataFrame({'pix_values': pix_in_poly, 'weights': areaweights})
df.sort_values(by='pix_values',inplace=True)
cumsum = df.weights.cumsum()
cutoff = df.weights.sum() / 2.0
wm.agg.loc[poly_idx,var_stat] = np.interp(cutoff, cumsum, df.pix_values.values)
continue


elif i == "weighted_median":
stat_i = "weighted_median"
var_stat = var+'_'+stat_i
wm.agg.loc[poly_idx,var_stat] = [[comp_weighted_medians(pix_in_poly, normed_areaweights)]]
continue
elif i == "count": # polygon area divided by pixel size gives exact count
stat_i = "count"
var_stat = var+'_'+stat_i

wm.agg.loc[poly_idx,var_stat] = [[pixel_area.sum()/pixel_area.max()]]
continue
elif "std" in stat:
stat_i = "std"
var_stat = var+'_'+stat_i
wm.agg[var_stat] = None
wm.agg.loc[poly_idx,var_stat] = [[(pix_in_poly*weights[wm.agg.iloc[poly_idx,:].pix_idxs]).std('loc', skipna=skipna)]]

else:
wm.agg[var] = None
wm.agg.loc[poly_idx,var] = [[(ds[var].isel(loc=0)*np.nan).values]]

else:
wm.agg[var] = None
wm.agg.loc[poly_idx,var] = [[(ds[var].isel(loc=0)*np.nan).values]]

# Put in class format
Expand All @@ -680,4 +808,3 @@ def aggregate(ds,wm,impl='for_loop',silent=False):
if not silent:
print('all variables aggregated to polygons!')
return agg_out

28 changes: 28 additions & 0 deletions xagg/esmf_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import os
import sys

def get_env_info():
    """Return the name and path of the directory containing the running
    Python interpreter (typically the active environment's ``bin`` dir).

    Returns
    -------
    tuple of (str, str)
        ``(env_name, env_path)`` where ``env_path`` is the directory of
        ``sys.executable`` and ``env_name`` is its basename
    """
    interpreter_dir = os.path.dirname(sys.executable)
    return os.path.basename(interpreter_dir), interpreter_dir



def set_esmf_mk_path():
    """Locate ``esmf.mk`` in the current Python environment and point the
    ``ESMFMKFILE`` environment variable at it if found.

    Looks for ``<env_root>/lib/esmf.mk``, where the environment root is
    derived from the interpreter's directory. Prints status messages and
    never raises if the file cannot be found.
    """
    env_name, env_path = get_env_info()
    if env_path:
        # env_path is usually <env_root>/bin; step up to the environment
        # root. Compare the path component instead of the original
        # `endswith('/bin')`, which fails on Windows-style separators.
        if os.path.basename(env_path) == 'bin':
            env_path = os.path.dirname(env_path)
        esmf_mk_path = os.path.join(env_path, 'lib', 'esmf.mk')

        if os.path.isfile(esmf_mk_path):
            os.environ['ESMFMKFILE'] = esmf_mk_path
            print(f"Found esmf.mk at: {esmf_mk_path}")
        else:
            print('esmf.mk not found in the specified conda environment.')
    else:
        print('Unable to determine the Anaconda environment.')

set_esmf_mk_path()



Loading