From 981d676ef235313630cadd6985e26de668b4caf1 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Mon, 28 Oct 2024 14:34:04 +0100 Subject: [PATCH 01/96] First attempt at adding derived forcings --- mllam_data_prep/config.py | 1 + mllam_data_prep/create_dataset.py | 66 ++++++++++++++++++++- mllam_data_prep/ops/derive_variables.py | 79 +++++++++++++++++++++++++ 3 files changed, 145 insertions(+), 1 deletion(-) create mode 100644 mllam_data_prep/ops/derive_variables.py diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 6112a0c..2cd439f 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -171,6 +171,7 @@ class InputDataset: dim_mapping: Dict[str, DimMapping] target_output_variable: str attributes: Dict[str, Any] = None + derive_variables: bool = False @dataclass diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index ad14704..69249d5 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -10,6 +10,10 @@ from . 
import __version__ from .config import Config, InvalidConfigException +from .ops.derive_variables import ( + derive_toa_radiation, + get_variables_for_deriving_toa_radiation, +) from .ops.loading import load_and_subset_dataset from .ops.mapping import map_dims_and_variables from .ops.selection import select_by_kwargs @@ -116,9 +120,20 @@ def create_dataset(config: Config): output_dims = output_config.variables[target_output_var] + # Check if the variables should be derived/calculated + derive_input_variables = input_config.derive_variables or False + + if derive_input_variables: + logger.info( + f"Get variables needed to derive additional/external forcings: {variables}" + ) + variables_to_extract = get_variables_for_forcing_derivation(variables) + else: + variables_to_extract = variables + logger.info(f"Loading dataset {dataset_name} from {path}") try: - ds = load_and_subset_dataset(fp=path, variables=variables) + ds = load_and_subset_dataset(fp=path, variables=variables_to_extract) except Exception as ex: raise Exception(f"Error loading dataset {dataset_name} from {path}") from ex _check_dataset_attributes( @@ -127,6 +142,9 @@ def create_dataset(config: Config): dataset_name=dataset_name, ) + if derive_input_variables: + ds = derive_forcings(ds, variables, variables_to_extract) + dim_mapping = input_config.dim_mapping # check that there is an entry for each arch dimension @@ -266,3 +284,49 @@ def create_dataset_zarr(fp_config, fp_zarr: str = None): logger.info(f"Wrote training-ready dataset to {fp_zarr}") logger.info(ds) + + +def get_variables_for_forcing_derivation(variables): + """ + Extract the variables needed for deriving the external/additional forcings + """ + if isinstance(variables, dict): + raise Exception("Not implemented yet") + elif isinstance(variables, list): + variables_to_extract = set() + for var in variables: + if var == "toa_radiation": + vars = get_variables_for_deriving_toa_radiation() + else: + raise Exception(f"Function for deriving {var} 
is not implemented yet!") + + # Add variable names to set (only adds unique variables) + variables_to_extract.update(vars) + + # Turn the set into a list + variables_to_extract = list(variables_to_extract) + + return variables_to_extract + + +def derive_forcings(ds, variables, variables_to_extract): + """ + Derive the specified forcings + """ + if isinstance(variables, dict): + raise Exception("Not implemented yet") + elif isinstance(variables, list): + for var in variables: + if var == "toa_radiation": + ds = derive_toa_radiation(ds) + else: + raise Exception(f"Function for deriving {var} is not implemented yet!") + + # Drop all the unneeded variables that have only been used to derive the + # forcing variables. Need to keep any variables that are also coordinates! + variables_to_drop = [ + var for var in variables_to_extract if var not in list(ds._coord_names) + ] + ds = ds.drop_vars(variables_to_drop, errors="ignore") + + return ds diff --git a/mllam_data_prep/ops/derive_variables.py b/mllam_data_prep/ops/derive_variables.py new file mode 100644 index 0000000..9ad3be9 --- /dev/null +++ b/mllam_data_prep/ops/derive_variables.py @@ -0,0 +1,79 @@ +import dask.array as da +import numpy as np +import xarray as xr + + +def derive_toa_radiation(ds): + """ + Derive approximate TOA radiation (instantaneous values [W*m**-2]) + + Parameters + ---------- + ds : xr.Dataset + The dataset with variables needed to derive TOA radiation + + Returns + ------- + ds: xr.Dataset + The dataset with TOA radiation added + """ + + # Need to construct a new dataset with chunks since + # lat and lon are coordinates and are therefore eagerly loaded + ds_dict = {} + ds_dict["lat"] = (list(ds.lat.dims), da.from_array(ds.lat.values, chunks=(-1, -1))) + ds_dict["lon"] = (list(ds.lon.dims), da.from_array(ds.lon.values, chunks=(-1, -1))) + ds_dict["t"] = (list(ds.time.dims), da.from_array(ds.time.values, chunks=(10))) + ds_chunks = xr.Dataset(ds_dict) + + # Calculate TOA radiation + toa_radiation 
= calc_toa_radiation(ds_chunks) + + # Assign to the dataset + ds = ds.assign(toa_radiation=toa_radiation) + + return ds + + +def calc_toa_radiation(ds): + """ + Function for calculation top-of-the-atmosphere radiation + + Parameters + ---------- + ds : xr.Dataset + The dataset with variables needed to derive TOA radiation + + Returns + ------- + toa_radiation: xr.DataArray + TOA radiation data-array + """ + # Solar constant + E0 = 1366 # W*m**-2 + + day = ds.t.dt.dayofyear + hr_utc = ds.t.dt.hour + + # Eq. 1.6.1a in Solar Engineering of Thermal Processes 4th ed. + dec = np.pi / 180 * 23.45 * np.sin(2 * np.pi * (284 + day) / 365) + + hr_lst = hr_utc + ds.lon / 15 + hr_angle = 15 * (hr_lst - 12) + + # Eq. 1.6.2 with beta=0 in Solar Engineering of Thermal Processes 4th ed. + cos_sza = np.sin(ds.lat * np.pi / 180) * np.sin(dec) + np.cos( + ds.lat * np.pi / 180 + ) * np.cos(dec) * np.cos(hr_angle * np.pi / 180) + + # Where TOA radiation is negative, set to 0 + toa_radiation = xr.where(E0 * cos_sza < 0, 0, E0 * cos_sza) + + return toa_radiation + + +def get_variables_for_deriving_toa_radiation(): + """ + Get list of variables needed for the TOA radiation calculation + """ + return ["lat", "lon", "time"] From 79a94db939ff089d221a190b836dd1a03160c452 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 6 Nov 2024 08:09:29 +0000 Subject: [PATCH 02/96] Re-structure approach - Update the configuration file so that we list the dependencies and the method used to calculate the derived variable instead of having a flag to say that the variables should be derived. This approach is temporary and might be revised soon. - Add a new class in mllam_data_prep/config.py for derived variables to distinguish them from non-derived variables. - Updates to mllam_data_prep/ops/loading.py to distinguish between derived and non-derived variables. - Move all functions related to forcing derivations to a new and renamed function (mllam_data_prep/ops/forcings.py). 
--- example.danra.yaml | 22 ++++++ mllam_data_prep/config.py | 22 +++++- mllam_data_prep/create_dataset.py | 68 ++----------------- .../ops/{derive_variables.py => forcings.py} | 55 +++++++++++++-- mllam_data_prep/ops/loading.py | 21 +++++- 5 files changed, 113 insertions(+), 75 deletions(-) rename mllam_data_prep/ops/{derive_variables.py => forcings.py} (57%) diff --git a/example.danra.yaml b/example.danra.yaml index 28ae1af..0e6e7be 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -73,6 +73,28 @@ inputs: name_format: f"{var_name}" target_output_variable: forcing + danra_additional_forcings: + path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr + dims: [time, x, y] + variables: + - toa_radiation: + dependencies: + - time + - lat + - lon + method: derive_toa_radiation + dim_mapping: + time: + method: rename + dim: time + grid_index: + method: stack + dims: [x, y] + forcing_feature: + method: stack_variables_by_var_name + name_format: f"{var_name}" + target_output_variable: forcing + danra_lsm: path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/lsm.zarr dims: [x, y] diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 2cd439f..354aaf9 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -64,6 +64,22 @@ class ValueSelection: units: str = None +@dataclass +class DerivedVariable: + """ + Defines a derived variables, where the dependencies (variables used + in the calculation) and the method (function for calculations) are + specified. + + Attributes: + dependencies: The variables to use in the calculation. + method: The methpd with which to derive the variable. 
+ """ + + dependencies: List[str] + method: str = None + + @dataclass class DimMapping: """ @@ -167,11 +183,13 @@ class InputDataset: path: str dims: List[str] - variables: Union[List[str], Dict[str, Dict[str, ValueSelection]]] + variables: Union[ + List[Union[str, Dict[str, DerivedVariable]]], + Dict[str, Dict[str, ValueSelection]], + ] dim_mapping: Dict[str, DimMapping] target_output_variable: str attributes: Dict[str, Any] = None - derive_variables: bool = False @dataclass diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 69249d5..a6813c3 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -10,10 +10,7 @@ from . import __version__ from .config import Config, InvalidConfigException -from .ops.derive_variables import ( - derive_toa_radiation, - get_variables_for_deriving_toa_radiation, -) +from .ops.forcings import derive_forcings from .ops.loading import load_and_subset_dataset from .ops.mapping import map_dims_and_variables from .ops.selection import select_by_kwargs @@ -120,20 +117,9 @@ def create_dataset(config: Config): output_dims = output_config.variables[target_output_var] - # Check if the variables should be derived/calculated - derive_input_variables = input_config.derive_variables or False - - if derive_input_variables: - logger.info( - f"Get variables needed to derive additional/external forcings: {variables}" - ) - variables_to_extract = get_variables_for_forcing_derivation(variables) - else: - variables_to_extract = variables - logger.info(f"Loading dataset {dataset_name} from {path}") try: - ds = load_and_subset_dataset(fp=path, variables=variables_to_extract) + ds = load_and_subset_dataset(fp=path, variables=variables) except Exception as ex: raise Exception(f"Error loading dataset {dataset_name} from {path}") from ex _check_dataset_attributes( @@ -142,8 +128,8 @@ def create_dataset(config: Config): dataset_name=dataset_name, ) - if derive_input_variables: - ds = 
derive_forcings(ds, variables, variables_to_extract) + # Derive forcing variables (if applicable) + ds = derive_forcings(ds, variables) dim_mapping = input_config.dim_mapping @@ -284,49 +270,3 @@ def create_dataset_zarr(fp_config, fp_zarr: str = None): logger.info(f"Wrote training-ready dataset to {fp_zarr}") logger.info(ds) - - -def get_variables_for_forcing_derivation(variables): - """ - Extract the variables needed for deriving the external/additional forcings - """ - if isinstance(variables, dict): - raise Exception("Not implemented yet") - elif isinstance(variables, list): - variables_to_extract = set() - for var in variables: - if var == "toa_radiation": - vars = get_variables_for_deriving_toa_radiation() - else: - raise Exception(f"Function for deriving {var} is not implemented yet!") - - # Add variable names to set (only adds unique variables) - variables_to_extract.update(vars) - - # Turn the set into a list - variables_to_extract = list(variables_to_extract) - - return variables_to_extract - - -def derive_forcings(ds, variables, variables_to_extract): - """ - Derive the specified forcings - """ - if isinstance(variables, dict): - raise Exception("Not implemented yet") - elif isinstance(variables, list): - for var in variables: - if var == "toa_radiation": - ds = derive_toa_radiation(ds) - else: - raise Exception(f"Function for deriving {var} is not implemented yet!") - - # Drop all the unneeded variables that have only been used to derive the - # forcing variables. Need to keep any variables that are also coordinates! 
- variables_to_drop = [ - var for var in variables_to_extract if var not in list(ds._coord_names) - ] - ds = ds.drop_vars(variables_to_drop, errors="ignore") - - return ds diff --git a/mllam_data_prep/ops/derive_variables.py b/mllam_data_prep/ops/forcings.py similarity index 57% rename from mllam_data_prep/ops/derive_variables.py rename to mllam_data_prep/ops/forcings.py index 9ad3be9..207ded5 100644 --- a/mllam_data_prep/ops/derive_variables.py +++ b/mllam_data_prep/ops/forcings.py @@ -1,6 +1,53 @@ import dask.array as da import numpy as np import xarray as xr +from loguru import logger + + +def derive_forcings(ds, variables): + """ + Derive the specified forcings + + Parameters + --------- + ds : xr.Dataset + The loaded and subsetted dataset + variables: list/dict + List or dictionary with variables + + Returns + ------- + ds : xr.Dataset + Dataset with derived variables included + """ + variables_to_derive = { + k: v for elem in variables if isinstance(elem, dict) for (k, v) in elem.items() + } + + if variables_to_derive == {}: + pass + else: + logger.info("Deriving additional forcings") + for _, derived_var in variables_to_derive.items(): + # Get the function defined in the config file + func = globals()[derived_var.method] + # Currently, we're passing the whole dataset + ds = func(ds) + + # Drop all the unneeded variables that have only been used to derive the + # forcing variables. HOWEVER, it's necessary to keep variables that are + # also coordinates! 
+ derived_variable_dependencies = [] + for _, derived_var in variables_to_derive.items(): + derived_variable_dependencies += derived_var.dependencies + variables_to_drop = [ + var + for var in derived_variable_dependencies + if var not in list(ds._coord_names) + ] + ds = ds.drop_vars(variables_to_drop) + + return ds def derive_toa_radiation(ds): @@ -17,6 +64,7 @@ def derive_toa_radiation(ds): ds: xr.Dataset The dataset with TOA radiation added """ + logger.info("Calculating top-of-atmosphere radiation") # Need to construct a new dataset with chunks since # lat and lon are coordinates and are therefore eagerly loaded @@ -70,10 +118,3 @@ def calc_toa_radiation(ds): toa_radiation = xr.where(E0 * cos_sza < 0, 0, E0 * cos_sza) return toa_radiation - - -def get_variables_for_deriving_toa_radiation(): - """ - Get list of variables needed for the TOA radiation calculation - """ - return ["lat", "lon", "time"] diff --git a/mllam_data_prep/ops/loading.py b/mllam_data_prep/ops/loading.py index 955fafd..43b1372 100644 --- a/mllam_data_prep/ops/loading.py +++ b/mllam_data_prep/ops/loading.py @@ -45,13 +45,30 @@ def load_and_subset_dataset(fp, variables): ) ds_subset[var] = da elif isinstance(variables, list): + # Check if the variables in a section are all derived variables or not + if all(isinstance(var, dict) for var in variables): + variables_to_extract = set() + for var in variables: + for _, var_dict in var.items(): + variables_to_extract.update(var_dict.dependencies) + elif all(isinstance(var, str) for var in variables): + variables_to_extract = variables + else: + raise TypeError( + "Expected either a list of strings or a list of dicts " + "but got a list of mixed types. If you are trying to derive " + "variables they should go in its own input section." + ) + + # Subset the dataset try: - ds_subset = ds[variables] + ds_subset = ds[variables_to_extract] except KeyError as ex: raise KeyError( - f"Could not find the all variables `{variables}` in the dataset. 
" + f"Could not find the all variables `{variables_to_extract}` in the dataset. " f"The available variables are {list(ds.data_vars)}" ) from ex else: raise ValueError("The `variables` argument should be a list or a dictionary") + return ds_subset From f37161c827f1c85d13cc4144adca616b3622b61a Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 6 Nov 2024 08:20:10 +0000 Subject: [PATCH 03/96] Add derivation of cyclic encoded hour of day and day of year --- example.danra.yaml | 8 ++++ mllam_data_prep/ops/forcings.py | 78 +++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/example.danra.yaml b/example.danra.yaml index 0e6e7be..4f1c29a 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -83,6 +83,14 @@ inputs: - lat - lon method: derive_toa_radiation + - hour_of_day: + dependencies: + - time + method: derive_hour_of_day + - day_of_year: + dependencies: + - time + method: derive_day_of_year dim_mapping: time: method: rename diff --git a/mllam_data_prep/ops/forcings.py b/mllam_data_prep/ops/forcings.py index 207ded5..b945e7e 100644 --- a/mllam_data_prep/ops/forcings.py +++ b/mllam_data_prep/ops/forcings.py @@ -118,3 +118,81 @@ def calc_toa_radiation(ds): toa_radiation = xr.where(E0 * cos_sza < 0, 0, E0 * cos_sza) return toa_radiation + + +def derive_hour_of_day(ds): + """ + Derive hour of day features with a cyclic encoding + + Parameters + ---------- + ds : xr.Dataset + The dataset with variables needed to derive hour of day + + Returns + ------- + ds: xr.Dataset + The dataset with hour of day added + """ + logger.info("Calculating hour of day") + + # Get the hour of the day + hour_of_day = ds.time.dt.hour + + # Cyclic encoding of hour of day + hour_of_day_cos, hour_of_day_sin = cyclic_encoding(hour_of_day, 24) + + # Assign to the dataset + ds = ds.assign(hour_of_day_sin=hour_of_day_sin) + ds = ds.assign(hour_of_day_cos=hour_of_day_cos) + + return ds + + +def derive_day_of_year(ds): + """ + Derive day of year features with a cyclic 
encoding + + Parameters + ---------- + ds : xr.Dataset + The dataset with variables needed to derive day of year + + Returns + ------- + ds: xr.Dataset + The dataset with day of year added + """ + logger.info("Calculating day of year") + + # Get the day of year + day_of_year = ds.time.dt.dayofyear + + # Cyclic encoding of day of year - use 366 to include leap years! + day_of_year_cos, day_of_year_sin = cyclic_encoding(day_of_year, 366) + + # Assign to the dataset + ds = ds.assign(day_of_year_sin=day_of_year_sin) + ds = ds.assign(day_of_year_cos=day_of_year_cos) + + return ds + + +def cyclic_encoding(da, da_max): + """Cyclic encoding of data + + Parameters + ---------- + data : xr.DataArray + xarray data-array of the variable which should be cyclically encoded + data_max: int/float + maximum value of the data variable + + Returns + ------- + """ + + da_sin = np.sin((da / da_max) * 2 * np.pi) + da_cos = np.cos((da / da_max) * 2 * np.pi) + + return da_cos, da_sin From 71afd3af125dc2c913471250a7b618c2ad20af58 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 6 Nov 2024 11:55:01 +0000 Subject: [PATCH 04/96] Add derivation of cyclic encoded time of year --- mllam_data_prep/ops/forcings.py | 39 +++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/mllam_data_prep/ops/forcings.py b/mllam_data_prep/ops/forcings.py index b945e7e..470109f 100644 --- a/mllam_data_prep/ops/forcings.py +++ b/mllam_data_prep/ops/forcings.py @@ -178,6 +178,45 @@ def derive_day_of_year(ds): return ds +def derive_time_of_year(ds): + logger.info("Calculating time of year") + + # Get the number of seconds a datetime corresponds to + number_of_seconds = ( + (ds.time.dt.dayofyear - 1) * 60 * 60 * 24 + + ds.time.dt.hour * 60 * 60 + + ds.time.dt.minute * 60 + + ds.time.dt.second + ) + + # Cyclic encoding using both leap year and non-leap year + # number of seconds to be able to choose later where to + # include which values using xr.where() + time_of_year_cos_non_leap, 
time_of_year_sin_non_leap = cyclic_encoding( + number_of_seconds, 31622400 + ) + time_of_year_cos_leap, time_of_year_sin_leap = cyclic_encoding( + number_of_seconds, 31536000 + ) + + time_of_year_cos = xr.where( + ds.time.dt.is_leap_year, + time_of_year_cos_leap, + time_of_year_cos_non_leap, + ) + time_of_year_sin = xr.where( + ds.time.dt.is_leap_year, + time_of_year_sin_leap, + time_of_year_sin_non_leap, + ) + + # Assign to the dataset + ds = ds.assign(time_of_year_sin=time_of_year_sin) + ds = ds.assign(time_of_year_cos=time_of_year_cos) + + return ds + + def cyclic_encoding(da, da_max): """Cyclic encoding of data From abb626b92cee9a5b7ee7d85e7ae30ceff1d4d79b Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 6 Nov 2024 12:02:34 +0000 Subject: [PATCH 05/96] Update and add docstrings --- mllam_data_prep/ops/forcings.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/mllam_data_prep/ops/forcings.py b/mllam_data_prep/ops/forcings.py index 470109f..12e6396 100644 --- a/mllam_data_prep/ops/forcings.py +++ b/mllam_data_prep/ops/forcings.py @@ -179,6 +179,20 @@ def derive_day_of_year(ds): def derive_time_of_year(ds): + """ + Derive time of year features with a cyclic encoding + + Parameters + ---------- + ds : xr.Dataset + The dataset with variables needed to derive time of year + + Returns + ------- + ds: xr.Dataset + The dataset with time of year added + """ + logger.info("Calculating time of year") # Get the number of seconds a datetime corresponds to @@ -218,17 +232,22 @@ def derive_time_of_year(ds): def cyclic_encoding(da, da_max): - """Cyclic encoding of data + """ + Cyclic encoding of data Parameters ---------- - data : xr.DataArray - xarray data-array of the variable which should be cyclically encoded - data_max: int/float - maximum value of the data variable + da : xr.DataArray + xarray data-array that should be cyclically encoded + da_max: int/float + Maximum possible value of input data-array Returns ------- + 
da_cos: xr.DataArray + Cosine part of cyclically encoded input data-array + da_sin: xr.DataArray + Sine part of cyclically encoded input data-array """ da_sin = np.sin((da / da_max) * 2 * np.pi) From 8b1f18e19566a02499d83c8f8b51ea2163295621 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 12 Nov 2024 09:30:57 +0000 Subject: [PATCH 06/96] Remove time_of_year --- mllam_data_prep/ops/forcings.py | 53 --------------------------------- 1 file changed, 53 deletions(-) diff --git a/mllam_data_prep/ops/forcings.py b/mllam_data_prep/ops/forcings.py index 12e6396..5d5aa69 100644 --- a/mllam_data_prep/ops/forcings.py +++ b/mllam_data_prep/ops/forcings.py @@ -178,59 +178,6 @@ def derive_day_of_year(ds): return ds -def derive_time_of_year(ds): - """ - Derive time of year features with a cyclic encoding - - Parameters - ---------- - ds : xr.Dataset - The dataset with variables needed to derive time of year - - Returns - ------- - ds: xr.Dataset - The dataset with time of year added - """ - - logger.info("Calculating time of year") - - # Get the number of seconds a datetime corresponds to - number_of_seconds = ( - (ds.time.dt.dayofyear - 1) * 60 * 60 * 24 - + ds.time.dt.hour * 60 * 60 - + ds.time.dt.minute * 60 - + ds.time.dt.second - ) - - # Cyclic encoding using both leap year and non-leap year - # number of seconds to be able to choose later where to - # include which values using xr.where() - time_of_year_cos_non_leap, time_of_year_sin_non_leap = cyclic_encoding( - number_of_seconds, 31622400 - ) - time_of_year_cos_leap, time_of_year_sin_leap = cyclic_encoding( - number_of_seconds, 31536000 - ) - - time_of_year_cos = xr.where( - ds.time.dt.is_leap_year, - time_of_year_cos_leap, - time_of_year_cos_non_leap, - ) - time_of_year_sin = xr.where( - ds.time.dt.is_leap_year, - time_of_year_sin_leap, - time_of_year_sin_non_leap, - ) - - # Assign to the dataset - ds = ds.assign(time_of_year_sin=time_of_year_sin) - ds = ds.assign(time_of_year_cos=time_of_year_cos) - - return ds - - 
def cyclic_encoding(da, da_max): """ Cyclic encoding of data From 78540133a7a825ee71a8f9a8a86885e37398993e Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 12 Nov 2024 13:30:29 +0000 Subject: [PATCH 07/96] Provide the full namespace of the function --- example.danra.yaml | 2 +- mllam_data_prep/ops/forcings.py | 53 +++++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/example.danra.yaml b/example.danra.yaml index 4f1c29a..65af8bf 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -82,7 +82,7 @@ inputs: - time - lat - lon - method: derive_toa_radiation + method: mllam_data_prep.ops.forcings.derive_toa_radiation - hour_of_day: dependencies: - time diff --git a/mllam_data_prep/ops/forcings.py b/mllam_data_prep/ops/forcings.py index 5d5aa69..352d04f 100644 --- a/mllam_data_prep/ops/forcings.py +++ b/mllam_data_prep/ops/forcings.py @@ -1,3 +1,6 @@ +import importlib +import sys + import dask.array as da import numpy as np import xarray as xr @@ -29,8 +32,8 @@ def derive_forcings(ds, variables): else: logger.info("Deriving additional forcings") for _, derived_var in variables_to_derive.items(): - # Get the function defined in the config file - func = globals()[derived_var.method] + # Get the function + func = get_derived_variable_function(derived_var.method) # Currently, we're passing the whole dataset ds = func(ds) @@ -50,6 +53,52 @@ def derive_forcings(ds, variables): return ds +def get_derived_variable_function(function_namespace): + """ + Function for returning the function to be used to derive + the specified variable. + + 1. Check if the function to use is in globals() + 2. If it is in globals then call it + 3. If it isn't in globals() then import the necessary module + before calling it + """ + # Get the name of the calling module + calling_module = globals()["__name__"] + + if "." 
in function_namespace: + # If the function name is a full namespace, get module and function names + module_name, function_name = function_namespace.rsplit(".", 1) + + # Check if the module_name is pointing to here (the calling module), + # and if it does then use globals() to get the function otherwise + # import the correct module and get the correct function + if module_name == calling_module: + function = globals().get(function_name) + else: + # Check if the module is already imported + if module_name in sys.modules: + module = module_name + else: + module = importlib.import_module(module_name) + + # Get the function from the module + function = getattr(module, function_name) + else: + # If function name only get it from the calling module (here) + function = globals().get(function_namespace) + if not function: + raise TypeError( + f"Function '{function_namespace}' was not found in '{calling_module}'." + f" Check that you have specified the correct function name" + " and/or that you have defined the full function namespace if you" + " want to use a function defined outside of of the current module" + f" '{calling_module}'." 
+ ) + + return function + + def derive_toa_radiation(ds): """ Derive approximate TOA radiation (instantaneous values [W*m**-2]) From 7fa90bf04963b35461a8dfcd130a6d86f37cce28 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 12 Nov 2024 13:44:14 +0000 Subject: [PATCH 08/96] Rename the module with derived variables --- example.danra.yaml | 2 +- mllam_data_prep/create_dataset.py | 2 +- mllam_data_prep/{ops/forcings.py => derived_variables.py} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename mllam_data_prep/{ops/forcings.py => derived_variables.py} (100%) diff --git a/example.danra.yaml b/example.danra.yaml index 65af8bf..50a67a7 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -82,7 +82,7 @@ inputs: - time - lat - lon - method: mllam_data_prep.ops.forcings.derive_toa_radiation + method: mllam_data_prep.derived_variables.derive_toa_radiation - hour_of_day: dependencies: - time diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index a6813c3..df814d1 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -10,7 +10,7 @@ from . 
import __version__ from .config import Config, InvalidConfigException -from .ops.forcings import derive_forcings +from .derived_variables import derive_forcings from .ops.loading import load_and_subset_dataset from .ops.mapping import map_dims_and_variables from .ops.selection import select_by_kwargs diff --git a/mllam_data_prep/ops/forcings.py b/mllam_data_prep/derived_variables.py similarity index 100% rename from mllam_data_prep/ops/forcings.py rename to mllam_data_prep/derived_variables.py From 48c9e3e73fc0d3066473c8823d1bc6bb42a754af Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 12 Nov 2024 13:45:50 +0000 Subject: [PATCH 09/96] Rename the function used for deriving variables --- mllam_data_prep/create_dataset.py | 6 +++--- mllam_data_prep/derived_variables.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index df814d1..df7939a 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -10,7 +10,7 @@ from . 
import __version__ from .config import Config, InvalidConfigException -from .derived_variables import derive_forcings +from .derived_variables import derive_variables from .ops.loading import load_and_subset_dataset from .ops.mapping import map_dims_and_variables from .ops.selection import select_by_kwargs @@ -128,8 +128,8 @@ def create_dataset(config: Config): dataset_name=dataset_name, ) - # Derive forcing variables (if applicable) - ds = derive_forcings(ds, variables) + # Derive variables (if applicable) + ds = derive_variables(ds, variables) dim_mapping = input_config.dim_mapping diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 352d04f..2f0a861 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -7,9 +7,9 @@ from loguru import logger -def derive_forcings(ds, variables): +def derive_variables(ds, variables): """ - Derive the specified forcings + Derive the specified variables Parameters --------- @@ -30,7 +30,7 @@ def derive_forcings(ds, variables): if variables_to_derive == {}: pass else: - logger.info("Deriving additional forcings") + logger.info("Deriving additional variables") for _, derived_var in variables_to_derive.items(): # Get the function func = get_derived_variable_function(derived_var.method) @@ -38,7 +38,7 @@ def derive_forcings(ds, variables): ds = func(ds) # Drop all the unneeded variables that have only been used to derive the - # forcing variables. HOWEVER, it's necessary to keep variables that are + # additional variables. HOWEVER, it's necessary to keep variables that are # also coordinates! 
derived_variable_dependencies = [] for _, derived_var in variables_to_derive.items(): From 8de9404911c8c21c59774251b4a93495d3799ad3 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 15 Nov 2024 07:55:34 +0000 Subject: [PATCH 10/96] Redefine the config file for derived variables and how they are calculated --- example.danra.yaml | 22 +++---- mllam_data_prep/config.py | 20 +++---- mllam_data_prep/create_dataset.py | 30 ++++++---- mllam_data_prep/derived_variables.py | 90 +++++++++++++++------------- 4 files changed, 82 insertions(+), 80 deletions(-) diff --git a/example.danra.yaml b/example.danra.yaml index 50a67a7..4152896 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -76,21 +76,13 @@ inputs: danra_additional_forcings: path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr dims: [time, x, y] - variables: - - toa_radiation: - dependencies: - - time - - lat - - lon - method: mllam_data_prep.derived_variables.derive_toa_radiation - - hour_of_day: - dependencies: - - time - method: derive_hour_of_day - - day_of_year: - dependencies: - - time - method: derive_day_of_year + derived_variables: + toa_radiation: + kwargs: + time: time + lat: lat + lon: lon + function: mllam_data_prep.derived_variables.derive_toa_radiation dim_mapping: time: method: rename diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 354aaf9..be72de9 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -67,17 +67,17 @@ class ValueSelection: @dataclass class DerivedVariable: """ - Defines a derived variables, where the dependencies (variables used - in the calculation) and the method (function for calculations) are - specified. + Defines a derived variables, where the kwargs (variables required + for the calculation) and the function (for calculating the variable) + are specified. Attributes: - dependencies: The variables to use in the calculation. - method: The methpd with which to derive the variable. 
+ kwargs: Variables required for calculating the derived variable. + function: Function used to calculate the derived variable. """ - dependencies: List[str] - method: str = None + kwargs: Dict[str, str] + function: str @dataclass @@ -183,12 +183,10 @@ class InputDataset: path: str dims: List[str] - variables: Union[ - List[Union[str, Dict[str, DerivedVariable]]], - Dict[str, Dict[str, ValueSelection]], - ] dim_mapping: Dict[str, DimMapping] target_output_variable: str + variables: Union[List[str], Dict[str, Dict[str, ValueSelection]]] = None + derived_variables: Dict[str, DerivedVariable] = None attributes: Dict[str, Any] = None diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index df7939a..2b37b91 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -111,25 +111,33 @@ def create_dataset(config: Config): for dataset_name, input_config in config.inputs.items(): path = input_config.path variables = input_config.variables + derived_variables = input_config.derived_variables target_output_var = input_config.target_output_variable expected_input_attributes = input_config.attributes or {} expected_input_var_dims = input_config.dims output_dims = output_config.variables[target_output_var] - logger.info(f"Loading dataset {dataset_name} from {path}") - try: - ds = load_and_subset_dataset(fp=path, variables=variables) - except Exception as ex: - raise Exception(f"Error loading dataset {dataset_name} from {path}") from ex - _check_dataset_attributes( - ds=ds, - expected_attributes=expected_input_attributes, - dataset_name=dataset_name, - ) + if variables: + logger.info(f"Loading dataset {dataset_name} from {path} and subsetting") + try: + ds = load_and_subset_dataset(fp=path, variables=variables) + except Exception as ex: + raise Exception( + f"Error loading dataset {dataset_name} from {path}" + ) from ex + _check_dataset_attributes( + ds=ds, + expected_attributes=expected_input_attributes, + 
dataset_name=dataset_name, + ) # Derive variables (if applicable) - ds = derive_variables(ds, variables) + if derived_variables: + logger.info( + f"Loading dataset {dataset_name} from {path} and deriving variables" + ) + ds = derive_variables(fp=path, derived_variables=derived_variables) dim_mapping = input_config.dim_mapping diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 2f0a861..7453470 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -7,48 +7,47 @@ from loguru import logger -def derive_variables(ds, variables): +def derive_variables(fp, derived_variables): """ - Derive the specified variables + Load the dataset, and derive the specified variables Parameters --------- - ds : xr.Dataset - The loaded and subsetted dataset - variables: list/dict - List or dictionary with variables + fp : str + Filepath to the source dataset, for example the path to a zarr dataset + or a netCDF file (anything that is supported by `xarray.open_dataset` will work) + derived_variables: dict + Dictionary with the variables to derive with keys as the variable names and + values with entries for kwargs and function to be used to derive them Returns ------- ds : xr.Dataset Dataset with derived variables included """ - variables_to_derive = { - k: v for elem in variables if isinstance(elem, dict) for (k, v) in elem.items() - } - - if variables_to_derive == {}: - pass - else: - logger.info("Deriving additional variables") - for _, derived_var in variables_to_derive.items(): - # Get the function - func = get_derived_variable_function(derived_var.method) - # Currently, we're passing the whole dataset - ds = func(ds) - - # Drop all the unneeded variables that have only been used to derive the - # additional variables. HOWEVER, it's necessary to keep variables that are - # also coordinates! 
- derived_variable_dependencies = [] - for _, derived_var in variables_to_derive.items(): - derived_variable_dependencies += derived_var.dependencies - variables_to_drop = [ - var - for var in derived_variable_dependencies - if var not in list(ds._coord_names) - ] - ds = ds.drop_vars(variables_to_drop) + logger.info("Deriving variables") + + try: + ds = xr.open_zarr(fp) + except ValueError: + ds = xr.open_dataset(fp) + + ds_subset = xr.Dataset() + ds_subset.attrs.update(ds.attrs) + # Iterate derived variables + for _, derived_variable in derived_variables.items(): + required_variables = derived_variable.kwargs + function_name = derived_variable.function + # Create the input dataset containing the required variables to derive + # the specified variable + ds_input = ds[required_variables.keys()] + kwargs = {v: ds_input[v] for v in required_variables.values()} + # Get the function to be used to derive the variable + func = get_derived_variable_function(function_name) + # Calculate the derived variable + derived_field = func(**kwargs) + # Add the derived variable(s) to the subsetted dataset + ds_subset[derived_field.name] = derived_field return ds @@ -99,37 +98,42 @@ def get_derived_variable_function(function_namespace): return function -def derive_toa_radiation(ds): +def derive_toa_radiation(lat, lon, time): """ Derive approximate TOA radiation (instantaneous values [W*m**-2]) Parameters ---------- - ds : xr.Dataset - The dataset with variables needed to derive TOA radiation + lat : xr.DataArray + Latitude values + lon : xr.DataArray + Longitude values + time : xr.DataArray + Time Returns ------- - ds: xr.Dataset - The dataset with TOA radiation added + toa_radiation: xr.DataArray + TOA radiation data-array """ logger.info("Calculating top-of-atmosphere radiation") # Need to construct a new dataset with chunks since # lat and lon are coordinates and are therefore eagerly loaded ds_dict = {} - ds_dict["lat"] = (list(ds.lat.dims), da.from_array(ds.lat.values, 
chunks=(-1, -1))) - ds_dict["lon"] = (list(ds.lon.dims), da.from_array(ds.lon.values, chunks=(-1, -1))) - ds_dict["t"] = (list(ds.time.dims), da.from_array(ds.time.values, chunks=(10))) + ds_dict["lat"] = (list(lat.dims), da.from_array(lat.values, chunks=(-1, -1))) + ds_dict["lon"] = (list(lon.dims), da.from_array(lon.values, chunks=(-1, -1))) + ds_dict["t"] = (list(time.dims), da.from_array(time.values, chunks=(10))) ds_chunks = xr.Dataset(ds_dict) # Calculate TOA radiation toa_radiation = calc_toa_radiation(ds_chunks) - # Assign to the dataset - ds = ds.assign(toa_radiation=toa_radiation) + if isinstance(toa_radiation, xr.DataArray): + # Add attributes + toa_radiation.name = "toa_radiation" - return ds + return toa_radiation def calc_toa_radiation(ds): From ffc030cdf0ec3abeda73a599680de4c87b36a077 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 15 Nov 2024 08:10:10 +0000 Subject: [PATCH 11/96] Remove derived variables from 'load_and_subset_dataset' --- mllam_data_prep/ops/loading.py | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/mllam_data_prep/ops/loading.py b/mllam_data_prep/ops/loading.py index 43b1372..e97360a 100644 --- a/mllam_data_prep/ops/loading.py +++ b/mllam_data_prep/ops/loading.py @@ -45,27 +45,11 @@ def load_and_subset_dataset(fp, variables): ) ds_subset[var] = da elif isinstance(variables, list): - # Check if the variables in a section are all derived variables or not - if all(isinstance(var, dict) for var in variables): - variables_to_extract = set() - for var in variables: - for _, var_dict in var.items(): - variables_to_extract.update(var_dict.dependencies) - elif all(isinstance(var, str) for var in variables): - variables_to_extract = variables - else: - raise TypeError( - "Expected either a list of strings or a list of dicts " - "but got a list of mixed types. If you are trying to derive " - "variables they should go in its own input section." 
- ) - - # Subset the dataset try: - ds_subset = ds[variables_to_extract] + ds_subset = ds[variables] except KeyError as ex: raise KeyError( - f"Could not find the all variables `{variables_to_extract}` in the dataset. " + f"Could not find the all variables `{variables}` in the dataset. " f"The available variables are {list(ds.data_vars)}" ) from ex else: From 692cdd33d467a0a364daa03b933a18e2cd9f9540 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 15 Nov 2024 08:12:41 +0000 Subject: [PATCH 12/96] Add try/except for derived variables when loading the dataset --- mllam_data_prep/create_dataset.py | 17 +++++++++++++---- mllam_data_prep/derived_variables.py | 7 ++++--- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 2b37b91..c3e3faf 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -110,8 +110,8 @@ def create_dataset(config: Config): for dataset_name, input_config in config.inputs.items(): path = input_config.path - variables = input_config.variables - derived_variables = input_config.derived_variables + variables = input_config.variables or None + derived_variables = input_config.derived_variables or None target_output_var = input_config.target_output_variable expected_input_attributes = input_config.attributes or {} expected_input_var_dims = input_config.dims @@ -132,12 +132,21 @@ def create_dataset(config: Config): dataset_name=dataset_name, ) - # Derive variables (if applicable) if derived_variables: logger.info( f"Loading dataset {dataset_name} from {path} and deriving variables" ) - ds = derive_variables(fp=path, derived_variables=derived_variables) + try: + ds = derive_variables(fp=path, derived_variables=derived_variables) + except Exception as ex: + raise Exception( + f"Error loading dataset {dataset_name} from {path}" + ) from ex + _check_dataset_attributes( + ds=ds, + expected_attributes=expected_input_attributes, + 
dataset_name=dataset_name, + ) dim_mapping = input_config.dim_mapping diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 7453470..27075ce 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -16,9 +16,10 @@ def derive_variables(fp, derived_variables): fp : str Filepath to the source dataset, for example the path to a zarr dataset or a netCDF file (anything that is supported by `xarray.open_dataset` will work) - derived_variables: dict - Dictionary with the variables to derive with keys as the variable names and - values with entries for kwargs and function to be used to derive them + derived_variables : dict + Dictionary with the variables to derive + with keys as the variable names and values with entries for + kwargs and function to use in the calculation Returns ------- From c0cd87541d475635034a8a0ab45018949df017dc Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 5 Dec 2024 08:54:20 +0000 Subject: [PATCH 13/96] Chunk the input data with the defined output chunks --- mllam_data_prep/create_dataset.py | 12 +++- mllam_data_prep/derived_variables.py | 98 ++++++++++++---------------- mllam_data_prep/ops/loading.py | 8 ++- 3 files changed, 58 insertions(+), 60 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index c3e3faf..1a2f389 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -105,6 +105,7 @@ def create_dataset(config: Config): """ output_config = config.output output_coord_ranges = output_config.coord_ranges + chunking_config = config.output.chunking or {} dataarrays_by_target = defaultdict(list) @@ -121,7 +122,9 @@ def create_dataset(config: Config): if variables: logger.info(f"Loading dataset {dataset_name} from {path} and subsetting") try: - ds = load_and_subset_dataset(fp=path, variables=variables) + ds = load_and_subset_dataset( + fp=path, variables=variables, 
chunking=chunking_config + ) except Exception as ex: raise Exception( f"Error loading dataset {dataset_name} from {path}" @@ -137,7 +140,11 @@ def create_dataset(config: Config): f"Loading dataset {dataset_name} from {path} and deriving variables" ) try: - ds = derive_variables(fp=path, derived_variables=derived_variables) + ds = derive_variables( + fp=path, + derived_variables=derived_variables, + chunking=chunking_config, + ) except Exception as ex: raise Exception( f"Error loading dataset {dataset_name} from {path}" @@ -196,7 +203,6 @@ def create_dataset(config: Config): # default to making a single chunk for each dimension if chunksize is not specified # in the config - chunking_config = config.output.chunking or {} logger.info(f"Chunking dataset with {chunking_config}") chunks = {d: chunking_config.get(d, int(ds[d].count())) for d in ds.dims} ds = ds.chunk(chunks) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 27075ce..be05c72 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -1,13 +1,12 @@ import importlib import sys -import dask.array as da import numpy as np import xarray as xr from loguru import logger -def derive_variables(fp, derived_variables): +def derive_variables(fp, derived_variables, chunking): """ Load the dataset, and derive the specified variables @@ -20,6 +19,9 @@ def derive_variables(fp, derived_variables): Dictionary with the variables to derive with keys as the variable names and values with entries for kwargs and function to use in the calculation + chunking: dict + Dictionary with keys as the dimensions to chunk along and values + with the chunk size Returns ------- @@ -35,19 +37,31 @@ def derive_variables(fp, derived_variables): ds_subset = xr.Dataset() ds_subset.attrs.update(ds.attrs) - # Iterate derived variables for _, derived_variable in derived_variables.items(): required_variables = derived_variable.kwargs function_name = 
derived_variable.function - # Create the input dataset containing the required variables to derive - # the specified variable ds_input = ds[required_variables.keys()] + + # Any coordinates needed for the derivation, for which chunking should be performed, + # should be converted to variables since it is not possible for coordinates to be + # chunked dask arrays + chunks = {d: chunking.get(d, int(ds_input[d].count())) for d in ds_input.dims} + required_coordinates = [ + req_var for req_var in required_variables if req_var in ds_input.coords + ] + ds_input = ds_input.drop_indexes(required_coordinates, errors="ignore") + for req_var in required_variables.keys(): + if req_var in ds_input.coords and req_var in chunks: + ds_input = ds_input.reset_coords(req_var) + + # Chunk the data variables + ds_input = ds_input.chunk(chunks) + + # Calculate the derived variable kwargs = {v: ds_input[v] for v in required_variables.values()} - # Get the function to be used to derive the variable func = get_derived_variable_function(function_name) - # Calculate the derived variable derived_field = func(**kwargs) - # Add the derived variable(s) to the subsetted dataset + ds_subset[derived_field.name] = derived_field return ds @@ -99,78 +113,50 @@ def get_derived_variable_function(function_namespace): return function -def derive_toa_radiation(lat, lon, time): +def calculate_toa_radiation(lat, lon, time): """ - Derive approximate TOA radiation (instantaneous values [W*m**-2]) + Function for calculating top-of-the-atmosphere radiation Parameters ---------- - lat : xr.DataArray + lat : xr.DataArray or float Latitude values - lon : xr.DataArray + lon : xr.DataArray or float Longitude values - time : xr.DataArray + time : xr.DataArray or datetime object Time Returns ------- - toa_radiation: xr.DataArray - TOA radiation data-array + toa_radiation: xr.DataArray or float + TOA radiation data """ logger.info("Calculating top-of-atmosphere radiation") - # Need to construct a new dataset with chunks 
since - # lat and lon are coordinates and are therefore eagerly loaded - ds_dict = {} - ds_dict["lat"] = (list(lat.dims), da.from_array(lat.values, chunks=(-1, -1))) - ds_dict["lon"] = (list(lon.dims), da.from_array(lon.values, chunks=(-1, -1))) - ds_dict["t"] = (list(time.dims), da.from_array(time.values, chunks=(10))) - ds_chunks = xr.Dataset(ds_dict) - - # Calculate TOA radiation - toa_radiation = calc_toa_radiation(ds_chunks) - - if isinstance(toa_radiation, xr.DataArray): - # Add attributes - toa_radiation.name = "toa_radiation" - - return toa_radiation - - -def calc_toa_radiation(ds): - """ - Function for calculation top-of-the-atmosphere radiation - - Parameters - ---------- - ds : xr.Dataset - The dataset with variables needed to derive TOA radiation - - Returns - ------- - toa_radiation: xr.DataArray - TOA radiation data-array - """ # Solar constant E0 = 1366 # W*m**-2 - day = ds.t.dt.dayofyear - hr_utc = ds.t.dt.hour + day = time.dt.dayofyear + hr_utc = time.dt.hour # Eq. 1.6.1a in Solar Engineering of Thermal Processes 4th ed. dec = np.pi / 180 * 23.45 * np.sin(2 * np.pi * (284 + day) / 365) - hr_lst = hr_utc + ds.lon / 15 + hr_lst = hr_utc + lon / 15 hr_angle = 15 * (hr_lst - 12) # Eq. 1.6.2 with beta=0 in Solar Engineering of Thermal Processes 4th ed. 
- cos_sza = np.sin(ds.lat * np.pi / 180) * np.sin(dec) + np.cos( - ds.lat * np.pi / 180 + cos_sza = np.sin(lat * np.pi / 180) * np.sin(dec) + np.cos( + lat * np.pi / 180 ) * np.cos(dec) * np.cos(hr_angle * np.pi / 180) # Where TOA radiation is negative, set to 0 toa_radiation = xr.where(E0 * cos_sza < 0, 0, E0 * cos_sza) + if isinstance(toa_radiation, xr.DataArray): + # Add attributes + toa_radiation.name = "toa_radiation" + return toa_radiation @@ -232,7 +218,7 @@ def derive_day_of_year(ds): return ds -def cyclic_encoding(da, da_max): +def cyclic_encoding(data_array, da_max): """ Cyclic encoding of data @@ -251,7 +237,7 @@ def cyclic_encoding(da, da_max): Sine part of cyclically encoded input data-array """ - da_sin = np.sin((da / da_max) * 2 * np.pi) - da_cos = np.cos((da / da_max) * 2 * np.pi) + data_array_sin = np.sin((data_array / da_max) * 2 * np.pi) + data_array_cos = np.cos((data_array / da_max) * 2 * np.pi) - return da_cos, da_sin + return data_array_cos, data_array_sin diff --git a/mllam_data_prep/ops/loading.py b/mllam_data_prep/ops/loading.py index e97360a..fc5d5bc 100644 --- a/mllam_data_prep/ops/loading.py +++ b/mllam_data_prep/ops/loading.py @@ -1,7 +1,7 @@ import xarray as xr -def load_and_subset_dataset(fp, variables): +def load_and_subset_dataset(fp, variables, chunking): """ Load the dataset, subset the variables along the specified coordinates and check coordinate units @@ -15,6 +15,9 @@ def load_and_subset_dataset(fp, variables): Dictionary with the variables to subset with keys as the variable names and values with entries for each coordinate and coordinate values to extract + chunking: dict + Dictionary with keys as the dimensions to chunk along and values + with the chunk size """ try: @@ -55,4 +58,7 @@ def load_and_subset_dataset(fp, variables): else: raise ValueError("The `variables` argument should be a list or a dictionary") + chunks = {d: chunking.get(d, int(ds_subset[d].count())) for d in ds_subset.dims} + ds_subset = 
ds_subset.chunk(chunks) + return ds_subset From 55224f34f91654b44680c277eb1f06e5df5a1a92 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 5 Dec 2024 14:17:38 +0000 Subject: [PATCH 14/96] Update toa_radiation function name --- example.danra.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example.danra.yaml b/example.danra.yaml index 4152896..b351a52 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -82,7 +82,7 @@ inputs: time: time lat: lat lon: lon - function: mllam_data_prep.derived_variables.derive_toa_radiation + function: mllam_data_prep.derived_variables.calculate_toa_radiation dim_mapping: time: method: rename From 678ea523c39794f93831a6fe362fc6fba4b1c23d Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 5 Dec 2024 14:20:02 +0000 Subject: [PATCH 15/96] Correct kwargs usage, add back dropped coordinates and return correct dataset --- mllam_data_prep/derived_variables.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index be05c72..114638e 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -47,24 +47,28 @@ def derive_variables(fp, derived_variables, chunking): # chunked dask arrays chunks = {d: chunking.get(d, int(ds_input[d].count())) for d in ds_input.dims} required_coordinates = [ - req_var for req_var in required_variables if req_var in ds_input.coords + req_var + for req_var in required_variables.keys() + if req_var in ds_input.coords ] ds_input = ds_input.drop_indexes(required_coordinates, errors="ignore") - for req_var in required_variables.keys(): - if req_var in ds_input.coords and req_var in chunks: - ds_input = ds_input.reset_coords(req_var) + for req_coord in required_coordinates: + if req_coord in chunks: + ds_input = ds_input.reset_coords(req_coord) # Chunk the data variables ds_input = ds_input.chunk(chunks) # Calculate the derived variable 
- kwargs = {v: ds_input[v] for v in required_variables.values()} + kwargs = {v: ds_input[k] for k, v in required_variables.items()} func = get_derived_variable_function(function_name) derived_field = func(**kwargs) - + derived_field = _return_dropped_coordinates( + derived_field, ds_input, required_coordinates, chunks + ) ds_subset[derived_field.name] = derived_field - return ds + return ds_subset def get_derived_variable_function(function_namespace): @@ -113,6 +117,15 @@ def get_derived_variable_function(function_namespace): return function +def _return_dropped_coordinates(derived_field, ds_input, required_coordinates, chunks): + """Return coordinates that have been reset.""" + for req_coord in required_coordinates: + if req_coord in chunks: + derived_field.coords[req_coord] = ds_input[req_coord] + + return derived_field + + def calculate_toa_radiation(lat, lon, time): """ Function for calculating top-of-the-atmosphere radiation From 9d2db079309b258dc41c66b8b81ef83dcda04f5e Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 5 Dec 2024 14:22:54 +0000 Subject: [PATCH 16/96] Prepare for hour_of_day and day_of_year --- mllam_data_prep/derived_variables.py | 74 ++++++++++++++++++---------- 1 file changed, 48 insertions(+), 26 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 114638e..760e0b3 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -63,10 +63,20 @@ def derive_variables(fp, derived_variables, chunking): kwargs = {v: ds_input[k] for k, v in required_variables.items()} func = get_derived_variable_function(function_name) derived_field = func(**kwargs) - derived_field = _return_dropped_coordinates( - derived_field, ds_input, required_coordinates, chunks - ) - ds_subset[derived_field.name] = derived_field + + # Some of the derived variables include two components, since + # they are cyclically encoded (cos and sin parts) + if isinstance(derived_field, 
xr.DataArray): + derived_field = _return_dropped_coordinates( + derived_field, ds_input, required_coordinates, chunks + ) + ds_subset[derived_field.name] = derived_field + elif isinstance(derived_field, tuple): + for field in derived_field: + field = _return_dropped_coordinates( + field, ds_input, required_coordinates, chunks + ) + ds_subset[field.name] = field return ds_subset @@ -173,62 +183,74 @@ def calculate_toa_radiation(lat, lon, time): return toa_radiation -def derive_hour_of_day(ds): +def calculate_hour_of_day(time): """ - Derive hour of day features with a cyclic encoding + Function for calculating hour of day features with a cyclic encoding Parameters ---------- - ds : xr.Dataset - The dataset with variables needed to derive hour of day + time : xr.DataArray or datetime object + Time Returns ------- - ds: xr.Dataset - The dataset with hour of day added + hour_of_day_cos: xr.DataArray or float + cosine of the hour of day + hour_of_day_sin: xr.DataArray or float + sine of the hour of day """ logger.info("Calculating hour of day") # Get the hour of the day - hour_of_day = ds.time.dt.hour + hour_of_day = time.dt.hour # Cyclic encoding of hour of day hour_of_day_cos, hour_of_day_sin = cyclic_encoding(hour_of_day, 24) - # Assign to the dataset - ds = ds.assign(hour_of_day_sin=hour_of_day_sin) - ds = ds.assign(hour_of_day_cos=hour_of_day_cos) + if isinstance(hour_of_day_cos, xr.DataArray): + # Add attributes + hour_of_day_cos.name = "hour_of_day_cos" + + if isinstance(hour_of_day_sin, xr.DataArray): + # Add attributes + hour_of_day_sin.name = "hour_of_day_sin" - return ds + return hour_of_day_cos, hour_of_day_sin -def derive_day_of_year(ds): +def calculate_day_of_year(time): """ - Derive day of year features with a cyclic encoding + Function for calculating day of year features with a cyclic encoding Parameters ---------- - ds : xr.Dataset - The dataset with variables needed to derive day of year + time : xr.DataArray or datetime object + Time Returns ------- - 
ds: xr.Dataset - The dataset with day of year added + day_of_year_cos: xr.DataArray or float + cosine of the day of year + day_of_year_sin: xr.DataArray or float + sine of the day of year """ logger.info("Calculating day of year") # Get the day of year - day_of_year = ds.time.dt.dayofyear + day_of_year = time.dt.dayofyear # Cyclic encoding of day of year - use 366 to include leap years! day_of_year_cos, day_of_year_sin = cyclic_encoding(day_of_year, 366) - # Assign to the dataset - ds = ds.assign(day_of_year_sin=day_of_year_sin) - ds = ds.assign(day_of_year_cos=day_of_year_cos) + if isinstance(day_of_year_cos, xr.DataArray): + # Add attributes + day_of_year_cos.name = "day_of_year_cos" + + if isinstance(day_of_year_sin, xr.DataArray): + # Add attributes + day_of_year_sin.name = "day_of_year_sin" - return ds + return day_of_year_cos, day_of_year_sin def cyclic_encoding(data_array, da_max): From 26455bc2aff82e40a2c04cd526afd9e2577457d6 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 6 Dec 2024 11:24:19 +0000 Subject: [PATCH 17/96] Add optional 'attributes' to the config of 'derived_variables' and check the attributes of the derived variable data-array --- mllam_data_prep/config.py | 10 +- mllam_data_prep/create_dataset.py | 1 + mllam_data_prep/derived_variables.py | 172 +++++++++++++++++++++++---- 3 files changed, 162 insertions(+), 21 deletions(-) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index be72de9..c6192d1 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -78,6 +78,7 @@ class DerivedVariable: kwargs: Dict[str, str] function: str + attributes: Dict[str, Any] = None @dataclass @@ -148,7 +149,8 @@ class InputDataset: 1) the path to the dataset, 2) the expected dimensions of the dataset, 3) the variables to select from the dataset (and optionally subsection - along the coordinates for each variable) and finally + along the coordinates for each variable) and/or the variables to derive + from the dataset, and 
finally 4) the method by which the dimensions and variables of the dataset are mapped to one of the output variables (this includes stacking of all the selected variables into a new single variable along a new coordinate, @@ -179,6 +181,12 @@ class InputDataset: (e.g. two datasets that coincide in space and time will only differ in the feature dimension, so the two will be combined by concatenating along the feature dimension). If a single shared coordinate cannot be found then an exception will be raised. + derived_variables: Dict[str, DerivedVariable] + Dictionary of variables to derive from the dataset, where the keys are the variable names and + the values are dictionaries defining the necessary function and kwargs. E.g. + `{"toa_radiation": {"kwargs": {"time": "time", "lat": "lat", "lon": "lon"}, "function": "calculate_toa_radiation"}}` + would derive the "toa_radiation" variable using the `calculate_toa_radiation` function, which + takes `time`, `lat` and `lon` as arguments. """ path: str diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 1a2f389..4ce5e14 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -148,6 +148,7 @@ def create_dataset(config: Config): except Exception as ex: raise Exception( f"Error loading dataset {dataset_name} from {path}" + f" or deriving variables '{', '.join(list(derived_variables.keys()))}'." 
) from ex _check_dataset_attributes( ds=ds, diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 760e0b3..cda1bdf 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -40,6 +40,7 @@ def derive_variables(fp, derived_variables, chunking): for _, derived_variable in derived_variables.items(): required_variables = derived_variable.kwargs function_name = derived_variable.function + derived_variable_attributes = derived_variable.attributes or {} ds_input = ds[required_variables.keys()] # Any coordinates needed for the derivation, for which chunking should be performed, @@ -61,35 +62,50 @@ def derive_variables(fp, derived_variables, chunking): # Calculate the derived variable kwargs = {v: ds_input[k] for k, v in required_variables.items()} - func = get_derived_variable_function(function_name) + func = _get_derived_variable_function(function_name) derived_field = func(**kwargs) - # Some of the derived variables include two components, since - # they are cyclically encoded (cos and sin parts) + # Check the derived field(s) + derived_field = _check_field( + derived_field, + derived_variable_attributes, + ds_input, + required_coordinates, + chunks, + ) + + # Add the derived field(s) to the subset if isinstance(derived_field, xr.DataArray): - derived_field = _return_dropped_coordinates( - derived_field, ds_input, required_coordinates, chunks - ) ds_subset[derived_field.name] = derived_field - elif isinstance(derived_field, tuple): + elif isinstance(derived_field, tuple) and all( + isinstance(field, xr.DataArray) for field in derived_field + ): for field in derived_field: - field = _return_dropped_coordinates( - field, ds_input, required_coordinates, chunks - ) ds_subset[field.name] = field + else: + raise TypeError( + "Expected an instance of xr.DataArray or tuple(xr.DataArray)," + f" but got {type(derived_field)}." 
+ ) return ds_subset -def get_derived_variable_function(function_namespace): +def _get_derived_variable_function(function_namespace): """ - Function for returning the function to be used to derive + Function for getting the function for deriving the specified variable. - 1. Check if the function to use is in globals() - 2. If it is in globals then call it - 3. If it isn't in globals() then import the necessary module - before calling it + Parameters + ---------- + function_namespace: str + The full function namespace or just the function name + if it is a function included in this module. + + Returns + ------- + function: object + Function for deriving the specified variable """ # Get the name of the calling module calling_module = globals()["__name__"] @@ -127,13 +143,111 @@ def get_derived_variable_function(function_namespace): return function -def _return_dropped_coordinates(derived_field, ds_input, required_coordinates, chunks): - """Return coordinates that have been reset.""" +def _check_field( + derived_field, derived_field_attributes, ds_input, required_coordinates, chunks +): + """ + Check the derived field. + + Parameters + ---------- + derived_field: xr.DataArray or tuple + The derived variable + derived_field_attributes: dict + Dictionary with attributes for the derived variables. + Defined in the config file. + ds_input: xr.Dataset + xarray dataset with variables needed to derive the specified variable + required_coordinates: list + List of coordinates required for deriving the specified variable + chunks: dict + Dictionary with keys as the dimensions to chunk along and values + with the chunk size, only inbcluding the dimensions that are included + in the output as well. 
+ + Returns + ------- + derived_field: xr.DataArray or tuple + The derived field + """ + if isinstance(derived_field, xr.DataArray): + derived_field = _check_attributes(derived_field, derived_field_attributes) + derived_field = _return_dropped_coordinates( + derived_field, ds_input, required_coordinates, chunks + ) + elif isinstance(derived_field, tuple) and all( + isinstance(field, xr.DataArray) for field in derived_field + ): + for field in derived_field: + field = _check_attributes(field, derived_field_attributes) + field = _return_dropped_coordinates( + field, ds_input, required_coordinates, chunks + ) + else: + raise TypeError( + "Expected an instance of xr.DataArray or tuple(xr.DataArray)," + f" but got {type(derived_field)}." + ) + + return derived_field + + +def _check_attributes(field, field_attributes): + """ + Check the attributes of the derived variable. + + Parameters + ---------- + field: xr.DataArray or tuple + The derived field + field_attributes: dict + Dictionary with attributes for the derived variables. + Defined in the config file. + + Returns + ------- + field: xr.DataArray or tuple + The derived field + """ + for attribute in ["units", "long_name"]: + if attribute not in field.attrs or field.attrs[attribute] is None: + if attribute in field_attributes.keys(): + field.attrs[attribute] = field_attributes[attribute] + else: + # The expected attributes are empty and the attributes have not been + # set during the calculation of the derived variable + raise ValueError( + f"The attribute '{attribute}' has not been set for the derived" + f" variable '{field.name}' (most likely because you are using a" + " function external to `mlllam-data-prep` to derive the field)." + " This attribute has not been defined in the 'attributes' section" + " of the config file either. Make sure that you add it to the" + f" 'attributes' section of the derived variable '{field.name}'." 
+ ) + else: + if attribute in field_attributes.keys(): + logger.warning( + f"The attribute '{attribute}' of the derived field" + f" {field.name} is being overwritten from" + f" '{field.attrs[attribute]}' to" + f" '{field_attributes[attribute]}' according" + " to specification in the config file." + ) + field.attrs[attribute] = field_attributes[attribute] + else: + # Attributes are set and nothing has been defined in the config file + pass + + return field + + +def _return_dropped_coordinates(field, ds_input, required_coordinates, chunks): + """Return the coordinates that have been reset.""" for req_coord in required_coordinates: if req_coord in chunks: - derived_field.coords[req_coord] = ds_input[req_coord] + field.coords[req_coord] = ds_input[req_coord] - return derived_field + return field def calculate_toa_radiation(lat, lon, time): @@ -179,6 +293,8 @@ def calculate_toa_radiation(lat, lon, time): if isinstance(toa_radiation, xr.DataArray): # Add attributes toa_radiation.name = "toa_radiation" + toa_radiation.attrs["long_name"] = "top-of-the-atmosphere radiation" + toa_radiation.attrs["units"] = "W*m**-2" return toa_radiation @@ -210,10 +326,18 @@ def calculate_hour_of_day(time): if isinstance(hour_of_day_cos, xr.DataArray): # Add attributes hour_of_day_cos.name = "hour_of_day_cos" + hour_of_day_cos.attrs[ + "long_name" + ] = "Cosine component of cyclically encoded hour of day" + hour_of_day_cos.attrs["units"] = "1" if isinstance(hour_of_day_sin, xr.DataArray): # Add attributes hour_of_day_sin.name = "hour_of_day_sin" + hour_of_day_sin.attrs[ + "long_name" + ] = "Sine component of cyclically encoded hour of day" + hour_of_day_sin.attrs["units"] = "1" return hour_of_day_cos, hour_of_day_sin @@ -245,10 +369,18 @@ def calculate_day_of_year(time): if isinstance(day_of_year_cos, xr.DataArray): # Add attributes day_of_year_cos.name = "day_of_year_cos" + day_of_year_cos.attrs[ + "long_name" + ] = "Cosine component of cyclically encoded day of year" + 
day_of_year_cos.attrs["units"] = "1" if isinstance(day_of_year_sin, xr.DataArray): # Add attributes day_of_year_sin.name = "day_of_year_sin" + day_of_year_sin.attrs[ + "long_name" + ] = "Sine component of cyclically encoded day of year" + day_of_year_sin.attrs["units"] = "1" return day_of_year_cos, day_of_year_sin From fbb606504b48df6fa1b7925ca52f67f721760812 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Mon, 9 Dec 2024 07:42:16 +0000 Subject: [PATCH 18/96] Add dummy function for getting lat,lon (preparation for #33) --- mllam_data_prep/derived_variables.py | 35 +++++++++++++++++++++------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index cda1bdf..b6b67db 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -38,19 +38,25 @@ def derive_variables(fp, derived_variables, chunking): ds_subset = xr.Dataset() ds_subset.attrs.update(ds.attrs) for _, derived_variable in derived_variables.items(): - required_variables = derived_variable.kwargs + required_kwargs = derived_variable.kwargs function_name = derived_variable.function derived_variable_attributes = derived_variable.attributes or {} - ds_input = ds[required_variables.keys()] + + # Separate the lat,lon from the required variables as these will be derived separately + latlon_coords_to_include = {} + for k, v in list(required_kwargs.items()): + if k in ["lat", "lon"]: + latlon_coords_to_include[k] = required_kwargs.pop(k) + + # Subset the dataset + ds_input = ds[required_kwargs.keys()] # Any coordinates needed for the derivation, for which chunking should be performed, - # should be converted to variables since it is not possible for coordinates to be - # chunked dask arrays + # should be converted to variables since it is not possible for *indexed* coordinates + # to be chunked dask arrays chunks = {d: chunking.get(d, int(ds_input[d].count())) for d in ds_input.dims} 
required_coordinates = [ - req_var - for req_var in required_variables.keys() - if req_var in ds_input.coords + req_var for req_var in required_kwargs.keys() if req_var in ds_input.coords ] ds_input = ds_input.drop_indexes(required_coordinates, errors="ignore") for req_coord in required_coordinates: @@ -60,9 +66,15 @@ def derive_variables(fp, derived_variables, chunking): # Chunk the data variables ds_input = ds_input.chunk(chunks) - # Calculate the derived variable - kwargs = {v: ds_input[k] for k, v in required_variables.items()} + # Add function arguments to kwargs + kwargs = {} + if len(latlon_coords_to_include): + latlon = get_latlon_coords_for_input(ds) + for k, v in latlon_coords_to_include.items(): + kwargs[v] = latlon[k] + kwargs.update({v: ds_input[k] for k, v in required_kwargs.items()}) func = _get_derived_variable_function(function_name) + # Calculate the derived variable derived_field = func(**kwargs) # Check the derived field(s) @@ -408,3 +420,8 @@ def cyclic_encoding(data_array, da_max): data_array_cos = np.cos((data_array / da_max) * 2 * np.pi) return data_array_cos, data_array_sin + + +def get_latlon_coords_for_input(ds_input): + """Dummy function for getting lat and lon.""" + return ds_input[["lat", "lon"]].chunk(-1, -1) From 3a12f4839fcfaa72f11cc5e05736db7fcbb6cd0f Mon Sep 17 00:00:00 2001 From: ealerskans Date: Mon, 9 Dec 2024 10:37:13 +0000 Subject: [PATCH 19/96] Add function for chunking data and checking the chunk size --- mllam_data_prep/derived_variables.py | 53 ++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index b6b67db..a985520 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -63,8 +63,8 @@ def derive_variables(fp, derived_variables, chunking): if req_coord in chunks: ds_input = ds_input.reset_coords(req_coord) - # Chunk the data variables - ds_input = 
ds_input.chunk(chunks) + # Chunk the dataset + ds_input = _chunk_dataset(ds_input, chunks) # Add function arguments to kwargs kwargs = {} @@ -103,6 +103,55 @@ def derive_variables(fp, derived_variables, chunking): return ds_subset +def _chunk_dataset(ds, chunks): + """ + Chunk dataset and check the chunk size. + + Parameters + ---------- + ds: xr.Dataset + Dataset to be chunked + chunks: dict + Dictionary with keys as dimensions to be chunked and + chunk sizes as the values + + Returns + ------- + ds: xr.Dataset + Dataset with chunking applied + """ + # Define the memory limit check + memory_limit_check = 1 * 1024**3 # 1 GB + + # Check the chunk size + for var_name, var_data in ds.data_vars.items(): + total_size = 1 + + for dim, chunk_size in chunks.items(): + dim_size = ds.sizes.get(dim, None) + if dim_size is None: + raise KeyError(f"Dimension '{dim}' not found in the dataset.") + total_size *= chunk_size + + dtype = var_data.dtype + bytes_per_element = np.dtype(dtype).itemsize + + memory_usage = total_size * bytes_per_element + + if memory_usage > memory_limit_check: + logger.warning( + f"The chunk size for '{var_name}' exceeds '{memory_limit_check}' GB." 
+ ) + + # Try chunking + try: + ds = ds.chunk(chunks) + except Exception as ex: + raise Exception(f"Error chunking dataset: {ex}") + + return ds + + def _get_derived_variable_function(function_namespace): """ Function for getting the function for deriving From 3ace21989c4642d29b3a24b4b0abcc91bb6522f7 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Mon, 9 Dec 2024 11:56:23 +0000 Subject: [PATCH 20/96] Add back coordinates on the subset instead of for each derived variable individually --- mllam_data_prep/derived_variables.py | 37 ++++++++-------------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index a985520..d0fe2fa 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -78,13 +78,7 @@ def derive_variables(fp, derived_variables, chunking): derived_field = func(**kwargs) # Check the derived field(s) - derived_field = _check_field( - derived_field, - derived_variable_attributes, - ds_input, - required_coordinates, - chunks, - ) + derived_field = _check_field(derived_field, derived_variable_attributes) # Add the derived field(s) to the subset if isinstance(derived_field, xr.DataArray): @@ -100,6 +94,11 @@ def derive_variables(fp, derived_variables, chunking): f" but got {type(derived_field)}." ) + # Add back dropped coordinates + ds_subset = _return_dropped_coordinates( + ds_subset, ds_input, required_coordinates, chunks + ) + return ds_subset @@ -204,9 +203,7 @@ def _get_derived_variable_function(function_namespace): return function -def _check_field( - derived_field, derived_field_attributes, ds_input, required_coordinates, chunks -): +def _check_field(derived_field, derived_field_attributes): """ Check the derived field. @@ -217,14 +214,6 @@ def _check_field( derived_field_attributes: dict Dictionary with attributes for the derived variables. Defined in the config file. 
- ds_input: xr.Dataset - xarray dataset with variables needed to derive the specified variable - required_coordinates: list - List of coordinates required for deriving the specified variable - chunks: dict - Dictionary with keys as the dimensions to chunk along and values - with the chunk size, only inbcluding the dimensions that are included - in the output as well. Returns ------- @@ -233,17 +222,11 @@ def _check_field( """ if isinstance(derived_field, xr.DataArray): derived_field = _check_attributes(derived_field, derived_field_attributes) - derived_field = _return_dropped_coordinates( - derived_field, ds_input, required_coordinates, chunks - ) elif isinstance(derived_field, tuple) and all( isinstance(field, xr.DataArray) for field in derived_field ): for field in derived_field: field = _check_attributes(field, derived_field_attributes) - field = _return_dropped_coordinates( - field, ds_input, required_coordinates, chunks - ) else: raise TypeError( "Expected an instance of xr.DataArray or tuple(xr.DataArray)," @@ -302,13 +285,13 @@ def _check_attributes(field, field_attributes): return field -def _return_dropped_coordinates(field, ds_input, required_coordinates, chunks): +def _return_dropped_coordinates(ds_subset, ds_input, required_coordinates, chunks): """Return the coordinates that have been reset.""" for req_coord in required_coordinates: if req_coord in chunks: - field.coords[req_coord] = ds_input[req_coord] + ds_subset.coords[req_coord] = ds_input[req_coord] - return field + return ds_subset def calculate_toa_radiation(lat, lon, time): From a6b61b0ac6c00768a2c7c9f88bf11175fd3d3f3a Mon Sep 17 00:00:00 2001 From: ealerskans Date: Mon, 9 Dec 2024 11:57:05 +0000 Subject: [PATCH 21/96] Add 'hour_of_day' to example config --- example.danra.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/example.danra.yaml b/example.danra.yaml index b351a52..378c78a 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -83,6 +83,10 @@ inputs: lat: lat lon: lon 
function: mllam_data_prep.derived_variables.calculate_toa_radiation + hour_of_day: + kwargs: + time: time + function: mllam_data_prep.derived_variables.calculate_hour_of_day dim_mapping: time: method: rename From 9dcace68963a359af644ed1219e24099c218d29a Mon Sep 17 00:00:00 2001 From: ealerskans Date: Mon, 9 Dec 2024 13:15:15 +0000 Subject: [PATCH 22/96] Rename derived variables dataset section in the example config --- example.danra.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example.danra.yaml b/example.danra.yaml index bbf3dc7..9d3f1cf 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -73,7 +73,7 @@ inputs: name_format: "{var_name}" target_output_variable: forcing - danra_additional_forcings: + danra_derived_forcings: path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr dims: [time, x, y] derived_variables: From aba675764def61e9797b80933236a5e9c3d5b2b9 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 10 Dec 2024 07:35:20 +0000 Subject: [PATCH 23/96] Remove f-string from 'name_format' --- example.danra.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example.danra.yaml b/example.danra.yaml index 9d3f1cf..f1fa443 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -96,7 +96,7 @@ inputs: dims: [x, y] forcing_feature: method: stack_variables_by_var_name - name_format: f"{var_name}" + name_format: "{var_name}" target_output_variable: forcing danra_lsm: From 143edb638a78ef897265b415fd4a176e06c2a491 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 10 Dec 2024 07:52:00 +0000 Subject: [PATCH 24/96] Update README --- README.md | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 5f5fcdf..7d9f947 100644 --- a/README.md +++ b/README.md @@ -187,6 +187,32 @@ inputs: name_format: "{var_name}" target_output_variable: forcing + danra_derived_forcings: + path: 
https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr + dims: [time, x, y] + derived_variables: + toa_radiation: + kwargs: + time: time + lat: lat + lon: lon + function: mllam_data_prep.derived_variables.calculate_toa_radiation + hour_of_day: + kwargs: + time: time + function: mllam_data_prep.derived_variables.calculate_hour_of_day + dim_mapping: + time: + method: rename + dim: time + grid_index: + method: stack + dims: [x, y] + forcing_feature: + method: stack_variables_by_var_name + name_format: "{var_name}" + target_output_variable: forcing + danra_lsm: path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/lsm.zarr dims: [x, y] @@ -286,15 +312,40 @@ inputs: grid_index: method: stack dims: [x, y] - target_architecture_variable: state + target_output_variable: state danra_surface: path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr dims: [time, x, y] variables: - # shouldn't really be using sea-surface pressure as "forcing", but don't - # have radiation varibles in danra yet - - pres_seasurface + # use surface incoming shortwave radiation as forcing + - swavr0m + dim_mapping: + time: + method: rename + dim: time + grid_index: + method: stack + dims: [x, y] + forcing_feature: + method: stack_variables_by_var_name + name_format: "{var_name}" + target_output_variable: forcing + + danra_derived_forcings: + path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr + dims: [time, x, y] + derived_variables: + toa_radiation: + kwargs: + time: time + lat: lat + lon: lon + function: mllam_data_prep.derived_variables.calculate_toa_radiation + hour_of_day: + kwargs: + time: time + function: mllam_data_prep.derived_variables.calculate_hour_of_day dim_mapping: time: method: rename @@ -305,7 +356,7 @@ inputs: forcing_feature: method: stack_variables_by_var_name name_format: "{var_name}" - target_architecture_variable: forcing + target_output_variable: forcing ... 
``` @@ -315,11 +366,15 @@ The `inputs` section defines the source datasets to extract data from. Each sour - `path`: the path to the source dataset. This can be a local path or a URL to e.g. a zarr dataset or netCDF file, anything that can be read by `xarray.open_dataset(...)`. - `dims`: the dimensions that the source dataset is expected to have. This is used to check that the source dataset has the expected dimensions and also makes it clearer in the config file what the dimensions of the source dataset are. - `variables`: selects which variables to extract from the source dataset. This may either be a list of variable names, or a dictionary where each key is the variable name and the value defines a dictionary of coordinates to do selection on. When doing selection you may also optionally define the units of the variable to check that the units of the variable match the units of the variable in the model architecture. -- `target_architecture_variable`: the variable in the model architecture that the source dataset should be mapped to. +- `target_output_variable`: the variable in the model architecture that the source dataset should be mapped to. - `dim_mapping`: defines how the dimensions of the source dataset should be mapped to the dimensions of the model architecture. This is done by defining a method to apply to each dimension. The methods are: - `rename`: simply rename the dimension to the new name - `stack`: stack the listed dimension to create the dimension in the output - `stack_variables_by_var_name`: stack the dimension into the new dimension, and also stack the variable name into the new variable name. This is useful when you have multiple variables with the same dimensions that you want to stack into a single variable. +- `derived_variables`: defines the variables to be derived from the variables available in the source dataset. 
This should be a dictionary where each key is the variable to be derived and the value defines a dictionary with additional information. +- `function`: the function to be used to derive a variable. This should be a string and may either be the full namespace of the function (e.g. `mllam_data_prep.derived_variables.calculate_toa_radiation`) or in case the function is included in the `mllam_data_prep.derived_variables` module it is enough with the function name only. +- `kwargs`: arguments for the function used to derive a variable. This is a dictionary where each key is the variable name to select from the source dataset and each value is the named argument to `function`. +- `attributes`: section where users can specify the attributes `units` and `long_name` as a dictionary (not included in the example config file), where the keys are the attribute names and the values are strings. If using a function defined in `mllam_data_prep.derived_variables` this section is optional as the attributes should already be defined. In this case, adding the attributes to the config file will overwrite the already-defined ones. If using an external function, where the attributes `units` and `long_name` are not set, this section is a requirement. 
### Config schema versioning From 12e057571e6e9bcc78a5bd7706198ea31536f0a2 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 11 Dec 2024 07:57:11 +0000 Subject: [PATCH 25/96] Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cbb8ea1..da3b7b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- add ability to derive variables from input datasets [\#34](https://github.com/mllam/mllam-data-prep/pull/34) - add github PR template to guide development process on github [\#44](https://github.com/mllam/mllam-data-prep/pull/44), @leifdenby ## [v0.5.0](https://github.com/mllam/mllam-data-prep/releases/tag/v0.5.0) From 000ce925f51b4c5add0d3c08f53ccc522f4cfb47 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 11 Dec 2024 12:57:06 +0000 Subject: [PATCH 26/96] Make functions for deriving toa_radiation and datetime forcings actually handle both xr.DataArray and scalars --- mllam_data_prep/derived_variables.py | 35 ++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index d0fe2fa..e5d0889 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -1,3 +1,4 @@ +import datetime import importlib import sys @@ -317,8 +318,18 @@ def calculate_toa_radiation(lat, lon, time): # Solar constant E0 = 1366 # W*m**-2 - day = time.dt.dayofyear - hr_utc = time.dt.hour + # Different handling if xr.DataArray or datetime object + if isinstance(time, xr.DataArray): + day = time.dt.dayofyear + hr_utc = time.dt.hour + elif isinstance(time, datetime.datetime): + day = time.timetuple().tm_yday + hr_utc = time.hour + else: + raise TypeError( + "Expected an instance of xr.DataArray or datetime object," + f" but got {type(time)}." 
+ ) # Eq. 1.6.1a in Solar Engineering of Thermal Processes 4th ed. dec = np.pi / 180 * 23.45 * np.sin(2 * np.pi * (284 + day) / 365) @@ -362,7 +373,15 @@ def calculate_hour_of_day(time): logger.info("Calculating hour of day") # Get the hour of the day - hour_of_day = time.dt.hour + if isinstance(time, xr.DataArray): + hour_of_day = time.dt.hour + elif isinstance(time, datetime.datetime): + hour_of_day = time.hour + else: + raise TypeError( + "Expected an instance of xr.DataArray or datetime object," + f" but got {type(time)}." + ) # Cyclic encoding of hour of day hour_of_day_cos, hour_of_day_sin = cyclic_encoding(hour_of_day, 24) @@ -405,7 +424,15 @@ def calculate_day_of_year(time): logger.info("Calculating day of year") # Get the day of year - day_of_year = time.dt.dayofyear + if isinstance(time, xr.DataArray): + day_of_year = time.dt.dayofyear + elif isinstance(time, datetime.datetime): + day_of_year = time.timetuple().tm_yday + else: + raise TypeError( + "Expected an instance of xr.DataArray or datetime object," + f" but got {type(time)}." + ) # Cyclic encoding of day of year - use 366 to include leap years! 
day_of_year_cos, day_of_year_sin = cyclic_encoding(day_of_year, 366) From 0af6319922b075e8a44a89c6af091939fdfa89cc Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 11 Dec 2024 13:05:22 +0000 Subject: [PATCH 27/96] Update docstring and variable names in 'cyclic_encoding' --- mllam_data_prep/derived_variables.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index e5d0889..6217ade 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -456,29 +456,29 @@ def calculate_day_of_year(time): return day_of_year_cos, day_of_year_sin -def cyclic_encoding(data_array, da_max): +def cyclic_encoding(data, data_max): """ Cyclic encoding of data Parameters ---------- - da : xr.DataArray - xarray data-array that should be cyclically encoded - da_max: int/float - Maximum possible value of input data-array + data : xr.DataArray, float, or int + Data that should be cyclically encoded + data_max: int or float + Maximum possible value of input data. Should be greater than 0. 
Returns ------- - da_cos: xr.DataArray - Cosine part of cyclically encoded input data-array - da_sin: xr.DataArray - Sine part of cyclically encoded input data-array + data_cos: xr.DataArray, float, or int + Cosine part of cyclically encoded input data + data_sin: xr.DataArray, float, or int + Sine part of cyclically encoded input data """ - data_array_sin = np.sin((data_array / da_max) * 2 * np.pi) - data_array_cos = np.cos((data_array / da_max) * 2 * np.pi) + data_sin = np.sin((data / data_max) * 2 * np.pi) + data_cos = np.cos((data / data_max) * 2 * np.pi) - return data_array_cos, data_array_sin + return data_cos, data_sin def get_latlon_coords_for_input(ds_input): From 284db913e5dd7e190a764c075df1d96d2080a092 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 10:15:56 +0000 Subject: [PATCH 28/96] Add ranges to lat and lon in docstring --- mllam_data_prep/derived_variables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 6217ade..37ccc67 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -302,9 +302,9 @@ def calculate_toa_radiation(lat, lon, time): Parameters ---------- lat : xr.DataArray or float - Latitude values + Latitude values. Should be in the range [-90, 90] lon : xr.DataArray or float - Longitude values + Longitude values. 
Should be in the range [-180, 180] or [0, 360] time : xr.DataArray or datetime object Time From ba161d23c5948797465297576024baf7e8c335ad Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 10:20:22 +0000 Subject: [PATCH 29/96] Add github username to CHANGELOG entry --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index da3b7b6..c30bb81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- add ability to derive variables from input datasets [\#34](https://github.com/mllam/mllam-data-prep/pull/34) +- add ability to derive variables from input datasets [\#34](https://github.com/mllam/mllam-data-prep/pull/34), @ealerskans - add github PR template to guide development process on github [\#44](https://github.com/mllam/mllam-data-prep/pull/44), @leifdenby ## [v0.5.0](https://github.com/mllam/mllam-data-prep/releases/tag/v0.5.0) From e3d590cae070498d7240268d98c8515850b8480c Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 10:24:19 +0000 Subject: [PATCH 30/96] Update DerivedVariable attributes to be Dict[str, str] --- mllam_data_prep/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index c6069f4..82bad84 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -66,7 +66,7 @@ class DerivedVariable: kwargs: Dict[str, str] function: str - attributes: Dict[str, Any] = None + attributes: Dict[str, str] = None @dataclass From f8cae4ffb9bff918131f73c92a64e1812702800a Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 10:27:24 +0000 Subject: [PATCH 31/96] Add missing attribute to docstring --- mllam_data_prep/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mllam_data_prep/config.py 
b/mllam_data_prep/config.py index 82bad84..f20c3b2 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -62,6 +62,7 @@ class DerivedVariable: Attributes: kwargs: Variables required for calculating the derived variable. function: Function used to calculate the derived variable. + attributes: Attributes (e.g. `units` and `long_name`) for the derived variable. """ kwargs: Dict[str, str] From 8470c8263fcde06b46f606d7ae5045348ae41997 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 11:54:03 +0000 Subject: [PATCH 32/96] Change var names in 'calculate_toa_radiation' --- mllam_data_prep/derived_variables.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 37ccc67..57701f6 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -316,15 +316,15 @@ def calculate_toa_radiation(lat, lon, time): logger.info("Calculating top-of-atmosphere radiation") # Solar constant - E0 = 1366 # W*m**-2 + solar_constant = 1366 # W*m**-2 # Different handling if xr.DataArray or datetime object if isinstance(time, xr.DataArray): day = time.dt.dayofyear - hr_utc = time.dt.hour + hour_utc = time.dt.hour elif isinstance(time, datetime.datetime): day = time.timetuple().tm_yday - hr_utc = time.hour + hour_utc = time.hour else: raise TypeError( "Expected an instance of xr.DataArray or datetime object," @@ -332,18 +332,21 @@ def calculate_toa_radiation(lat, lon, time): ) # Eq. 1.6.1a in Solar Engineering of Thermal Processes 4th ed. + # dec: declination - angular position of the sun at solar noon w.r.t. + # the plane of the equator dec = np.pi / 180 * 23.45 * np.sin(2 * np.pi * (284 + day) / 365) - hr_lst = hr_utc + lon / 15 - hr_angle = 15 * (hr_lst - 12) + utc_solar_time = hour_utc + lon / 15 + hour_angle = 15 * (utc_solar_time - 12) # Eq. 1.6.2 with beta=0 in Solar Engineering of Thermal Processes 4th ed. 
+ # cos_sza: Cosine of solar zenith angle cos_sza = np.sin(lat * np.pi / 180) * np.sin(dec) + np.cos( lat * np.pi / 180 - ) * np.cos(dec) * np.cos(hr_angle * np.pi / 180) + ) * np.cos(dec) * np.cos(hour_angle * np.pi / 180) # Where TOA radiation is negative, set to 0 - toa_radiation = xr.where(E0 * cos_sza < 0, 0, E0 * cos_sza) + toa_radiation = xr.where(solar_constant * cos_sza < 0, 0, solar_constant * cos_sza) if isinstance(toa_radiation, xr.DataArray): # Add attributes From 69afdd3d1d54e6c7c59b74c34ab2762c2cfd95a5 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 12:47:25 +0000 Subject: [PATCH 33/96] Remove unnecessary 'or None' --- mllam_data_prep/create_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 1684e61..73ae043 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -127,8 +127,8 @@ def create_dataset(config: Config): for dataset_name, input_config in config.inputs.items(): path = input_config.path - variables = input_config.variables or None - derived_variables = input_config.derived_variables or None + variables = input_config.variables + derived_variables = input_config.derived_variables target_output_var = input_config.target_output_variable expected_input_attributes = input_config.attributes or {} expected_input_var_dims = input_config.dims From e17ed8b7d1f2ac654ee7bf22e5ab1f6172f3e69c Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 12:51:24 +0000 Subject: [PATCH 34/96] Use var name 'dim' instead of 'd' --- mllam_data_prep/create_dataset.py | 2 +- mllam_data_prep/derived_variables.py | 4 +++- mllam_data_prep/ops/loading.py | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 73ae043..ce95b58 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ 
-221,7 +221,7 @@ def create_dataset(config: Config): # default to making a single chunk for each dimension if chunksize is not specified # in the config logger.info(f"Chunking dataset with {chunking_config}") - chunks = {d: chunking_config.get(d, int(ds[d].count())) for d in ds.dims} + chunks = {dim: chunking_config.get(dim, int(ds[dim].count())) for dim in ds.dims} ds = ds.chunk(chunks) splitting = config.output.splitting diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 57701f6..6ec212e 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -55,7 +55,9 @@ def derive_variables(fp, derived_variables, chunking): # Any coordinates needed for the derivation, for which chunking should be performed, # should be converted to variables since it is not possible for *indexed* coordinates # to be chunked dask arrays - chunks = {d: chunking.get(d, int(ds_input[d].count())) for d in ds_input.dims} + chunks = { + dim: chunking.get(dim, int(ds_input[dim].count())) for dim in ds_input.dims + } required_coordinates = [ req_var for req_var in required_kwargs.keys() if req_var in ds_input.coords ] diff --git a/mllam_data_prep/ops/loading.py b/mllam_data_prep/ops/loading.py index fc5d5bc..5275c57 100644 --- a/mllam_data_prep/ops/loading.py +++ b/mllam_data_prep/ops/loading.py @@ -58,7 +58,9 @@ def load_and_subset_dataset(fp, variables, chunking): else: raise ValueError("The `variables` argument should be a list or a dictionary") - chunks = {d: chunking.get(d, int(ds_subset[d].count())) for d in ds_subset.dims} + chunks = { + dim: chunking.get(dim, int(ds_subset[dim].count())) for dim in ds_subset.dims + } ds_subset = ds_subset.chunk(chunks) return ds_subset From 23b119f7bd3972223de7410d028ab8bc42ccecac Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 12:56:36 +0000 Subject: [PATCH 35/96] Use var names 'key, val' instead of 'k, v' --- mllam_data_prep/create_dataset.py | 7 +++++-- 
mllam_data_prep/derived_variables.py | 12 ++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index ce95b58..b013ac3 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -31,11 +31,14 @@ def _check_dataset_attributes(ds, expected_attributes, dataset_name): # check for attributes having the wrong value incorrect_attributes = { - k: v for k, v in expected_attributes.items() if ds.attrs[k] != v + key: val for key, val in expected_attributes.items() if ds.attrs[key] != val } if len(incorrect_attributes) > 0: s_list = "\n".join( - [f"{k}: {v} != {ds.attrs[k]}" for k, v in incorrect_attributes.items()] + [ + f"{key}: {val} != {ds.attrs[key]}" + for key, val in incorrect_attributes.items() + ] ) raise ValueError( f"Dataset {dataset_name} has the following incorrect attributes: {s_list}" diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 6ec212e..f4bc516 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -45,9 +45,9 @@ def derive_variables(fp, derived_variables, chunking): # Separate the lat,lon from the required variables as these will be derived separately latlon_coords_to_include = {} - for k, v in list(required_kwargs.items()): - if k in ["lat", "lon"]: - latlon_coords_to_include[k] = required_kwargs.pop(k) + for key in list(required_kwargs.keys()): + if key in ["lat", "lon"]: + latlon_coords_to_include[key] = required_kwargs.pop(key) # Subset the dataset ds_input = ds[required_kwargs.keys()] @@ -73,9 +73,9 @@ def derive_variables(fp, derived_variables, chunking): kwargs = {} if len(latlon_coords_to_include): latlon = get_latlon_coords_for_input(ds) - for k, v in latlon_coords_to_include.items(): - kwargs[v] = latlon[k] - kwargs.update({v: ds_input[k] for k, v in required_kwargs.items()}) + for key, val in latlon_coords_to_include.items(): + kwargs[val] 
= latlon[key] + kwargs.update({val: ds_input[key] for key, val in required_kwargs.items()}) func = _get_derived_variable_function(function_name) # Calculate the derived variable derived_field = func(**kwargs) From 2ce53c7a485549ae54f09262eac8e59692f37629 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 13:35:02 +0000 Subject: [PATCH 36/96] Move '_check_dataset_attributes' outside if statement --- mllam_data_prep/create_dataset.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index b013ac3..f38b619 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -148,11 +148,6 @@ def create_dataset(config: Config): raise Exception( f"Error loading dataset {dataset_name} from {path}" ) from ex - _check_dataset_attributes( - ds=ds, - expected_attributes=expected_input_attributes, - dataset_name=dataset_name, - ) if derived_variables: logger.info( @@ -169,11 +164,11 @@ def create_dataset(config: Config): f"Error loading dataset {dataset_name} from {path}" f" or deriving variables '{', '.join(list(derived_variables.keys()))}'." 
) from ex - _check_dataset_attributes( - ds=ds, - expected_attributes=expected_input_attributes, - dataset_name=dataset_name, - ) + _check_dataset_attributes( + ds=ds, + expected_attributes=expected_input_attributes, + dataset_name=dataset_name, + ) dim_mapping = input_config.dim_mapping From f1e3d778cbe034203c4b455278d65a11215af434 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 13:49:59 +0000 Subject: [PATCH 37/96] Set '{}' as default for 'attributes' and 'chunking' --- mllam_data_prep/config.py | 6 +++--- mllam_data_prep/create_dataset.py | 4 ++-- mllam_data_prep/derived_variables.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index f20c3b2..9bbc783 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -67,7 +67,7 @@ class DerivedVariable: kwargs: Dict[str, str] function: str - attributes: Dict[str, str] = None + attributes: Dict[str, str] = field(default_factory=dict) @dataclass @@ -184,7 +184,7 @@ class InputDataset: target_output_variable: str variables: Union[List[str], Dict[str, Dict[str, ValueSelection]]] = None derived_variables: Dict[str, DerivedVariable] = None - attributes: Dict[str, Any] = None + attributes: Dict[str, Any] = field(default_factory=dict) @dataclass @@ -284,7 +284,7 @@ class Output: variables: Dict[str, List[str]] coord_ranges: Dict[str, Range] = None - chunking: Dict[str, int] = None + chunking: Dict[str, int] = field(default_factory=dict) splitting: Splitting = None diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index f38b619..113b703 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -124,7 +124,7 @@ def create_dataset(config: Config): output_config = config.output output_coord_ranges = output_config.coord_ranges - chunking_config = config.output.chunking or {} + chunking_config = config.output.chunking dataarrays_by_target = defaultdict(list) @@ 
-133,7 +133,7 @@ def create_dataset(config: Config): variables = input_config.variables derived_variables = input_config.derived_variables target_output_var = input_config.target_output_variable - expected_input_attributes = input_config.attributes or {} + expected_input_attributes = input_config.attributes expected_input_var_dims = input_config.dims output_dims = output_config.variables[target_output_var] diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index f4bc516..f881bcb 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -41,7 +41,7 @@ def derive_variables(fp, derived_variables, chunking): for _, derived_variable in derived_variables.items(): required_kwargs = derived_variable.kwargs function_name = derived_variable.function - derived_variable_attributes = derived_variable.attributes or {} + derived_variable_attributes = derived_variable.attributes # Separate the lat,lon from the required variables as these will be derived separately latlon_coords_to_include = {} From 2afbb356bde396448dd4273de92a1837c83758c3 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 07:32:28 +0000 Subject: [PATCH 38/96] Make types more explicit --- mllam_data_prep/derived_variables.py | 69 ++++++++++++++++++---------- 1 file changed, 44 insertions(+), 25 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index f881bcb..4669a36 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -16,17 +16,17 @@ def derive_variables(fp, derived_variables, chunking): fp : str Filepath to the source dataset, for example the path to a zarr dataset or a netCDF file (anything that is supported by `xarray.open_dataset` will work) - derived_variables : dict + derived_variables : Dict[str, DerivedVariable] Dictionary with the variables to derive with keys as the variable names and values with entries for kwargs and 
function to use in the calculation - chunking: dict + chunking: Dict[str, int] Dictionary with keys as the dimensions to chunk along and values with the chunk size Returns ------- - ds : xr.Dataset + xr.Dataset Dataset with derived variables included """ logger.info("Deriving variables") @@ -113,7 +113,7 @@ def _chunk_dataset(ds, chunks): ---------- ds: xr.Dataset Dataset to be chunked - chunks: dict + chunks: Dict[str, int] Dictionary with keys as dimensions to be chunked and chunk sizes as the values @@ -212,15 +212,15 @@ def _check_field(derived_field, derived_field_attributes): Parameters ---------- - derived_field: xr.DataArray or tuple + derived_field: Union[xr.DataArray, Tuple[xr.DataArray]] The derived variable - derived_field_attributes: dict + derived_field_attributes: Dict[str, str] Dictionary with attributes for the derived variables. Defined in the config file. Returns ------- - derived_field: xr.DataArray or tuple + derived_field: Union[xr.DataArray, Tuple[xr.DataArray]] The derived field """ if isinstance(derived_field, xr.DataArray): @@ -245,15 +245,15 @@ def _check_attributes(field, field_attributes): Parameters ---------- - field: xr.DataArray or tuple + field: Union[xr.DataArray, Tuple[xr.DataArray]] The derived field - field_attributes: dict + field_attributes: Dict[str, str] Dictionary with attributes for the derived variables. Defined in the config file. Returns ------- - field: xr.DataArray or tuple + field: Union[xr.DataArray, Tuple[xr.DataArray]] The derived field """ for attribute in ["units", "long_name"]: @@ -289,7 +289,26 @@ def _check_attributes(field, field_attributes): def _return_dropped_coordinates(ds_subset, ds_input, required_coordinates, chunks): - """Return the coordinates that have been reset.""" + """ + Return the coordinates that have been reset. 
+ + Parameters + ---------- + ds_subset: xr.Dataset + Subsetted dataset with derived variables + ds_input: xr.Dataset + Input dataset for deriving variables + required_coordinates: List[str] + List of coordinates required for the derived variable + chunks: Dict[str, int] + Dictionary with keys as dimensions to be chunked and + chunk sizes as the values + + Returns + ------- + ds_subset: xr.Dataset + Subsetted dataset with dropped coordinates returned + """ for req_coord in required_coordinates: if req_coord in chunks: ds_subset.coords[req_coord] = ds_input[req_coord] @@ -303,16 +322,16 @@ def calculate_toa_radiation(lat, lon, time): Parameters ---------- - lat : xr.DataArray or float + lat : Union[xr.DataArray, float] Latitude values. Should be in the range [-90, 90] - lon : xr.DataArray or float + lon : Union[xr.DataArray, float] Longitude values. Should be in the range [-180, 180] or [0, 360] - time : xr.DataArray or datetime object + time : Union[xr.DataArray, datetime.datetime] Time Returns ------- - toa_radiation: xr.DataArray or float + toa_radiation : Union[xr.DataArray, float] TOA radiation data """ logger.info("Calculating top-of-atmosphere radiation") @@ -365,14 +384,14 @@ def calculate_hour_of_day(time): Parameters ---------- - time : xr.DataArray or datetime object + time : Union[xr.DataArray, datetime.datetime] Time Returns ------- - hour_of_day_cos: xr.DataArray or float + hour_of_day_cos: Union[xr.DataArray, float] cosine of the hour of day - hour_of_day_sin: xr.DataArray or float + hour_of_day_sin: Union[xr.DataArray, float] sine of the hour of day """ logger.info("Calculating hour of day") @@ -416,14 +435,14 @@ def calculate_day_of_year(time): Parameters ---------- - time : xr.DataArray or datetime object + time : Union[xr.DataArray, datetime.datetime] Time Returns ------- - day_of_year_cos: xr.DataArray or float + day_of_year_cos: Union[xr.DataArray, float] cosine of the day of year - day_of_year_sin: xr.DataArray or float + day_of_year_sin: 
Union[xr.DataArray, float] sine of the day of year """ logger.info("Calculating day of year") @@ -467,16 +486,16 @@ def cyclic_encoding(data, data_max): Parameters ---------- - data : xr.DataArray, float, or int + data : Union[xr.DataArray, float, int] Data that should be cyclically encoded - data_max: int or float + data_max: Union[int, float] Maximum possible value of input data. Should be greater than 0. Returns ------- - data_cos: xr.DataArray, float, or int + data_cos: Union[xr.DataArray, float, int] Cosine part of cyclically encoded input data - data_sin: xr.DataArray, float, or int + data_sin: Union[xr.DataArray, float, int] Sine part of cyclically encoded input data """ From 75797a206b8be9da6aa7b5376852ac6744524050 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 07:40:26 +0000 Subject: [PATCH 39/96] Rename 'ds_subset' to 'ds_derived_vars' and update comment for 'ds_input' --- mllam_data_prep/derived_variables.py | 34 +++++++++++++++------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 4669a36..a1898c1 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -36,8 +36,8 @@ def derive_variables(fp, derived_variables, chunking): except ValueError: ds = xr.open_dataset(fp) - ds_subset = xr.Dataset() - ds_subset.attrs.update(ds.attrs) + ds_derived_vars = xr.Dataset() + ds_derived_vars.attrs.update(ds.attrs) for _, derived_variable in derived_variables.items(): required_kwargs = derived_variable.kwargs function_name = derived_variable.function @@ -49,7 +49,7 @@ def derive_variables(fp, derived_variables, chunking): if key in ["lat", "lon"]: latlon_coords_to_include[key] = required_kwargs.pop(key) - # Subset the dataset + # Get input dataset for calculating derived variables ds_input = ds[required_kwargs.keys()] # Any coordinates needed for the derivation, for which chunking should be performed, @@ -83,14 
+83,14 @@ def derive_variables(fp, derived_variables, chunking): # Check the derived field(s) derived_field = _check_field(derived_field, derived_variable_attributes) - # Add the derived field(s) to the subset + # Add the derived field(s) to the dataset if isinstance(derived_field, xr.DataArray): - ds_subset[derived_field.name] = derived_field + ds_derived_vars[derived_field.name] = derived_field elif isinstance(derived_field, tuple) and all( isinstance(field, xr.DataArray) for field in derived_field ): for field in derived_field: - ds_subset[field.name] = field + ds_derived_vars[field.name] = field else: raise TypeError( "Expected an instance of xr.DataArray or tuple(xr.DataArray)," @@ -98,11 +98,11 @@ def derive_variables(fp, derived_variables, chunking): ) # Add back dropped coordinates - ds_subset = _return_dropped_coordinates( - ds_subset, ds_input, required_coordinates, chunks + ds_derived_vars = _return_dropped_coordinates( + ds_derived_vars, ds_input, required_coordinates, chunks ) - return ds_subset + return ds_derived_vars def _chunk_dataset(ds, chunks): @@ -288,14 +288,16 @@ def _check_attributes(field, field_attributes): return field -def _return_dropped_coordinates(ds_subset, ds_input, required_coordinates, chunks): +def _return_dropped_coordinates( + ds_derived_vars, ds_input, required_coordinates, chunks +): """ Return the coordinates that have been reset. 
Parameters ---------- - ds_subset: xr.Dataset - Subsetted dataset with derived variables + ds_derived_vars: xr.Dataset + Dataset with derived variables ds_input: xr.Dataset Input dataset for deriving variables required_coordinates: List[str] @@ -306,14 +308,14 @@ def _return_dropped_coordinates(ds_subset, ds_input, required_coordinates, chunk Returns ------- - ds_subset: xr.Dataset - Subsetted dataset with dropped coordinates returned + ds_derived_vars: xr.Dataset + Dataset with derived variables, now also with dropped coordinates returned """ for req_coord in required_coordinates: if req_coord in chunks: - ds_subset.coords[req_coord] = ds_input[req_coord] + ds_derived_vars.coords[req_coord] = ds_input[req_coord] - return ds_subset + return ds_derived_vars def calculate_toa_radiation(lat, lon, time): From 31578e81abc44725a38080b900fb9055e188274e Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 07:56:58 +0000 Subject: [PATCH 40/96] Add 'Optional[...]' to optional attributes --- mllam_data_prep/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 9bbc783..0029313 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -67,7 +67,7 @@ class DerivedVariable: kwargs: Dict[str, str] function: str - attributes: Dict[str, str] = field(default_factory=dict) + attributes: Optional[Dict[str, str]] = field(default_factory=dict) @dataclass @@ -182,9 +182,9 @@ class InputDataset: dims: List[str] dim_mapping: Dict[str, DimMapping] target_output_variable: str - variables: Union[List[str], Dict[str, Dict[str, ValueSelection]]] = None - derived_variables: Dict[str, DerivedVariable] = None - attributes: Dict[str, Any] = field(default_factory=dict) + variables: Optional[Union[List[str], Dict[str, Dict[str, ValueSelection]]]] = None + derived_variables: Optional[Dict[str, DerivedVariable]] = None + attributes: Optional[Dict[str, Any]] = field(default_factory=dict) 
@dataclass From 90e4cf2cb2046280b646975ead8ff9e12b7e10e6 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 08:39:02 +0000 Subject: [PATCH 41/96] Move loading of dataset to a separate function --- mllam_data_prep/create_dataset.py | 27 +++++++++++++--------- mllam_data_prep/derived_variables.py | 19 +++++----------- mllam_data_prep/ops/loading.py | 34 +++++++++++++++++++++------- 3 files changed, 48 insertions(+), 32 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 113b703..181292a 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -11,7 +11,7 @@ from . import __version__ from .config import Config, InvalidConfigException from .derived_variables import derive_variables -from .ops.loading import load_and_subset_dataset +from .ops.loading import load_dataset, subset_dataset from .ops.mapping import map_dims_and_variables from .ops.selection import select_by_kwargs from .ops.statistics import calc_stats @@ -138,32 +138,37 @@ def create_dataset(config: Config): output_dims = output_config.variables[target_output_var] + logger.info(f"Loading dataset {dataset_name} from {path}") + try: + ds_source = load_dataset(fp=path) + except Exception as ex: + raise Exception(f"Error loading dataset {dataset_name} from {path}") from ex + if variables: - logger.info(f"Loading dataset {dataset_name} from {path} and subsetting") + logger.info(f"Subsetting dataset {dataset_name}") try: - ds = load_and_subset_dataset( - fp=path, variables=variables, chunking=chunking_config + ds = subset_dataset( + ds=ds_source, variables=variables, chunking=chunking_config ) except Exception as ex: raise Exception( - f"Error loading dataset {dataset_name} from {path}" + f"Error subsetting dataset {dataset_name} from {path}" ) from ex if derived_variables: - logger.info( - f"Loading dataset {dataset_name} from {path} and deriving variables" - ) + logger.info(f"Deriving variables from {dataset_name}") 
try: ds = derive_variables( - fp=path, + ds=ds_source, derived_variables=derived_variables, chunking=chunking_config, ) except Exception as ex: raise Exception( - f"Error loading dataset {dataset_name} from {path}" - f" or deriving variables '{', '.join(list(derived_variables.keys()))}'." + f"Error deriving variables '{', '.join(list(derived_variables.keys()))}'" + f" from dataset {dataset_name} from {path}" ) from ex + _check_dataset_attributes( ds=ds, expected_attributes=expected_input_attributes, diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index a1898c1..8091e64 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -7,19 +7,18 @@ from loguru import logger -def derive_variables(fp, derived_variables, chunking): +def derive_variables(ds, derived_variables, chunking): """ Load the dataset, and derive the specified variables Parameters --------- - fp : str - Filepath to the source dataset, for example the path to a zarr dataset - or a netCDF file (anything that is supported by `xarray.open_dataset` will work) + ds : xr.Dataset + Source dataset derived_variables : Dict[str, DerivedVariable] - Dictionary with the variables to derive - with keys as the variable names and values with entries for - kwargs and function to use in the calculation + Dictionary with the variables to derive with keys as the variable + names and values with entries for kwargs and function to use in + the calculation chunking: Dict[str, int] Dictionary with keys as the dimensions to chunk along and values with the chunk size @@ -29,12 +28,6 @@ def derive_variables(fp, derived_variables, chunking): xr.Dataset Dataset with derived variables included """ - logger.info("Deriving variables") - - try: - ds = xr.open_zarr(fp) - except ValueError: - ds = xr.open_dataset(fp) ds_derived_vars = xr.Dataset() ds_derived_vars.attrs.update(ds.attrs) diff --git a/mllam_data_prep/ops/loading.py b/mllam_data_prep/ops/loading.py 
index 5275c57..a8c2d24 100644 --- a/mllam_data_prep/ops/loading.py +++ b/mllam_data_prep/ops/loading.py @@ -1,16 +1,39 @@ import xarray as xr -def load_and_subset_dataset(fp, variables, chunking): +def load_dataset(fp): """ - Load the dataset, subset the variables along the specified coordinates and - check coordinate units + Load the dataset Parameters ---------- fp : str Filepath to the source dataset, for example the path to a zarr dataset or a netCDF file (anything that is supported by `xarray.open_dataset` will work) + + Returns + ------- + ds: xr.Dataset + Source dataset + """ + + try: + ds = xr.open_zarr(fp) + except ValueError: + ds = xr.open_dataset(fp) + + return ds + + +def subset_dataset(ds, variables, chunking): + """ + Load the dataset, subset the variables along the specified coordinates and + check coordinate units + + Parameters + ---------- + ds : xr.Dataset + Source dataset variables : dict Dictionary with the variables to subset with keys as the variable names and values with entries for each @@ -20,11 +43,6 @@ def load_and_subset_dataset(fp, variables, chunking): with the chunk size """ - try: - ds = xr.open_zarr(fp) - except ValueError: - ds = xr.open_dataset(fp) - ds_subset = xr.Dataset() ds_subset.attrs.update(ds.attrs) if isinstance(variables, dict): From 717c6a526215e1961d93626d5298b755427ae2e9 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 08:46:05 +0000 Subject: [PATCH 42/96] Simplify if loops --- mllam_data_prep/derived_variables.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 8091e64..cce450e 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -264,19 +264,18 @@ def _check_attributes(field, field_attributes): " of the config file either. Make sure that you add it to the" f" 'attributes' section of the derived variable '{field.name}'." 
) + elif attribute in field_attributes.keys(): + logger.warning( + f"The attribute '{attribute}' of the derived field" + f" {field.name} is being overwritten from" + f" '{field.attrs[attribute]}' to" + f" '{field_attributes[attribute]}' according" + " to specification in the config file." + ) + field.attrs[attribute] = field_attributes[attribute] else: - if attribute in field_attributes.keys(): - logger.warning( - f"The attribute '{attribute}' of the derived field" - f" {field.name} is being overwritten from" - f" '{field.attrs[attribute]}' to" - f" '{field_attributes[attribute]}' according" - " to specification in the config file." - ) - field.attrs[attribute] = field_attributes[attribute] - else: - # Attributes are set and nothing has been defined in the config file - pass + # Attributes are set and nothing has been defined in the config file + pass return field From 2856c6b9acf932e8b949be0e9dcc1e7de8ef7c9a Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 09:24:16 +0000 Subject: [PATCH 43/96] Update '_get_derived_variable_function' --- mllam_data_prep/derived_variables.py | 42 +++++++++++++++------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index cce450e..4f64509 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -166,27 +166,20 @@ def _get_derived_variable_function(function_namespace): # Get the name of the calling module calling_module = globals()["__name__"] - if "." 
in function_namespace: - # If the function name is a full namespace, get module and function names - module_name, function_name = function_namespace.rsplit(".", 1) - - # Check if the module_name is pointing to here (the calling module), - # and if it does then use globals() to get the function otherwise - # import the correct module and get the correct function - if module_name == calling_module: - function = globals().get(function_name) - else: - # Check if the module is already imported - if module_name in sys.modules: - module = module_name - else: - module = importlib.import_module(module_name) - - # Get the function from the module - function = getattr(module, function_name) + # Get module and function names + function_namespace_list = function_namespace.rsplit(".") + if len(function_namespace_list) > 1: + function_name = function_namespace_list[-1] + module_name = ".".join(elem for elem in function_namespace_list[:-1]) else: - # If function name only get it from the calling module (here) - function = globals().get(function_namespace) + module_name = "" + function_name = function_namespace_list[0] + + # Check if the module_name is pointing to here (the calling module or empty "") + # If it does, then use globals() to get the function otherwise import the + # correct module and get the correct function + if module_name in [calling_module, ""]: + function = globals().get(function_name) if not function: raise TypeError( f"Function '{function_namespace}' was not found in '{calling_module}'." @@ -195,6 +188,15 @@ def _get_derived_variable_function(function_namespace): " want to use a function defined outside of of the current module" f" '{calling_module}'." 
) + else: + # Check if the module is already imported + if module_name in sys.modules: + module = module_name + else: + module = importlib.import_module(module_name) + + # Get the function from the module + function = getattr(module, function_name) return function From 98673ee227e2779a1ed2c9f5771dcce91524b7c7 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 09:46:24 +0000 Subject: [PATCH 44/96] Simplify checks of the derived fields --- mllam_data_prep/derived_variables.py | 42 ++++------------------------ 1 file changed, 5 insertions(+), 37 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 4f64509..586f412 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -73,16 +73,17 @@ def derive_variables(ds, derived_variables, chunking): # Calculate the derived variable derived_field = func(**kwargs) - # Check the derived field(s) - derived_field = _check_field(derived_field, derived_variable_attributes) - - # Add the derived field(s) to the dataset + # Check the derived field(s) and add it to the dataset if isinstance(derived_field, xr.DataArray): + derived_field = _check_attributes( + derived_field, derived_variable_attributes + ) ds_derived_vars[derived_field.name] = derived_field elif isinstance(derived_field, tuple) and all( isinstance(field, xr.DataArray) for field in derived_field ): for field in derived_field: + field = _check_attributes(field, derived_variable_attributes) ds_derived_vars[field.name] = field else: raise TypeError( @@ -201,39 +202,6 @@ def _get_derived_variable_function(function_namespace): return function -def _check_field(derived_field, derived_field_attributes): - """ - Check the derived field. - - Parameters - ---------- - derived_field: Union[xr.DataArray, Tuple[xr.DataArray]] - The derived variable - derived_field_attributes: Dict[str, str] - Dictionary with attributes for the derived variables. - Defined in the config file. 
- - Returns - ------- - derived_field: Union[xr.DataArray, Tuple[xr.DataArray]] - The derived field - """ - if isinstance(derived_field, xr.DataArray): - derived_field = _check_attributes(derived_field, derived_field_attributes) - elif isinstance(derived_field, tuple) and all( - isinstance(field, xr.DataArray) for field in derived_field - ): - for field in derived_field: - field = _check_attributes(field, derived_field_attributes) - else: - raise TypeError( - "Expected an instance of xr.DataArray or tuple(xr.DataArray)," - f" but got {type(derived_field)}." - ) - - return derived_field - - def _check_attributes(field, field_attributes): """ Check the attributes of the derived variable. From 8940e82c9383486af88e5c14d16f2457c5f2c50f Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 09:53:06 +0000 Subject: [PATCH 45/96] Issue warning saying that we assume coordinates are named 'lat' and 'lon' --- mllam_data_prep/derived_variables.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 586f412..1b275fd 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -37,6 +37,10 @@ def derive_variables(ds, derived_variables, chunking): derived_variable_attributes = derived_variable.attributes # Separate the lat,lon from the required variables as these will be derived separately + logger.warning( + "Assuming that the lat/lon coordinates are given as variables called" + " 'lat' and 'lon'." 
+ ) latlon_coords_to_include = {} for key in list(required_kwargs.keys()): if key in ["lat", "lon"]: From e12e328534bd6ae81b4300ac15a7a8a91a2cfa8a Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 12:11:56 +0000 Subject: [PATCH 46/96] Update README to make it clear that 'attributes' is associated with 'derived_variables' --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7d9f947..de089dc 100644 --- a/README.md +++ b/README.md @@ -371,10 +371,10 @@ The `inputs` section defines the source datasets to extract data from. Each sour - `rename`: simply rename the dimension to the new name - `stack`: stack the listed dimension to create the dimension in the output - `stack_variables_by_var_name`: stack the dimension into the new dimension, and also stack the variable name into the new variable name. This is useful when you have multiple variables with the same dimensions that you want to stack into a single variable. -- `derived_variables`: defines the variables to be derived from the variables available in the source dataset. This should be a dictionary where each key is the variable to be derived and the value defines a dictionary with additional information. -- `function`: the function to be used to derive a variable. This should be a string and may either be the full namespace of the function (e.g. `mllam_data_prep.derived_variables.calculate_toa_radiation`) or in case the function is included in the `mllam_data_prep.derived_variables` module it is enough with the function name only. -- `kwargs`: arguments for the function used to derive a variable. This is a dictionary where each key is the variable name to select from the source dataset and each value is the named argument to `function`. 
-- `attributes`: section where users can specify the attributes `units` and `long_name` as a dictionary (not included in the example config file), where the keys are the attribute names and the values are strings. If using a function defined in `mllam_data_prep.derived_variables` this section is optional as the attributes should already be defined. In this case, adding the attributes to the config file will overwrite the already-defined ones. If using an external function, where the attributes `units` and `long_name` are not set, this section is a requirement. +- `derived_variables`: defines the variables to be derived from the variables available in the source dataset. This should be a dictionary where each key is the variable to be derived and the value defines a dictionary with the following additional information. + - `function`: the function to be used to derive a variable. This should be a string and may either be the full namespace of the function (e.g. `mllam_data_prep.derived_variables.calculate_toa_radiation`) or in case the function is included in the `mllam_data_prep.derived_variables` module it is enough with the function name only. + - `kwargs`: arguments for the function used to derive a variable. This is a dictionary where each key is the variable name to select from the source dataset and each value is the named argument to `function`. + - `attributes`: section where users can specify attributes (e.g. `units` and `long_name`) as a dictionary (not included in the example config file), where the keys are the attribute names and the values are strings. If using a function defined in `mllam_data_prep.derived_variables` this section is optional as the attributes should already be defined. In this case, adding the attributes to the config file will overwrite the already-defined ones. If using an external function, where the attributes `units` and `long_name` are not set, this section is a requirement. 
### Config schema versioning From ecdea30e323e85f893a4e446ea8eddbb946c2a4b Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 12:17:45 +0000 Subject: [PATCH 47/96] Indicate that 'variables' and 'derived_variables' are mutually exclusive --- mllam_data_prep/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 0029313..1190088 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -138,7 +138,7 @@ class InputDataset: 1) the path to the dataset, 2) the expected dimensions of the dataset, 3) the variables to select from the dataset (and optionally subsection - along the coordinates for each variable) and/or the variables to derive + along the coordinates for each variable) or the variables to derive from the dataset, and finally 4) the method by which the dimensions and variables of the dataset are mapped to one of the output variables (this includes stacking of all From e3c0f223575b16cc9aac94b82c87600a7c885c7c Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 12:31:18 +0000 Subject: [PATCH 48/96] Update docstring of 'InputDataset' class --- mllam_data_prep/config.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 1190088..2bc42a9 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -153,11 +153,6 @@ class InputDataset: dims: List[str] List of the expected dimensions of the dataset. E.g. `["time", "x", "y"]`. These will be checked to ensure consistency of the dataset being read. - variables: Union[List[str], Dict[str, Dict[str, ValueSelection]]] - List of the variables to select from the dataset. E.g. `["temperature", "precipitation"]` - or a dictionary where the keys are the variable names and the values are dictionaries - defining the selection for each variable. E.g. 
`{"temperature": levels: {"values": [1000, 950, 900]}}` - would select the "temperature" variable and only the levels 1000, 950, and 900. dim_mapping: Dict[str, DimMapping] Mapping of the variables and dimensions in the input dataset to the dimensions of the output variable (`target_output_variable`). The key is the name of the output dimension to map to @@ -170,12 +165,19 @@ class InputDataset: (e.g. two datasets that coincide in space and time will only differ in the feature dimension, so the two will be combined by concatenating along the feature dimension). If a single shared coordinate cannot be found then an exception will be raised. + variables: Union[List[str], Dict[str, Dict[str, ValueSelection]]] + List of the variables to select from the dataset. E.g. `["temperature", "precipitation"]` + or a dictionary where the keys are the variable names and the values are dictionaries + defining the selection for each variable. E.g. `{"temperature": levels: {"values": [1000, 950, 900]}}` + would select the "temperature" variable and only the levels 1000, 950, and 900. derived_variables: Dict[str, DerivedVariable] Dictionary of variables to derive from the dataset, where the keys are the variable names and the values are dictionaries defining the necessary function and kwargs. E.g. `{"toa_radiation": {"kwargs": {"time": "time", "lat": "lat", "lon": "lon"}, "function": "calculate_toa_radiation"}}` would derive the "toa_radiation" variable using the `calculate_toa_radiation` function, which takes `time`, `lat` and `lon` as arguments. + attributes: Dict[str, Any] + Optional dictionary with dataset attributes. 
""" path: str From e907a6ddedf523a08f31423453ee1e341f6a13cf Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 14:10:30 +0000 Subject: [PATCH 49/96] Correct types in '_check_attributes' docstring --- mllam_data_prep/derived_variables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 1b275fd..f693f64 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -212,7 +212,7 @@ def _check_attributes(field, field_attributes): Parameters ---------- - field: Union[xr.DataArray, Tuple[xr.DataArray]] + field: xr.DataArray The derived field field_attributes: Dict[str, str] Dictionary with attributes for the derived variables. @@ -220,7 +220,7 @@ def _check_attributes(field, field_attributes): Returns ------- - field: Union[xr.DataArray, Tuple[xr.DataArray]] + field: xr.DataArray The derived field """ for attribute in ["units", "long_name"]: From bb9be1375bcce3cbdc0ef658efee8660efbe788c Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 14:15:24 +0000 Subject: [PATCH 50/96] Use 'rpartition' to get 'module_name' and 'function_name' --- mllam_data_prep/derived_variables.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index f693f64..4861324 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -172,13 +172,7 @@ def _get_derived_variable_function(function_namespace): calling_module = globals()["__name__"] # Get module and function names - function_namespace_list = function_namespace.rsplit(".") - if len(function_namespace_list) > 1: - function_name = function_namespace_list[-1] - module_name = ".".join(elem for elem in function_namespace_list[:-1]) - else: - module_name = "" - function_name = function_namespace_list[0] + module_name, _, function_name = 
function_namespace.rpartition(".") # Check if the module_name is pointing to here (the calling module or empty "") # If it does, then use globals() to get the function otherwise import the From 49de0b3a81f261c8d87dea27c314c74290d0bb86 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 14:23:57 +0000 Subject: [PATCH 51/96] Add some initial tests for 'derived_variables' --- tests/test_derived_variables.py | 117 ++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 tests/test_derived_variables.py diff --git a/tests/test_derived_variables.py b/tests/test_derived_variables.py new file mode 100644 index 0000000..70a9810 --- /dev/null +++ b/tests/test_derived_variables.py @@ -0,0 +1,117 @@ +import datetime +import random +from unittest.mock import patch + +import isodate +import numpy as np +import pandas as pd +import pytest +import xarray as xr + +import mllam_data_prep as mdp + +NCOORD = 10 +NTIME = 10 +LAT_MIN = -90 +LAT_MAX = 90 +LON_MIN = 0 +LON_MAX = 360 +LATITUDE = [ + 55.711, + xr.DataArray( + np.random.uniform(LAT_MIN, LAT_MAX, size=(NCOORD, NCOORD)), + dims=["x", "y"], + coords={"x": np.arange(NCOORD), "y": np.arange(NCOORD)}, + name="lat", + ), +] +LONGITUDE = [ + 12.564, + xr.DataArray( + np.random.uniform(LON_MIN, LON_MAX, size=(NCOORD, NCOORD)), + dims=["x", "y"], + coords={"x": np.arange(NCOORD), "y": np.arange(NCOORD)}, + name="lon", + ), +] +TIME = [ + np.datetime64("2004-06-11T00:00:00"), # invalid type + isodate.parse_datetime("1999-03-21T00:00"), + xr.DataArray( + pd.date_range( + start=isodate.parse_datetime("1999-03-21T00:00"), + periods=NTIME, + freq=isodate.parse_duration("PT1H"), + ), + dims=["time"], + name="time", + ), +] + + +def mock_cyclic_encoding(data, data_max): + """Mock the `cyclic_encoding` function from mllam_data_prep.derived_variables.""" + if isinstance(data, xr.DataArray): + data_cos = xr.DataArray( + random.uniform(-1, 1), + coords=data.coords, + dims=data.dims, + ) + data_sin = 
xr.DataArray( + random.uniform(-1, 1), + coords=data.coords, + dims=data.dims, + ) + return data_cos, data_sin + elif isinstance(data, (float, int)): + return random.uniform(-1, 1), random.uniform(-1, 1) + + +@pytest.mark.parametrize("lat", LATITUDE) +@pytest.mark.parametrize("lon", LONGITUDE) +@pytest.mark.parametrize("time", TIME) +def test_toa_radiation(lat, lon, time): + """ + Test the `calculate_toa_radiation` function from mllam_data_prep.derived_variables + """ + with patch( + "mllam_data_prep.derived_variables.cyclic_encoding", + side_effect=mock_cyclic_encoding, + ): + if isinstance(time, (xr.DataArray, datetime.datetime)): + mdp.derived_variables.calculate_toa_radiation(lat, lon, time) + else: + with pytest.raises(TypeError): + mdp.derived_variables.calculate_toa_radiation(lat, lon, time) + + +@pytest.mark.parametrize("time", TIME) +def test_hour_of_day(time): + """ + Test the `calculate_hour_of_day` function from mllam_data_prep.derived_variables + """ + with patch( + "mllam_data_prep.derived_variables.cyclic_encoding", + side_effect=mock_cyclic_encoding, + ): + if isinstance(time, (xr.DataArray, datetime.datetime)): + mdp.derived_variables.calculate_hour_of_day(time) + else: + with pytest.raises(TypeError): + mdp.derived_variables.calculate_hour_of_day(time) + + +@pytest.mark.parametrize("time", TIME) +def test_day_of_year(time): + """ + Test the `calculate_day_of_year` function from mllam_data_prep.derived_variables + """ + with patch( + "mllam_data_prep.derived_variables.cyclic_encoding", + side_effect=mock_cyclic_encoding, + ): + if isinstance(time, (xr.DataArray, datetime.datetime)): + mdp.derived_variables.calculate_day_of_year(time) + else: + with pytest.raises(TypeError): + mdp.derived_variables.calculate_day_of_year(time) From b268f01b7e099eafaaf817caf86010b5c5ce70c0 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 17 Dec 2024 13:22:47 +0000 Subject: [PATCH 52/96] Update docstrings and rename 'DerivedVariable.attributes' to 
'DerivedVariable.attrs' --- mllam_data_prep/config.py | 17 +++++++++-------- mllam_data_prep/derived_variables.py | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 2bc42a9..bfd20e9 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -57,17 +57,21 @@ class DerivedVariable: """ Defines a derived variables, where the kwargs (variables required for the calculation) and the function (for calculating the variable) - are specified. + are specified. Optionally, in case a function does not return an + `xr.DataArray` with the required attributes (`units` and `long_name`) set, + these should be specified in `attrs`, e.g. + {"attrs": "units": "W*m**-2, "long_name": "top-of-the-atmosphere radiation"}. + Additional attributes can also be set if desired. Attributes: kwargs: Variables required for calculating the derived variable. function: Function used to calculate the derived variable. - attributes: Attributes (e.g. `units` and `long_name`) for the derived variable. + attrs: Attributes (e.g. `units` and `long_name`) to set for the derived variable. """ kwargs: Dict[str, str] function: str - attributes: Optional[Dict[str, str]] = field(default_factory=dict) + attrs: Optional[Dict[str, str]] = field(default_factory=dict) @dataclass @@ -171,11 +175,8 @@ class InputDataset: defining the selection for each variable. E.g. `{"temperature": levels: {"values": [1000, 950, 900]}}` would select the "temperature" variable and only the levels 1000, 950, and 900. derived_variables: Dict[str, DerivedVariable] - Dictionary of variables to derive from the dataset, where the keys are the variable names and - the values are dictionaries defining the necessary function and kwargs. E.g. 
- `{"toa_radiation": {"kwargs": {"time": "time", "lat": "lat", "lon": "lon"}, "function": "calculate_toa_radiation"}}` - would derive the "toa_radiation" variable using the `calculate_toa_radiation` function, which - takes `time`, `lat` and `lon` as arguments. + Dictionary of variables to derive from the dataset, where the keys are the names variables will be given and + the values are `DerivedVariable` definitions that specify how to derive a variable. attributes: Dict[str, Any] Optional dictionary with dataset attributes. """ diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 4861324..80d6cae 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -34,7 +34,7 @@ def derive_variables(ds, derived_variables, chunking): for _, derived_variable in derived_variables.items(): required_kwargs = derived_variable.kwargs function_name = derived_variable.function - derived_variable_attributes = derived_variable.attributes + derived_variable_attributes = derived_variable.attrs # Separate the lat,lon from the required variables as these will be derived separately logger.warning( From dbd5bfd5fbd5f3473e58ac9cc477478ed78f8c7b Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 17 Dec 2024 13:29:19 +0000 Subject: [PATCH 53/96] Do not add 'attributes' to docstring --- mllam_data_prep/config.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index bfd20e9..f114f60 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -177,8 +177,6 @@ class InputDataset: derived_variables: Dict[str, DerivedVariable] Dictionary of variables to derive from the dataset, where the keys are the names variables will be given and the values are `DerivedVariable` definitions that specify how to derive a variable. - attributes: Dict[str, Any] - Optional dictionary with dataset attributes. 
""" path: str From 474a83db1ac56828f66701649bbc8c70a1d4b1ee Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 17 Dec 2024 13:31:46 +0000 Subject: [PATCH 54/96] Remove unnecessary exception handling --- mllam_data_prep/create_dataset.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 181292a..bd53cd2 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -146,28 +146,17 @@ def create_dataset(config: Config): if variables: logger.info(f"Subsetting dataset {dataset_name}") - try: - ds = subset_dataset( - ds=ds_source, variables=variables, chunking=chunking_config - ) - except Exception as ex: - raise Exception( - f"Error subsetting dataset {dataset_name} from {path}" - ) from ex + ds = subset_dataset( + ds=ds_source, variables=variables, chunking=chunking_config + ) if derived_variables: logger.info(f"Deriving variables from {dataset_name}") - try: - ds = derive_variables( - ds=ds_source, - derived_variables=derived_variables, - chunking=chunking_config, - ) - except Exception as ex: - raise Exception( - f"Error deriving variables '{', '.join(list(derived_variables.keys()))}'" - f" from dataset {dataset_name} from {path}" - ) from ex + ds = derive_variables( + ds=ds_source, + derived_variables=derived_variables, + chunking=chunking_config, + ) _check_dataset_attributes( ds=ds, From 1da66e2d3b9a6e572be31ddc16a365963940d636 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 17 Dec 2024 13:44:03 +0000 Subject: [PATCH 55/96] Move 'subset_dataset' to 'ops.subsetting' --- mllam_data_prep/create_dataset.py | 9 +++-- mllam_data_prep/ops/loading.py | 61 +------------------------------ mllam_data_prep/ops/subsetting.py | 60 ++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 64 deletions(-) create mode 100644 mllam_data_prep/ops/subsetting.py diff --git a/mllam_data_prep/create_dataset.py 
b/mllam_data_prep/create_dataset.py index bd53cd2..a034eaa 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -11,10 +11,11 @@ from . import __version__ from .config import Config, InvalidConfigException from .derived_variables import derive_variables -from .ops.loading import load_dataset, subset_dataset +from .ops.loading import load_input_dataset from .ops.mapping import map_dims_and_variables from .ops.selection import select_by_kwargs from .ops.statistics import calc_stats +from .ops.subsetting import subset_dataset # the `extra` field in the config that was added between v0.2.0 and v0.5.0 is # optional, so we can support both v0.2.0 and v0.5.0 @@ -140,20 +141,20 @@ def create_dataset(config: Config): logger.info(f"Loading dataset {dataset_name} from {path}") try: - ds_source = load_dataset(fp=path) + ds_input = load_input_dataset(fp=path) except Exception as ex: raise Exception(f"Error loading dataset {dataset_name} from {path}") from ex if variables: logger.info(f"Subsetting dataset {dataset_name}") ds = subset_dataset( - ds=ds_source, variables=variables, chunking=chunking_config + ds=ds_input, variables=variables, chunking=chunking_config ) if derived_variables: logger.info(f"Deriving variables from {dataset_name}") ds = derive_variables( - ds=ds_source, + ds=ds_input, derived_variables=derived_variables, chunking=chunking_config, ) diff --git a/mllam_data_prep/ops/loading.py b/mllam_data_prep/ops/loading.py index a8c2d24..f6bfc34 100644 --- a/mllam_data_prep/ops/loading.py +++ b/mllam_data_prep/ops/loading.py @@ -1,7 +1,7 @@ import xarray as xr -def load_dataset(fp): +def load_input_dataset(fp): """ Load the dataset @@ -23,62 +23,3 @@ def load_dataset(fp): ds = xr.open_dataset(fp) return ds - - -def subset_dataset(ds, variables, chunking): - """ - Load the dataset, subset the variables along the specified coordinates and - check coordinate units - - Parameters - ---------- - ds : xr.Dataset - Source dataset - 
variables : dict - Dictionary with the variables to subset - with keys as the variable names and values with entries for each - coordinate and coordinate values to extract - chunking: dict - Dictionary with keys as the dimensions to chunk along and values - with the chunk size - """ - - ds_subset = xr.Dataset() - ds_subset.attrs.update(ds.attrs) - if isinstance(variables, dict): - for var, coords_to_sample in variables.items(): - da = ds[var] - for coord, sampling in coords_to_sample.items(): - coord_values = sampling.values - try: - da = da.sel(**{coord: coord_values}) - except KeyError as ex: - raise KeyError( - f"Could not find the all coordinate values `{coord_values}` in " - f"coordinate `{coord}` in the dataset" - ) from ex - expected_units = sampling.units - coord_units = da[coord].attrs.get("units", None) - if coord_units is not None and coord_units != expected_units: - raise ValueError( - f"Expected units {expected_units} for coordinate {coord}" - f" in variable {var} but got {coord_units}" - ) - ds_subset[var] = da - elif isinstance(variables, list): - try: - ds_subset = ds[variables] - except KeyError as ex: - raise KeyError( - f"Could not find the all variables `{variables}` in the dataset. 
" - f"The available variables are {list(ds.data_vars)}" - ) from ex - else: - raise ValueError("The `variables` argument should be a list or a dictionary") - - chunks = { - dim: chunking.get(dim, int(ds_subset[dim].count())) for dim in ds_subset.dims - } - ds_subset = ds_subset.chunk(chunks) - - return ds_subset diff --git a/mllam_data_prep/ops/subsetting.py b/mllam_data_prep/ops/subsetting.py new file mode 100644 index 0000000..8cfa8ca --- /dev/null +++ b/mllam_data_prep/ops/subsetting.py @@ -0,0 +1,60 @@ +import xarray as xr + + +def subset_dataset(ds, variables, chunking): + """ + Select specific variables from the provided the dataset, subset the + variables along the specified coordinates and check coordinate units + + Parameters + ---------- + ds : xr.Dataset + Source dataset + variables : dict + Dictionary with the variables to subset + with keys as the variable names and values with entries for each + coordinate and coordinate values to extract + chunking: dict + Dictionary with keys as the dimensions to chunk along and values + with the chunk size + """ + + ds_subset = xr.Dataset() + ds_subset.attrs.update(ds.attrs) + if isinstance(variables, dict): + for var, coords_to_sample in variables.items(): + da = ds[var] + for coord, sampling in coords_to_sample.items(): + coord_values = sampling.values + try: + da = da.sel(**{coord: coord_values}) + except KeyError as ex: + raise KeyError( + f"Could not find the all coordinate values `{coord_values}` in " + f"coordinate `{coord}` in the dataset" + ) from ex + expected_units = sampling.units + coord_units = da[coord].attrs.get("units", None) + if coord_units is not None and coord_units != expected_units: + raise ValueError( + f"Expected units {expected_units} for coordinate {coord}" + f" in variable {var} but got {coord_units}" + ) + ds_subset[var] = da + elif isinstance(variables, list): + try: + ds_subset = ds[variables] + except KeyError as ex: + raise KeyError( + f"Could not find the all variables 
`{variables}` in the dataset. " + f"The available variables are {list(ds.data_vars)}" + ) from ex + else: + raise ValueError("The `variables` argument should be a list or a dictionary") + + chunks = { + dim: chunking.get(dim, int(ds_subset[dim].count())) for dim in ds_subset.dims + } + ds_subset = ds_subset.chunk(chunks) + + return ds_subset From dc7dc5e04ade8d63b6923cbc4d5b4b9303d760be Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 17 Dec 2024 13:48:26 +0000 Subject: [PATCH 56/96] Move 'derived_variables' to 'ops' --- example.danra.yaml | 4 ++-- mllam_data_prep/create_dataset.py | 2 +- mllam_data_prep/{ => ops}/derived_variables.py | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename mllam_data_prep/{ => ops}/derived_variables.py (100%) diff --git a/example.danra.yaml b/example.danra.yaml index f1fa443..d6a9468 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -82,11 +82,11 @@ inputs: time: time lat: lat lon: lon - function: mllam_data_prep.derived_variables.calculate_toa_radiation + function: mllam_data_prep.ops.derived_variables.calculate_toa_radiation hour_of_day: kwargs: time: time - function: mllam_data_prep.derived_variables.calculate_hour_of_day + function: mllam_data_prep.ops.derived_variables.calculate_hour_of_day dim_mapping: time: method: rename diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index a034eaa..19bf4df 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -10,7 +10,7 @@ from . 
import __version__ from .config import Config, InvalidConfigException -from .derived_variables import derive_variables +from .ops.derived_variables import derive_variables from .ops.loading import load_input_dataset from .ops.mapping import map_dims_and_variables from .ops.selection import select_by_kwargs diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/ops/derived_variables.py similarity index 100% rename from mllam_data_prep/derived_variables.py rename to mllam_data_prep/ops/derived_variables.py From c9e96af9388a2fa9d75be2c6367b24d6ba399f6c Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 17 Dec 2024 13:59:02 +0000 Subject: [PATCH 57/96] Move chunk size check to 'chunking' module --- mllam_data_prep/ops/chunking.py | 44 ++++++++++++++++++++++++ mllam_data_prep/ops/derived_variables.py | 26 +++----------- 2 files changed, 48 insertions(+), 22 deletions(-) create mode 100644 mllam_data_prep/ops/chunking.py diff --git a/mllam_data_prep/ops/chunking.py b/mllam_data_prep/ops/chunking.py new file mode 100644 index 0000000..12731e1 --- /dev/null +++ b/mllam_data_prep/ops/chunking.py @@ -0,0 +1,44 @@ +import numpy as np +from loguru import logger + +# Max chunk size warning +CHUNK_MAX_SIZE_WARNING = 1 * 1024**3 # 1GB + + +def check_chunk_size(ds, chunks): + """ + Check the chunk size and warn if it exceed CHUNK_MAX_SIZE_WARNING. 
+ + Parameters + ---------- + ds: xr.Dataset + Dataset to be chunked + chunks: Dict[str, int] + Dictionary with keys as dimensions to be chunked and + chunk sizes as the values + + Returns + ------- + ds: xr.Dataset + Dataset with chunking applied + """ + + # Check the chunk size + for var_name, var_data in ds.data_vars.items(): + total_size = 1 + + for dim, chunk_size in chunks.items(): + dim_size = ds.sizes.get(dim, None) + if dim_size is None: + raise KeyError(f"Dimension '{dim}' not found in the dataset.") + total_size *= chunk_size + + dtype = var_data.dtype + bytes_per_element = np.dtype(dtype).itemsize + + memory_usage = total_size * bytes_per_element + + if memory_usage > CHUNK_MAX_SIZE_WARNING: + logger.warning( + f"The chunk size for '{var_name}' exceeds '{CHUNK_MAX_SIZE_WARNING}' GB." + ) diff --git a/mllam_data_prep/ops/derived_variables.py b/mllam_data_prep/ops/derived_variables.py index 80d6cae..f31865f 100644 --- a/mllam_data_prep/ops/derived_variables.py +++ b/mllam_data_prep/ops/derived_variables.py @@ -6,6 +6,8 @@ import xarray as xr from loguru import logger +from .chunking import check_chunk_size + def derive_variables(ds, derived_variables, chunking): """ @@ -105,7 +107,7 @@ def derive_variables(ds, derived_variables, chunking): def _chunk_dataset(ds, chunks): """ - Chunk dataset and check the chunk size. + Check the chunk size and chunk dataset. 
Parameters ---------- @@ -120,28 +122,8 @@ def _chunk_dataset(ds, chunks): ds: xr.Dataset Dataset with chunking applied """ - # Define the memory limit check - memory_limit_check = 1 * 1024**3 # 1 GB - # Check the chunk size - for var_name, var_data in ds.data_vars.items(): - total_size = 1 - - for dim, chunk_size in chunks.items(): - dim_size = ds.sizes.get(dim, None) - if dim_size is None: - raise KeyError(f"Dimension '{dim}' not found in the dataset.") - total_size *= chunk_size - - dtype = var_data.dtype - bytes_per_element = np.dtype(dtype).itemsize - - memory_usage = total_size * bytes_per_element - - if memory_usage > memory_limit_check: - logger.warning( - f"The chunk size for '{var_name}' exceeds '{memory_limit_check}' GB." - ) + check_chunk_size(ds, chunks) # Try chunking try: From 47b8411b0b23c026d0f64260384d81c2d5e2b700 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 17 Dec 2024 14:46:28 +0000 Subject: [PATCH 58/96] Add module docstring --- mllam_data_prep/ops/derived_variables.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mllam_data_prep/ops/derived_variables.py b/mllam_data_prep/ops/derived_variables.py index f31865f..9ad495e 100644 --- a/mllam_data_prep/ops/derived_variables.py +++ b/mllam_data_prep/ops/derived_variables.py @@ -1,3 +1,11 @@ +""" +Handle deriving new variables (xr.DataArrays) from an individual input dataset +that has been loaded. This makes it possible to for example add fields that can +be derived from analytical expressions and are functions of coordinate values +(e.g. top-of-atmosphere incoming radiation is a function of time and lat/lon location), +but also of other physical fields (wind-speed is a function of both meridional +and zonal wind components). 
+""" import datetime import importlib import sys From 5ae772f736e773d5d1beef8db684e19e09366ea7 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 17 Dec 2024 14:54:07 +0000 Subject: [PATCH 59/96] Update tests --- tests/test_derived_variables.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/test_derived_variables.py b/tests/test_derived_variables.py index 70a9810..786e064 100644 --- a/tests/test_derived_variables.py +++ b/tests/test_derived_variables.py @@ -50,7 +50,7 @@ def mock_cyclic_encoding(data, data_max): - """Mock the `cyclic_encoding` function from mllam_data_prep.derived_variables.""" + """Mock the `cyclic_encoding` function from mllam_data_prep.ops.derived_variables.""" if isinstance(data, xr.DataArray): data_cos = xr.DataArray( random.uniform(-1, 1), @@ -75,14 +75,14 @@ def test_toa_radiation(lat, lon, time): Test the `calculate_toa_radiation` function from mllam_data_prep.derived_variables """ with patch( - "mllam_data_prep.derived_variables.cyclic_encoding", + "mllam_data_prep.ops.derived_variables.cyclic_encoding", side_effect=mock_cyclic_encoding, ): if isinstance(time, (xr.DataArray, datetime.datetime)): - mdp.derived_variables.calculate_toa_radiation(lat, lon, time) + mdp.ops.derived_variables.calculate_toa_radiation(lat, lon, time) else: with pytest.raises(TypeError): - mdp.derived_variables.calculate_toa_radiation(lat, lon, time) + mdp.ops.derived_variables.calculate_toa_radiation(lat, lon, time) @pytest.mark.parametrize("time", TIME) @@ -91,14 +91,14 @@ def test_hour_of_day(time): Test the `calculate_hour_of_day` function from mllam_data_prep.derived_variables """ with patch( - "mllam_data_prep.derived_variables.cyclic_encoding", + "mllam_data_prep.ops.derived_variables.cyclic_encoding", side_effect=mock_cyclic_encoding, ): if isinstance(time, (xr.DataArray, datetime.datetime)): - mdp.derived_variables.calculate_hour_of_day(time) + mdp.ops.derived_variables.calculate_hour_of_day(time) else: with 
pytest.raises(TypeError): - mdp.derived_variables.calculate_hour_of_day(time) + mdp.ops.derived_variables.calculate_hour_of_day(time) @pytest.mark.parametrize("time", TIME) @@ -107,11 +107,11 @@ def test_day_of_year(time): Test the `calculate_day_of_year` function from mllam_data_prep.derived_variables """ with patch( - "mllam_data_prep.derived_variables.cyclic_encoding", + "mllam_data_prep.ops.derived_variables.cyclic_encoding", side_effect=mock_cyclic_encoding, ): if isinstance(time, (xr.DataArray, datetime.datetime)): - mdp.derived_variables.calculate_day_of_year(time) + mdp.ops.derived_variables.calculate_day_of_year(time) else: with pytest.raises(TypeError): - mdp.derived_variables.calculate_day_of_year(time) + mdp.ops.derived_variables.calculate_day_of_year(time) From 2c0bdf879df0fbfe428e1eb1318267fe5a4d4713 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 18 Dec 2024 09:00:08 +0000 Subject: [PATCH 60/96] Add global REQUIRED_FIELD_ATTRIBUTES var and updated check for required attributes --- mllam_data_prep/ops/derived_variables.py | 53 ++++++++++++++---------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/mllam_data_prep/ops/derived_variables.py b/mllam_data_prep/ops/derived_variables.py index 9ad495e..41db614 100644 --- a/mllam_data_prep/ops/derived_variables.py +++ b/mllam_data_prep/ops/derived_variables.py @@ -16,6 +16,8 @@ from .chunking import check_chunk_size +REQUIRED_FIELD_ATTRIBUTES = ["units", "long_name"] + def derive_variables(ds, derived_variables, chunking): """ @@ -44,7 +46,7 @@ def derive_variables(ds, derived_variables, chunking): for _, derived_variable in derived_variables.items(): required_kwargs = derived_variable.kwargs function_name = derived_variable.function - derived_variable_attributes = derived_variable.attrs + expected_field_attributes = derived_variable.attrs # Separate the lat,lon from the required variables as these will be derived separately logger.warning( @@ -87,17 +89,18 @@ def derive_variables(ds, 
derived_variables, chunking): # Calculate the derived variable derived_field = func(**kwargs) - # Check the derived field(s) and add it to the dataset + # Check that the derived field has the necessary attributes (REQUIRED_FIELD_ATTRIBUTES) + # set and add it to the dataset if isinstance(derived_field, xr.DataArray): - derived_field = _check_attributes( - derived_field, derived_variable_attributes + derived_field = _check_for_required_attributes( + derived_field, expected_field_attributes ) ds_derived_vars[derived_field.name] = derived_field elif isinstance(derived_field, tuple) and all( isinstance(field, xr.DataArray) for field in derived_field ): for field in derived_field: - field = _check_attributes(field, derived_variable_attributes) + field = _check_for_required_attributes(field, expected_field_attributes) ds_derived_vars[field.name] = field else: raise TypeError( @@ -190,7 +193,7 @@ def _get_derived_variable_function(function_namespace): return function -def _check_attributes(field, field_attributes): +def _check_for_required_attributes(field, expected_attributes): """ Check the attributes of the derived variable. @@ -198,8 +201,8 @@ def _check_attributes(field, field_attributes): ---------- field: xr.DataArray The derived field - field_attributes: Dict[str, str] - Dictionary with attributes for the derived variables. + expected_attributes: Dict[str, str] + Dictionary with expected attributes for the derived variables. Defined in the config file. 
Returns @@ -207,32 +210,36 @@ field: xr.DataArray The derived field """ - for attribute in ["units", "long_name"]: + for attribute in REQUIRED_FIELD_ATTRIBUTES: if attribute not in field.attrs or field.attrs[attribute] is None: - if attribute in field_attributes.keys(): - field.attrs[attribute] = field_attributes[attribute] + if attribute in expected_attributes.keys(): + field.attrs[attribute] = expected_attributes[attribute] else: # The expected attributes are empty and the attributes have not been # set during the calculation of the derived variable - raise ValueError( - f"The attribute '{attribute}' has not been set for the derived" - f" variable '{field.name}' (most likely because you are using a" - " function external to `mlllam-data-prep` to derive the field)." - " This attribute has not been defined in the 'attributes' section" - " of the config file either. Make sure that you add it to the" - f" 'attributes' section of the derived variable '{field.name}'." + raise KeyError( + f'The attribute "{attribute}" has not been set for the derived' + f' variable "{field.name}". This is most likely because you are' + " using a function external to `mllam-data-prep` to derive the field," + f" in which the required attributes ({', '.join(REQUIRED_FIELD_ATTRIBUTES)})" + " are not set. If they are not set in the function call when deriving the field," + ' they can be set in the config file by adding an "attrs" section under the' + f' "{field.name}" derived variable section. For example, if the required attributes' + f" ({', '.join(REQUIRED_FIELD_ATTRIBUTES)}) are not set for a derived variable named" + f' "toa_radiation" they can be set by adding the following to the config file:' + ' {"attrs": {"units": "W*m**-2", "long_name": "top-of-atmosphere incoming radiation"}}.'
) - elif attribute in field_attributes.keys(): + elif attribute in expected_attributes.keys(): logger.warning( f"The attribute '{attribute}' of the derived field" f" {field.name} is being overwritten from" f" '{field.attrs[attribute]}' to" - f" '{field_attributes[attribute]}' according" - " to specification in the config file." + f" '{expected_attributes[attribute]}' according" + " to the specification in the config file." ) - field.attrs[attribute] = field_attributes[attribute] + field.attrs[attribute] = expected_attributes[attribute] else: - # Attributes are set and nothing has been defined in the config file + # Attributes are set in the function and nothing has been defined in the config file pass return field From f1ce6d196e9662663593b5933afcf188cee0e911 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 18 Dec 2024 09:01:57 +0000 Subject: [PATCH 61/96] Update long name for toa_radiation --- mllam_data_prep/ops/derived_variables.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mllam_data_prep/ops/derived_variables.py b/mllam_data_prep/ops/derived_variables.py index 41db614..8dafcd0 100644 --- a/mllam_data_prep/ops/derived_variables.py +++ b/mllam_data_prep/ops/derived_variables.py @@ -277,7 +277,7 @@ def _return_dropped_coordinates( def calculate_toa_radiation(lat, lon, time): """ - Function for calculating top-of-the-atmosphere radiation + Function for calculating top-of-atmosphere incoming radiation Parameters ---------- @@ -291,9 +291,9 @@ def calculate_toa_radiation(lat, lon, time): Returns ------- toa_radiation : Union[xr.DataArray, float] - TOA radiation data + Top-of-atmosphere incoming radiation """ - logger.info("Calculating top-of-atmosphere radiation") + logger.info("Calculating top-of-atmosphere incoming radiation") # Solar constant solar_constant = 1366 # W*m**-2 @@ -331,7 +331,7 @@ def calculate_toa_radiation(lat, lon, time): if isinstance(toa_radiation, xr.DataArray): # Add attributes toa_radiation.name = 
"toa_radiation" - toa_radiation.attrs["long_name"] = "top-of-the-atmosphere radiation" + toa_radiation.attrs["long_name"] = "top-of-atmosphere incoming radiation" toa_radiation.attrs["units"] = "W*m**-2" return toa_radiation From 58d8af6cbcce45b0bce5416c72249f04c5a6b405 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 18 Dec 2024 09:03:11 +0000 Subject: [PATCH 62/96] Update README --- README.md | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index de089dc..fcb903d 100644 --- a/README.md +++ b/README.md @@ -371,10 +371,39 @@ The `inputs` section defines the source datasets to extract data from. Each sour - `rename`: simply rename the dimension to the new name - `stack`: stack the listed dimension to create the dimension in the output - `stack_variables_by_var_name`: stack the dimension into the new dimension, and also stack the variable name into the new variable name. This is useful when you have multiple variables with the same dimensions that you want to stack into a single variable. -- `derived_variables`: defines the variables to be derived from the variables available in the source dataset. This should be a dictionary where each key is the variable to be derived and the value defines a dictionary with the following additional information. - - `function`: the function to be used to derive a variable. This should be a string and may either be the full namespace of the function (e.g. `mllam_data_prep.derived_variables.calculate_toa_radiation`) or in case the function is included in the `mllam_data_prep.derived_variables` module it is enough with the function name only. - - `kwargs`: arguments for the function used to derive a variable. This is a dictionary where each key is the variable name to select from the source dataset and each value is the named argument to `function`. - - `attributes`: section where users can specify attributes (e.g. 
`units` and `long_name`) as a dictionary (not included in the example config file), where the keys are the attribute names and the values are strings. If using a function defined in `mllam_data_prep.derived_variables` this section is optional as the attributes should already be defined. In this case, adding the attributes to the config file will overwrite the already-defined ones. If using an external function, where the attributes `units` and `long_name` are not set, this section is a requirement. +- `derived_variables`: defines the variables to be derived from the variables available in the source dataset. This should be a dictionary where each key is the variable to be derived and the value defines a dictionary with the following additional information. See the 'Derived Variables' section for more details. + - `function`: the function to be used to derive a variable. This should be a string and may either be the full namespace of the function (e.g. `mllam_data_prep.ops.derived_variables.calculate_toa_radiation`) or in case the function is included in the `mllam_data_prep.ops.derived_variables` module it is enough with the function name only. + - `kwargs`: arguments for the function used to derive a variable. This is a dictionary where each key is the name of the variables to select from the source dataset and each value is the named argument to `function`. + +#### Derived Variables +Variables that are not part of the source dataset but can be derived from variables in the source dataset can also be included. They should be defined in their own section, called `derived_variables` as illustrated in the example config above and in the `example.danra.yaml` config file. + +To derive the variables, the function to be used to derive the variable (`function`) and the arguments to this function (`kwargs`) need to be specified, as explained above. In addition, an optional section called `attrs` can be added. 
In this section, the user can add attributes to the derived variable, as illustrated below. +```yaml + derived_variables: + toa_radiation: + kwargs: + time: time + lat: lat + lon: lon + function: mllam_data_prep.derived_variables.calculate_toa_radiation + attrs: + units: W*m**-2 + long_name: top-of-atmosphere incoming radiation +``` + +Note that the attributes `units` and `long_name` are required. This means that if the function used to derive a variable does not set these attributes they are **required** to be set in the config file. If using a function defined in `mllam_data_prep.ops.derived_variables` the `attrs` section is optional as the attributes should already be defined. In this case, adding the `units` and `long_name` attributes to the `attrs` section of the derived variable in config file will overwrite the already-defined attributes from the function. + +Currently, the following derived variables are included as part of `mllam-data-prep`: +- `toa_radiation`: + - Top-of-atmosphere incoming radiation + - function: `mllam_data_prep.ops.derived_variables.calculate_toa_radiation` +- `hour_of_day`: + - Hour of day (cyclically encoded) + - function: `mllam_data_prep.ops.derived_variables.calculate_hour_of_day` +- `day_of_year`: + - Day of year (cyclically encoded) + - function: `mllam_data_prep.ops.derived_variables.calculate_day_of_year` ### Config schema versioning From f87b95438f7452e0757dc5e65699e7190962d2b8 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 19 Dec 2024 07:23:47 +0000 Subject: [PATCH 63/96] Return dropped coordinates to the data-arrays instead --- mllam_data_prep/ops/derived_variables.py | 27 ++++++++++++------------ 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/mllam_data_prep/ops/derived_variables.py b/mllam_data_prep/ops/derived_variables.py index 8dafcd0..2e30f12 100644 --- a/mllam_data_prep/ops/derived_variables.py +++ b/mllam_data_prep/ops/derived_variables.py @@ -95,12 +95,18 @@ def derive_variables(ds, 
derived_variables, chunking): derived_field = _check_for_required_attributes( derived_field, expected_field_attributes ) + derived_field = _return_dropped_coordinates( + derived_field, ds_input, required_coordinates, chunks + ) ds_derived_vars[derived_field.name] = derived_field elif isinstance(derived_field, tuple) and all( isinstance(field, xr.DataArray) for field in derived_field ): for field in derived_field: field = _check_for_required_attributes(field, expected_field_attributes) + field = _return_dropped_coordinates( + field, ds_input, required_coordinates, chunks + ) ds_derived_vars[field.name] = field else: raise TypeError( @@ -108,11 +114,6 @@ def derive_variables(ds, derived_variables, chunking): f" but got {type(derived_field)}." ) - # Add back dropped coordinates - ds_derived_vars = _return_dropped_coordinates( - ds_derived_vars, ds_input, required_coordinates, chunks - ) - return ds_derived_vars @@ -245,16 +246,14 @@ def _check_for_required_attributes(field, expected_attributes): return field -def _return_dropped_coordinates( - ds_derived_vars, ds_input, required_coordinates, chunks -): +def _return_dropped_coordinates(derived_field, ds_input, required_coordinates, chunks): """ - Return the coordinates that have been reset. + Return the coordinates that have been dropped/reset. 
Parameters ---------- - ds_derived_vars: xr.Dataset - Dataset with derived variables + derived_field: xr.Dataset + Derived variable ds_input: xr.Dataset Input dataset for deriving variables required_coordinates: List[str] @@ -265,14 +264,14 @@ def _return_dropped_coordinates( Returns ------- - ds_derived_vars: xr.Dataset + derived_field: xr.Dataset Dataset with derived variables, now also with dropped coordinates returned """ for req_coord in required_coordinates: if req_coord in chunks: - ds_derived_vars.coords[req_coord] = ds_input[req_coord] + derived_field.coords[req_coord] = ds_input[req_coord] - return ds_derived_vars + return derived_field def calculate_toa_radiation(lat, lon, time): From 80cf058440b421747e8745c0b95b7feea405aff2 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 19 Dec 2024 07:27:42 +0000 Subject: [PATCH 64/96] Adds dims to the dataset to make it work with derived variables that don't have all dimensions. This way we don't need to broadcast these variables explicitly to all dimensions. 
--- mllam_data_prep/ops/derived_variables.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mllam_data_prep/ops/derived_variables.py b/mllam_data_prep/ops/derived_variables.py index 2e30f12..9876e42 100644 --- a/mllam_data_prep/ops/derived_variables.py +++ b/mllam_data_prep/ops/derived_variables.py @@ -43,6 +43,10 @@ def derive_variables(ds, derived_variables, chunking): ds_derived_vars = xr.Dataset() ds_derived_vars.attrs.update(ds.attrs) + # Add dimensions to the new dataset + for dim in ds.dims: + ds_derived_vars = ds_derived_vars.assign_coords({dim: ds.coords[dim]}) + for _, derived_variable in derived_variables.items(): required_kwargs = derived_variable.kwargs function_name = derived_variable.function From da0c171245b12b6aae4df10af33b5d97317d4a71 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 19 Dec 2024 09:12:43 +0000 Subject: [PATCH 65/96] Add ability to have 'variables' and 'derived_variables' in the same dataset - Output dataset is created in 'create_dataset' instead of in the 'subset_dataset' and 'derive_variables' functions. - Rename dataset variables to make it clearer what they are and also make them more consistent between 'subset_dataset' and 'derive_variables'. - Add function for aligning the derived variables to the correct output dimensions. - Move the 'derived_variables' from their own dataset in the example config file to the 'danra_surface' dataset, as it is now possible to combine them. 
--- example.danra.yaml | 16 +--- mllam_data_prep/create_dataset.py | 14 +++- mllam_data_prep/ops/derived_variables.py | 97 +++++++++++++++++------- mllam_data_prep/ops/subsetting.py | 19 ++--- 4 files changed, 89 insertions(+), 57 deletions(-) diff --git a/example.danra.yaml b/example.danra.yaml index d6a9468..30682ff 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -61,22 +61,8 @@ inputs: variables: # use surface incoming shortwave radiation as forcing - swavr0m - dim_mapping: - time: - method: rename - dim: time - grid_index: - method: stack - dims: [x, y] - forcing_feature: - method: stack_variables_by_var_name - name_format: "{var_name}" - target_output_variable: forcing - - danra_derived_forcings: - path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr - dims: [time, x, y] derived_variables: + # derive variables to be used as forcings toa_radiation: kwargs: time: time diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 19bf4df..698aed9 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -145,16 +145,26 @@ def create_dataset(config: Config): except Exception as ex: raise Exception(f"Error loading dataset {dataset_name} from {path}") from ex + # Initialize the output dataset and add dimensions + ds = xr.Dataset() + ds.attrs.update(ds_input.attrs) + for dim in ds_input.dims: + ds = ds.assign_coords({dim: ds_input.coords[dim]}) + if variables: logger.info(f"Subsetting dataset {dataset_name}") ds = subset_dataset( - ds=ds_input, variables=variables, chunking=chunking_config + ds_subset=ds, + ds_input=ds_input, + variables=variables, + chunking=chunking_config, ) if derived_variables: logger.info(f"Deriving variables from {dataset_name}") ds = derive_variables( - ds=ds_input, + ds=ds, + ds_input=ds_input, derived_variables=derived_variables, chunking=chunking_config, ) diff --git a/mllam_data_prep/ops/derived_variables.py 
b/mllam_data_prep/ops/derived_variables.py index 9876e42..7502deb 100644 --- a/mllam_data_prep/ops/derived_variables.py +++ b/mllam_data_prep/ops/derived_variables.py @@ -19,14 +19,16 @@ REQUIRED_FIELD_ATTRIBUTES = ["units", "long_name"] -def derive_variables(ds, derived_variables, chunking): +def derive_variables(ds, ds_input, derived_variables, chunking): """ Load the dataset, and derive the specified variables Parameters --------- ds : xr.Dataset - Source dataset + Output dataset + ds_input : xr.Dataset + Input/source dataset derived_variables : Dict[str, DerivedVariable] Dictionary with the variables to derive with keys as the variable names and values with entries for kwargs and function to use in @@ -41,11 +43,7 @@ def derive_variables(ds, derived_variables, chunking): Dataset with derived variables included """ - ds_derived_vars = xr.Dataset() - ds_derived_vars.attrs.update(ds.attrs) - # Add dimensions to the new dataset - for dim in ds.dims: - ds_derived_vars = ds_derived_vars.assign_coords({dim: ds.coords[dim]}) + target_dims = list(ds_input.sizes.keys()) for _, derived_variable in derived_variables.items(): required_kwargs = derived_variable.kwargs @@ -62,63 +60,69 @@ def derive_variables(ds, derived_variables, chunking): if key in ["lat", "lon"]: latlon_coords_to_include[key] = required_kwargs.pop(key) - # Get input dataset for calculating derived variables - ds_input = ds[required_kwargs.keys()] + # Get subset of input dataset for calculating derived variables + ds_subset = ds_input[required_kwargs.keys()] # Any coordinates needed for the derivation, for which chunking should be performed, # should be converted to variables since it is not possible for *indexed* coordinates # to be chunked dask arrays chunks = { - dim: chunking.get(dim, int(ds_input[dim].count())) for dim in ds_input.dims + dim: chunking.get(dim, int(ds_subset[dim].count())) + for dim in ds_subset.dims } required_coordinates = [ - req_var for req_var in required_kwargs.keys() if req_var 
in ds_input.coords + req_var for req_var in required_kwargs.keys() if req_var in ds_subset.coords ] - ds_input = ds_input.drop_indexes(required_coordinates, errors="ignore") + ds_subset = ds_subset.drop_indexes(required_coordinates, errors="ignore") for req_coord in required_coordinates: if req_coord in chunks: - ds_input = ds_input.reset_coords(req_coord) + ds_subset = ds_subset.reset_coords(req_coord) # Chunk the dataset - ds_input = _chunk_dataset(ds_input, chunks) + ds_subset = _chunk_dataset(ds_subset, chunks) # Add function arguments to kwargs kwargs = {} if len(latlon_coords_to_include): - latlon = get_latlon_coords_for_input(ds) + latlon = get_latlon_coords_for_input(ds_input) for key, val in latlon_coords_to_include.items(): kwargs[val] = latlon[key] - kwargs.update({val: ds_input[key] for key, val in required_kwargs.items()}) + kwargs.update({val: ds_subset[key] for key, val in required_kwargs.items()}) func = _get_derived_variable_function(function_name) # Calculate the derived variable derived_field = func(**kwargs) # Check that the derived field has the necessary attributes (REQUIRED_FIELD_ATTRIBUTES) - # set and add it to the dataset + # set, return any dropped/reset coordinates, align it to the output dataset dimensions + # (if necessary) and add it to the dataset if isinstance(derived_field, xr.DataArray): derived_field = _check_for_required_attributes( derived_field, expected_field_attributes ) derived_field = _return_dropped_coordinates( - derived_field, ds_input, required_coordinates, chunks + derived_field, ds_subset, required_coordinates, chunks ) - ds_derived_vars[derived_field.name] = derived_field + derived_field = _align_derived_variable( + derived_field, ds_input, target_dims + ) + ds[derived_field.name] = derived_field elif isinstance(derived_field, tuple) and all( isinstance(field, xr.DataArray) for field in derived_field ): for field in derived_field: field = _check_for_required_attributes(field, expected_field_attributes) field = 
_return_dropped_coordinates( - field, ds_subset, required_coordinates, chunks ) - ds_derived_vars[field.name] = field + field = _align_derived_variable(field, ds_input, target_dims) + ds[field.name] = field else: raise TypeError( "Expected an instance of xr.DataArray or tuple(xr.DataArray)," f" but got {type(derived_field)}." ) - return ds_derived_vars + return ds def _chunk_dataset(ds, chunks): @@ -250,7 +254,7 @@ def _check_for_required_attributes(field, expected_attributes): return field -def _return_dropped_coordinates(derived_field, ds_input, required_coordinates, chunks): +def _return_dropped_coordinates(derived_field, ds, required_coordinates, chunks): """ Return the coordinates that have been dropped/reset. @@ -258,8 +262,8 @@ def _return_dropped_coordinates(derived_field, ds_input, required_coordinates, c ---------- derived_field: xr.Dataset Derived variable - ds_input: xr.Dataset - Input dataset for deriving variables + ds: xr.Dataset + Dataset with required coordinates required_coordinates: List[str] List of coordinates required for the derived variable chunks: Dict[str, int] @@ -269,15 +273,50 @@ def _return_dropped_coordinates(derived_field, ds_input, required_coordinates, c Returns ------- derived_field: xr.Dataset - Dataset with derived variables, now also with dropped coordinates returned + Derived variable, now also with dropped coordinates returned """ for req_coord in required_coordinates: if req_coord in chunks: - derived_field.coords[req_coord] = ds_input[req_coord] + derived_field.coords[req_coord] = ds[req_coord] return derived_field + def _align_derived_variable(field, ds, target_dims): + """ + Align a derived variable to the target dimensions (ignoring non-dimension coordinates). + + Parameters + ---------- + field: xr.DataArray + Derived field to align + ds: xr.Dataset + Target dataset + target_dims: List[str] + Dimensions to align to (e.g. 
'time', 'y', 'x') + + Returns + ------- + field: xr.DataArray + The derived field aligned to the target dimensions + """ + # Ensure that dimensions are ordered correctly + field = field.transpose( + *[dim for dim in target_dims if dim in field.dims], missing_dims="ignore" + ) + + # Add missing dimensions explicitly + for dim in target_dims: + if dim not in field.dims: + field = field.expand_dims({dim: ds.sizes[dim]}) + + # Broadcast to match only the target dimensions + broadcast_shape = {dim: ds[dim] for dim in target_dims if dim in ds.dims} + field = field.broadcast_like(xr.Dataset(coords=broadcast_shape)) + + return field + + def calculate_toa_radiation(lat, lon, time): """ Function for calculating top-of-atmosphere incoming radiation @@ -467,6 +506,6 @@ def cyclic_encoding(data, data_max): return data_cos, data_sin -def get_latlon_coords_for_input(ds_input): +def get_latlon_coords_for_input(ds): """Dummy function for getting lat and lon.""" - return ds_input[["lat", "lon"]].chunk(-1, -1) + return ds[["lat", "lon"]].chunk(-1, -1) diff --git a/mllam_data_prep/ops/subsetting.py b/mllam_data_prep/ops/subsetting.py index 8cfa8ca..d2ba3a8 100644 --- a/mllam_data_prep/ops/subsetting.py +++ b/mllam_data_prep/ops/subsetting.py @@ -1,15 +1,14 @@ -import xarray as xr - - -def subset_dataset(ds, variables, chunking): +def subset_dataset(ds_subset, ds_input, variables, chunking): """ Select specific variables from the provided the dataset, subset the variables along the specified coordinates and check coordinate units Parameters ---------- - ds : xr.Dataset - Source dataset + ds_subset : xr.Dataset + Subset of ds_input + ds_input : xr.Dataset + Input/source dataset variables : dict Dictionary with the variables to subset with keys as the variable names and values with entries for each @@ -19,11 +18,9 @@ def subset_dataset(ds, variables, chunking): with the chunk size """ - ds_subset = xr.Dataset() - ds_subset.attrs.update(ds.attrs) if isinstance(variables, dict): for var, 
coords_to_sample in variables.items(): - da = ds[var] + da = ds_input[var] for coord, sampling in coords_to_sample.items(): coord_values = sampling.values try: @@ -43,11 +40,11 @@ def subset_dataset(ds, variables, chunking): ds_subset[var] = da elif isinstance(variables, list): try: - ds_subset = ds[variables] + ds_subset = ds_input[variables] except KeyError as ex: raise KeyError( f"Could not find the all variables `{variables}` in the dataset. " - f"The available variables are {list(ds.data_vars)}" + f"The available variables are {list(ds_input.data_vars)}" ) from ex else: raise ValueError("The `variables` argument should be a list or a dictionary") From f61a3b6590cf858683e03dd5ef3cd846c4a67b2e Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 19 Dec 2024 09:32:37 +0000 Subject: [PATCH 66/96] Update README --- README.md | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/README.md b/README.md index fcb903d..48eb1b6 100644 --- a/README.md +++ b/README.md @@ -320,22 +320,8 @@ inputs: variables: # use surface incoming shortwave radiation as forcing - swavr0m - dim_mapping: - time: - method: rename - dim: time - grid_index: - method: stack - dims: [x, y] - forcing_feature: - method: stack_variables_by_var_name - name_format: "{var_name}" - target_output_variable: forcing - - danra_derived_forcings: - path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr - dims: [time, x, y] derived_variables: + # derive variables to be used as forcings toa_radiation: kwargs: time: time From 554f86940e1276f4301a25458e97f1c9537275e2 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 20 Dec 2024 08:10:03 +0000 Subject: [PATCH 67/96] Add 'load_config' function, which wraps 'from_yaml_file' and checks that either 'variables' or 'derived_variables' are included and that if both are included, they don't contain the same variable names --- README.md | 2 +- mllam_data_prep/config.py | 50 ++++++++++++++++++++++++++++++- 
mllam_data_prep/create_dataset.py | 2 +- 3 files changed, 51 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 48eb1b6..b95d186 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ The package can also be used as a python module to create datasets directly, for import mllam_data_prep as mdp config_path = "example.danra.yaml" -config = mdp.Config.from_yaml_file(config_path) +config = mdp.Config.load_config(config_path) ds = mdp.create_dataset(config=config) ``` diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index f114f60..93f407b 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -328,6 +328,54 @@ class Config(dataclass_wizard.JSONWizard, dataclass_wizard.YAMLWizard): class _(JSONWizard.Meta): raise_on_unknown_json_key = True + @staticmethod + def load_config(*args, **kwargs): + """ + Wrapper function for `from_yaml_file` to load config file and validate that: + - either `variables` or `derived_variables` are present in the config + - if both `variables` and `derived_variables` are present, that they don't + add the same variables to the dataset + + Parameters + ---------- + *args: Positional arguments for `from_yaml_file` + **kwargs: Keyword arguments for `from_yaml_file` + + Returns + ------- + config: Config + """ + + # Load the config + config = Config.from_yaml_file(*args, **kwargs) + + for input_dataset in config.inputs.values(): + if not input_dataset.variables and not input_dataset.derived_variables: + raise InvalidConfigException( + "At least one of the keys `variables` and `derived_variables` must be included" + " in the input dataset." 
+ ) + elif input_dataset.variables and input_dataset.derived_variables: + # Check that there are no overlapping variables + if isinstance(input_dataset.variables, list): + variable_vars = input_dataset.variables + elif isinstance(input_dataset.variables, dict): + variable_vars = input_dataset.variables.keys() + else: + raise TypeError( + f"Expected an instance of list or dict, but got {type(input_dataset.variables)}." + ) + derived_variable_vars = input_dataset.derived_variables.keys() + common_vars = list(set(variable_vars) & set(derived_variable_vars)) + if len(common_vars) > 0: + raise InvalidConfigException( + "Both `variables` and `derived_variables` include the following variable name(s):" + f" '{', '.join(common_vars)}'. This is not allowed. Make sure that there" + " are no overlapping variable names between `variables` and `derived_variables`," + f" either by renaming or removing '{', '.join(common_vars)}' from one of them." + ) + return config + if __name__ == "__main__": import argparse @@ -338,7 +386,7 @@ class _(JSONWizard.Meta): ) args = argparser.parse_args() - config = Config.from_yaml_file(args.f) + config = Config.load_config(args.f) import rich rich.print(config) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 698aed9..93cf82d 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -286,7 +286,7 @@ def create_dataset_zarr(fp_config, fp_zarr: str = None): The path to the zarr file to write the dataset to. If not provided, the zarr file will be written to the same directory as the config file with the extension changed to '.zarr'. 
""" - config = Config.from_yaml_file(file=fp_config) + config = Config.load_config(file=fp_config) ds = create_dataset(config=config) From 085aae33259f507668db74f5352c59b08b281091 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 20 Dec 2024 08:20:34 +0000 Subject: [PATCH 68/96] Update README --- README.md | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index b95d186..034aa60 100644 --- a/README.md +++ b/README.md @@ -175,32 +175,18 @@ inputs: variables: # use surface incoming shortwave radiation as forcing - swavr0m - dim_mapping: - time: - method: rename - dim: time - grid_index: - method: stack - dims: [x, y] - forcing_feature: - method: stack_variables_by_var_name - name_format: "{var_name}" - target_output_variable: forcing - - danra_derived_forcings: - path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr - dims: [time, x, y] derived_variables: + # derive variables to be used as forcings toa_radiation: kwargs: time: time lat: lat lon: lon - function: mllam_data_prep.derived_variables.calculate_toa_radiation + function: mllam_data_prep.ops.derived_variables.calculate_toa_radiation hour_of_day: kwargs: time: time - function: mllam_data_prep.derived_variables.calculate_hour_of_day + function: mllam_data_prep.ops.derived_variables.calculate_hour_of_day dim_mapping: time: method: rename From 980e511a6aa664b1998f1e4be02f0d05eb24f327 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 8 Jan 2025 07:56:45 +0000 Subject: [PATCH 69/96] Move 'chunk_dataset' to the chunking module --- mllam_data_prep/create_dataset.py | 4 +-- mllam_data_prep/ops/chunking.py | 30 ++++++++++++++++++++- mllam_data_prep/ops/derived_variables.py | 33 ++---------------------- mllam_data_prep/ops/subsetting.py | 10 +------ 4 files changed, 34 insertions(+), 43 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 93cf82d..96f672b 100644 --- 
a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -10,6 +10,7 @@ from . import __version__ from .config import Config, InvalidConfigException +from .ops.chunking import chunk_dataset from .ops.derived_variables import derive_variables from .ops.loading import load_input_dataset from .ops.mapping import map_dims_and_variables @@ -157,7 +158,6 @@ def create_dataset(config: Config): ds_subset=ds, ds_input=ds_input, variables=variables, - chunking=chunking_config, ) if derived_variables: @@ -225,7 +225,7 @@ def create_dataset(config: Config): # in the config logger.info(f"Chunking dataset with {chunking_config}") chunks = {dim: chunking_config.get(dim, int(ds[dim].count())) for dim in ds.dims} - ds = ds.chunk(chunks) + ds = chunk_dataset(ds, chunks) splitting = config.output.splitting diff --git a/mllam_data_prep/ops/chunking.py b/mllam_data_prep/ops/chunking.py index 12731e1..dfac4b1 100644 --- a/mllam_data_prep/ops/chunking.py +++ b/mllam_data_prep/ops/chunking.py @@ -23,7 +23,6 @@ def check_chunk_size(ds, chunks): Dataset with chunking applied """ - # Check the chunk size for var_name, var_data in ds.data_vars.items(): total_size = 1 @@ -42,3 +41,32 @@ def check_chunk_size(ds, chunks): logger.warning( f"The chunk size for '{var_name}' exceeds '{CHUNK_MAX_SIZE_WARNING}' GB." ) + + +def chunk_dataset(ds, chunks): + """ + Check the chunk size and chunk dataset. 
+ + Parameters + ---------- + ds: xr.Dataset + Dataset to be chunked + chunks: Dict[str, int] + Dictionary with keys as dimensions to be chunked and + chunk sizes as the values + + Returns + ------- + ds: xr.Dataset + Dataset with chunking applied + """ + # Check the chunk size + check_chunk_size(ds, chunks) + + # Try chunking + try: + ds = ds.chunk(chunks) + except Exception as ex: + raise Exception(f"Error chunking dataset: {ex}") + + return ds diff --git a/mllam_data_prep/ops/derived_variables.py b/mllam_data_prep/ops/derived_variables.py index 7502deb..3f09359 100644 --- a/mllam_data_prep/ops/derived_variables.py +++ b/mllam_data_prep/ops/derived_variables.py @@ -14,7 +14,7 @@ import xarray as xr from loguru import logger -from .chunking import check_chunk_size +from .chunking import chunk_dataset REQUIRED_FIELD_ATTRIBUTES = ["units", "long_name"] @@ -79,7 +79,7 @@ def derive_variables(ds, ds_input, derived_variables, chunking): ds_subset = ds_subset.reset_coords(req_coord) # Chunk the dataset - ds_subset = _chunk_dataset(ds_subset, chunks) + ds_subset = chunk_dataset(ds_subset, chunks) # Add function arguments to kwargs kwargs = {} @@ -125,35 +125,6 @@ def derive_variables(ds, ds_input, derived_variables, chunking): return ds -def _chunk_dataset(ds, chunks): - """ - Check the chunk size and chunk dataset. 
- - Parameters - ---------- - ds: xr.Dataset - Dataset to be chunked - chunks: Dict[str, int] - Dictionary with keys as dimensions to be chunked and - chunk sizes as the values - - Returns - ------- - ds: xr.Dataset - Dataset with chunking applied - """ - # Check the chunk size - check_chunk_size(ds, chunks) - - # Try chunking - try: - ds = ds.chunk(chunks) - except Exception as ex: - raise Exception(f"Error chunking dataset: {ex}") - - return ds - - def _get_derived_variable_function(function_namespace): """ Function for getting the function for deriving diff --git a/mllam_data_prep/ops/subsetting.py b/mllam_data_prep/ops/subsetting.py index d2ba3a8..c841533 100644 --- a/mllam_data_prep/ops/subsetting.py +++ b/mllam_data_prep/ops/subsetting.py @@ -1,4 +1,4 @@ -def subset_dataset(ds_subset, ds_input, variables, chunking): +def subset_dataset(ds_subset, ds_input, variables): """ Select specific variables from the provided the dataset, subset the variables along the specified coordinates and check coordinate units @@ -13,9 +13,6 @@ def subset_dataset(ds_subset, ds_input, variables, chunking): Dictionary with the variables to subset with keys as the variable names and values with entries for each coordinate and coordinate values to extract - chunking: dict - Dictionary with keys as the dimensions to chunk along and values - with the chunk size """ if isinstance(variables, dict): @@ -49,9 +46,4 @@ def subset_dataset(ds_subset, ds_input, variables, chunking): else: raise ValueError("The `variables` argument should be a list or a dictionary") - chunks = { - dim: chunking.get(dim, int(ds_subset[dim].count())) for dim in ds_subset.dims - } - ds_subset = ds_subset.chunk(chunks) - return ds_subset From b6e80d5042f7010a4b5eecb0f794706d51e25b98 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 8 Jan 2025 08:28:28 +0000 Subject: [PATCH 70/96] Update error message for when missing both 'variables' and 'derived_variables' in an input dataset in the config file --- 
mllam_data_prep/config.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 93f407b..428ca2c 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -349,11 +349,13 @@ def load_config(*args, **kwargs): # Load the config config = Config.from_yaml_file(*args, **kwargs) - for input_dataset in config.inputs.values(): + for input_dataset_name, input_dataset in config.inputs.items(): if not input_dataset.variables and not input_dataset.derived_variables: raise InvalidConfigException( - "At least one of the keys `variables` and `derived_variables` must be included" - " in the input dataset." + f"Input dataset '{input_dataset_name}' is missing the keys `variables` and/or" + " `derived_variables`. Make sure that you update the config so that the input" + f" dataset '{input_dataset_name}' contains at least either a `variables` or" + " `derived_variables` section." ) elif input_dataset.variables and input_dataset.derived_variables: # Check so that there are no overlapping variables From d6c1b36668201435b06f290dc644a1eef9d07c75 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 8 Jan 2025 10:30:54 +0000 Subject: [PATCH 71/96] Move the deriving-functions to a separate module --- example.danra.yaml | 4 +- mllam_data_prep/ops/__init__.py | 1 + .../ops/derive_variable/__init__.py | 1 + .../ops/derive_variable/physical_field.py | 74 +++++++ .../ops/derive_variable/time_components.py | 112 ++++++++++ mllam_data_prep/ops/derived_variables.py | 201 +----------------- 6 files changed, 199 insertions(+), 194 deletions(-) create mode 100644 mllam_data_prep/ops/derive_variable/__init__.py create mode 100644 mllam_data_prep/ops/derive_variable/physical_field.py create mode 100644 mllam_data_prep/ops/derive_variable/time_components.py diff --git a/example.danra.yaml b/example.danra.yaml index 30682ff..59f207c 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -68,11 +68,11 @@ 
inputs: time: time lat: lat lon: lon - function: mllam_data_prep.ops.derived_variables.calculate_toa_radiation + function: mllam_data_prep.ops.derive_variable.physical_field.calculate_toa_radiation hour_of_day: kwargs: time: time - function: mllam_data_prep.ops.derived_variables.calculate_hour_of_day + function: mllam_data_prep.ops.derive_variable.time_components.calculate_hour_of_day dim_mapping: time: method: rename diff --git a/mllam_data_prep/ops/__init__.py b/mllam_data_prep/ops/__init__.py index e69de29..877cdfb 100644 --- a/mllam_data_prep/ops/__init__.py +++ b/mllam_data_prep/ops/__init__.py @@ -0,0 +1 @@ +from . import derive_variable diff --git a/mllam_data_prep/ops/derive_variable/__init__.py b/mllam_data_prep/ops/derive_variable/__init__.py new file mode 100644 index 0000000..2c94709 --- /dev/null +++ b/mllam_data_prep/ops/derive_variable/__init__.py @@ -0,0 +1 @@ +from . import physical_field, time_components diff --git a/mllam_data_prep/ops/derive_variable/physical_field.py b/mllam_data_prep/ops/derive_variable/physical_field.py new file mode 100644 index 0000000..d7b9617 --- /dev/null +++ b/mllam_data_prep/ops/derive_variable/physical_field.py @@ -0,0 +1,74 @@ +""" +Contains functions used to derive physical fields. This can be both +fields that can be derived from analytical expressions and are functions +of coordinate values (e.g. top-of-atmosphere incoming radiation is a function +of time and lat/lon location), but also of other physical fields, such as +wind speed, which is a function of both meridional and zonal wind components. +""" +import datetime + +import numpy as np +import xarray as xr +from loguru import logger + + +def calculate_toa_radiation(lat, lon, time): + """ + Function for calculating top-of-atmosphere incoming radiation + + Parameters + ---------- + lat : Union[xr.DataArray, float] + Latitude values. Should be in the range [-90, 90] + lon : Union[xr.DataArray, float] + Longitude values. 
Should be in the range [-180, 180] or [0, 360] + time : Union[xr.DataArray, datetime.datetime] + Time + + Returns + ------- + toa_radiation : Union[xr.DataArray, float] + Top-of-atmosphere incoming radiation + """ + logger.info("Calculating top-of-atmosphere incoming radiation") + + # Solar constant + solar_constant = 1366 # W*m**-2 + + # Different handling if xr.DataArray or datetime object + if isinstance(time, xr.DataArray): + day = time.dt.dayofyear + hour_utc = time.dt.hour + elif isinstance(time, datetime.datetime): + day = time.timetuple().tm_yday + hour_utc = time.hour + else: + raise TypeError( + "Expected an instance of xr.DataArray or datetime object," + f" but got {type(time)}." + ) + + # Eq. 1.6.1a in Solar Engineering of Thermal Processes 4th ed. + # dec: declination - angular position of the sun at solar noon w.r.t. + # the plane of the equator + dec = np.pi / 180 * 23.45 * np.sin(2 * np.pi * (284 + day) / 365) + + utc_solar_time = hour_utc + lon / 15 + hour_angle = 15 * (utc_solar_time - 12) + + # Eq. 1.6.2 with beta=0 in Solar Engineering of Thermal Processes 4th ed. + # cos_sza: Cosine of solar zenith angle + cos_sza = np.sin(lat * np.pi / 180) * np.sin(dec) + np.cos( + lat * np.pi / 180 + ) * np.cos(dec) * np.cos(hour_angle * np.pi / 180) + + # Where TOA radiation is negative, set to 0 + toa_radiation = xr.where(solar_constant * cos_sza < 0, 0, solar_constant * cos_sza) + + if isinstance(toa_radiation, xr.DataArray): + # Add attributes + toa_radiation.name = "toa_radiation" + toa_radiation.attrs["long_name"] = "top-of-atmosphere incoming radiation" + toa_radiation.attrs["units"] = "W*m**-2" + + return toa_radiation diff --git a/mllam_data_prep/ops/derive_variable/time_components.py b/mllam_data_prep/ops/derive_variable/time_components.py new file mode 100644 index 0000000..eeec0da --- /dev/null +++ b/mllam_data_prep/ops/derive_variable/time_components.py @@ -0,0 +1,112 @@ +""" +Contains functions used to derive time component fields, such as e.g. 
day of year +and hour of day. +""" +import datetime + +import xarray as xr +from loguru import logger + +from ..derived_variables import cyclic_encoding + + +def calculate_hour_of_day(time): + """ + Function for calculating hour of day features with a cyclic encoding + + Parameters + ---------- + time : Union[xr.DataArray, datetime.datetime] + Time + + Returns + ------- + hour_of_day_cos: Union[xr.DataArray, float] + cosine of the hour of day + hour_of_day_sin: Union[xr.DataArray, float] + sine of the hour of day + """ + logger.info("Calculating hour of day") + + # Get the hour of the day + if isinstance(time, xr.DataArray): + hour_of_day = time.dt.hour + elif isinstance(time, datetime.datetime): + hour_of_day = time.hour + else: + raise TypeError( + "Expected an instance of xr.DataArray or datetime object," + f" but got {type(time)}." + ) + + # Cyclic encoding of hour of day + hour_of_day_cos, hour_of_day_sin = cyclic_encoding(hour_of_day, 24) + + if isinstance(hour_of_day_cos, xr.DataArray): + # Add attributes + hour_of_day_cos.name = "hour_of_day_cos" + hour_of_day_cos.attrs[ + "long_name" + ] = "Cosine component of cyclically encoded hour of day" + hour_of_day_cos.attrs["units"] = "1" + + if isinstance(hour_of_day_sin, xr.DataArray): + # Add attributes + hour_of_day_sin.name = "hour_of_day_sin" + hour_of_day_sin.attrs[ + "long_name" + ] = "Sine component of cyclically encoded hour of day" + hour_of_day_sin.attrs["units"] = "1" + + return hour_of_day_cos, hour_of_day_sin + + +def calculate_day_of_year(time): + """ + Function for calculating day of year features with a cyclic encoding + + Parameters + ---------- + time : Union[xr.DataArray, datetime.datetime] + Time + + Returns + ------- + day_of_year_cos: Union[xr.DataArray, float] + cosine of the day of year + day_of_year_sin: Union[xr.DataArray, float] + sine of the day of year + """ + logger.info("Calculating day of year") + + # Get the day of year + if isinstance(time, xr.DataArray): + day_of_year = 
time.dt.dayofyear + elif isinstance(time, datetime.datetime): + day_of_year = time.timetuple().tm_yday + else: + raise TypeError( + "Expected an instance of xr.DataArray or datetime object," + f" but got {type(time)}." + ) + + # Cyclic encoding of day of year - use 366 to include leap years! + day_of_year_cos, day_of_year_sin = cyclic_encoding(day_of_year, 366) + + if isinstance(day_of_year_cos, xr.DataArray): + # Add attributes + day_of_year_cos.name = "day_of_year_cos" + day_of_year_cos.attrs[ + "long_name" + ] = "Cosine component of cyclically encoded day of year" + day_of_year_cos.attrs["units"] = "1" + + if isinstance(day_of_year_sin, xr.DataArray): + # Add attributes + day_of_year_sin.name = "day_of_year_sin" + day_of_year_sin.attrs[ + "long_name" + ] = "Sine component of cyclically encoded day of year" + day_of_year_sin.attrs["units"] = "1" + + return day_of_year_cos, day_of_year_sin diff --git a/mllam_data_prep/ops/derived_variables.py b/mllam_data_prep/ops/derived_variables.py index 3f09359..9824986 100644 --- a/mllam_data_prep/ops/derived_variables.py +++ b/mllam_data_prep/ops/derived_variables.py @@ -6,7 +6,6 @@ but also of other physical fields (wind-speed is a function of both meridional and zonal wind components). 
""" -import datetime import importlib import sys @@ -47,7 +46,7 @@ def derive_variables(ds, ds_input, derived_variables, chunking): for _, derived_variable in derived_variables.items(): required_kwargs = derived_variable.kwargs - function_name = derived_variable.function + function_namespace = derived_variable.function expected_field_attributes = derived_variable.attrs # Separate the lat,lon from the required variables as these will be derived separately @@ -88,7 +87,7 @@ def derive_variables(ds, ds_input, derived_variables, chunking): for key, val in latlon_coords_to_include.items(): kwargs[val] = latlon[key] kwargs.update({val: ds_subset[key] for key, val in required_kwargs.items()}) - func = _get_derived_variable_function(function_name) + func = _get_derived_variable_function(function_namespace) # Calculate the derived variable derived_field = func(**kwargs) @@ -133,42 +132,24 @@ def _get_derived_variable_function(function_namespace): Parameters ---------- function_namespace: str - The full function namespace or just the function name - if it is a function included in this module. + The full function namespace Returns ------- function: object Function for deriving the specified variable """ - # Get the name of the calling module - calling_module = globals()["__name__"] - # Get module and function names module_name, _, function_name = function_namespace.rpartition(".") - # Check if the module_name is pointing to here (the calling module or empty "") - # If it does, then use globals() to get the function otherwise import the - # correct module and get the correct function - if module_name in [calling_module, ""]: - function = globals().get(function_name) - if not function: - raise TypeError( - f"Function '{function_namespace}' was not found in '{calling_module}'." 
- f" Check that you have specified the correct function name" - " and/or that you have defined the full function namespace if you" - " want to use a function defined outside of of the current module" - f" '{calling_module}'." - ) + # Import the module (if necessary) + if module_name in sys.modules: + module = sys.modules[module_name] else: - # Check if the module is already imported - if module_name in sys.modules: - module = module_name - else: - module = importlib.import_module(module_name) + module = importlib.import_module(module_name) - # Get the function from the module - function = getattr(module, function_name) + # Get the function from the module + function = getattr(module, function_name) return function @@ -288,170 +269,6 @@ def _align_derived_variable(field, ds, target_dims): return field -def calculate_toa_radiation(lat, lon, time): - """ - Function for calculating top-of-atmosphere incoming radiation - - Parameters - ---------- - lat : Union[xr.DataArray, float] - Latitude values. Should be in the range [-90, 90] - lon : Union[xr.DataArray, float] - Longitude values. Should be in the range [-180, 180] or [0, 360] - time : Union[xr.DataArray, datetime.datetime] - Time - - Returns - ------- - toa_radiation : Union[xr.DataArray, float] - Top-of-atmosphere incoming radiation - """ - logger.info("Calculating top-of-atmosphere incoming radiation") - - # Solar constant - solar_constant = 1366 # W*m**-2 - - # Different handling if xr.DataArray or datetime object - if isinstance(time, xr.DataArray): - day = time.dt.dayofyear - hour_utc = time.dt.hour - elif isinstance(time, datetime.datetime): - day = time.timetuple().tm_yday - hour_utc = time.hour - else: - raise TypeError( - "Expected an instance of xr.DataArray or datetime object," - f" but got {type(time)}." - ) - - # Eq. 1.6.1a in Solar Engineering of Thermal Processes 4th ed. - # dec: declination - angular position of the sun at solar noon w.r.t. 
- # the plane of the equator - dec = np.pi / 180 * 23.45 * np.sin(2 * np.pi * (284 + day) / 365) - - utc_solar_time = hour_utc + lon / 15 - hour_angle = 15 * (utc_solar_time - 12) - - # Eq. 1.6.2 with beta=0 in Solar Engineering of Thermal Processes 4th ed. - # cos_sza: Cosine of solar zenith angle - cos_sza = np.sin(lat * np.pi / 180) * np.sin(dec) + np.cos( - lat * np.pi / 180 - ) * np.cos(dec) * np.cos(hour_angle * np.pi / 180) - - # Where TOA radiation is negative, set to 0 - toa_radiation = xr.where(solar_constant * cos_sza < 0, 0, solar_constant * cos_sza) - - if isinstance(toa_radiation, xr.DataArray): - # Add attributes - toa_radiation.name = "toa_radiation" - toa_radiation.attrs["long_name"] = "top-of-atmosphere incoming radiation" - toa_radiation.attrs["units"] = "W*m**-2" - - return toa_radiation - - -def calculate_hour_of_day(time): - """ - Function for calculating hour of day features with a cyclic encoding - - Parameters - ---------- - time : Union[xr.DataArray, datetime.datetime] - Time - - Returns - ------- - hour_of_day_cos: Union[xr.DataArray, float] - cosine of the hour of day - hour_of_day_sin: Union[xr.DataArray, float] - sine of the hour of day - """ - logger.info("Calculating hour of day") - - # Get the hour of the day - if isinstance(time, xr.DataArray): - hour_of_day = time.dt.hour - elif isinstance(time, datetime.datetime): - hour_of_day = time.hour - else: - raise TypeError( - "Expected an instance of xr.DataArray or datetime object," - f" but got {type(time)}." 
- ) - - # Cyclic encoding of hour of day - hour_of_day_cos, hour_of_day_sin = cyclic_encoding(hour_of_day, 24) - - if isinstance(hour_of_day_cos, xr.DataArray): - # Add attributes - hour_of_day_cos.name = "hour_of_day_cos" - hour_of_day_cos.attrs[ - "long_name" - ] = "Cosine component of cyclically encoded hour of day" - hour_of_day_cos.attrs["units"] = "1" - - if isinstance(hour_of_day_sin, xr.DataArray): - # Add attributes - hour_of_day_sin.name = "hour_of_day_sin" - hour_of_day_sin.attrs[ - "long_name" - ] = "Sine component of cyclically encoded hour of day" - hour_of_day_sin.attrs["units"] = "1" - - return hour_of_day_cos, hour_of_day_sin - - -def calculate_day_of_year(time): - """ - Function for calculating day of year features with a cyclic encoding - - Parameters - ---------- - time : Union[xr.DataArray, datetime.datetime] - Time - - Returns - ------- - day_of_year_cos: Union[xr.DataArray, float] - cosine of the day of year - day_of_year_sin: Union[xr.DataArray, float] - sine of the day of year - """ - logger.info("Calculating day of year") - - # Get the day of year - if isinstance(time, xr.DataArray): - day_of_year = time.dt.dayofyear - elif isinstance(time, datetime.datetime): - day_of_year = time.timetuple().tm_yday - else: - raise TypeError( - "Expected an instance of xr.DataArray or datetime object," - f" but got {type(time)}." - ) - - # Cyclic encoding of day of year - use 366 to include leap years! 
- day_of_year_cos, day_of_year_sin = cyclic_encoding(day_of_year, 366) - - if isinstance(day_of_year_cos, xr.DataArray): - # Add attributes - day_of_year_cos.name = "day_of_year_cos" - day_of_year_cos.attrs[ - "long_name" - ] = "Cosine component of cyclically encoded day of year" - day_of_year_cos.attrs["units"] = "1" - - if isinstance(day_of_year_sin, xr.DataArray): - # Add attributes - day_of_year_sin.name = "day_of_year_sin" - day_of_year_sin.attrs[ - "long_name" - ] = "Sine component of cyclically encoded day of year" - day_of_year_sin.attrs["units"] = "1" - - return day_of_year_cos, day_of_year_sin - - def cyclic_encoding(data, data_max): """ Cyclic encoding of data From f1e67bc86a64641f56203e17f496935aa156c4f5 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 8 Jan 2025 10:33:11 +0000 Subject: [PATCH 72/96] Update tests --- tests/test_derived_variables.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/test_derived_variables.py b/tests/test_derived_variables.py index 786e064..f0ae5bc 100644 --- a/tests/test_derived_variables.py +++ b/tests/test_derived_variables.py @@ -79,10 +79,14 @@ def test_toa_radiation(lat, lon, time): side_effect=mock_cyclic_encoding, ): if isinstance(time, (xr.DataArray, datetime.datetime)): - mdp.ops.derived_variables.calculate_toa_radiation(lat, lon, time) + mdp.ops.derive_variable.physical_field.calculate_toa_radiation( + lat, lon, time + ) else: with pytest.raises(TypeError): - mdp.ops.derived_variables.calculate_toa_radiation(lat, lon, time) + mdp.ops.derive_variable.physical_field.calculate_toa_radiation( + lat, lon, time + ) @pytest.mark.parametrize("time", TIME) @@ -95,10 +99,10 @@ def test_hour_of_day(time): side_effect=mock_cyclic_encoding, ): if isinstance(time, (xr.DataArray, datetime.datetime)): - mdp.ops.derived_variables.calculate_hour_of_day(time) + mdp.ops.derive_variable.time_components.calculate_hour_of_day(time) else: with pytest.raises(TypeError): - 
mdp.ops.derived_variables.calculate_hour_of_day(time) + mdp.ops.derive_variable.time_components.calculate_hour_of_day(time) @pytest.mark.parametrize("time", TIME) @@ -111,7 +115,7 @@ def test_day_of_year(time): side_effect=mock_cyclic_encoding, ): if isinstance(time, (xr.DataArray, datetime.datetime)): - mdp.ops.derived_variables.calculate_day_of_year(time) + mdp.ops.derive_variable.time_components.calculate_day_of_year(time) else: with pytest.raises(TypeError): - mdp.ops.derived_variables.calculate_day_of_year(time) + mdp.ops.derive_variable.time_components.calculate_day_of_year(time) From 89e9ad83d0f55c2b17869ca60207d61112a445a4 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 8 Jan 2025 10:45:10 +0000 Subject: [PATCH 73/96] Rename (and move): 'mllam_data_prep/ops/derived_variables.py' -> 'mllam_data_prep/ops/derive_variable/dispatcher.py' --- mllam_data_prep/create_dataset.py | 2 +- mllam_data_prep/ops/derive_variable/__init__.py | 2 +- .../ops/{derived_variables.py => derive_variable/dispatcher.py} | 2 +- mllam_data_prep/ops/derive_variable/time_components.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) rename mllam_data_prep/ops/{derived_variables.py => derive_variable/dispatcher.py} (99%) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 96f672b..3bac8fc 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -11,7 +11,7 @@ from . 
import __version__ from .config import Config, InvalidConfigException from .ops.chunking import chunk_dataset -from .ops.derived_variables import derive_variables +from .ops.derive_variable.dispatcher import derive_variables from .ops.loading import load_input_dataset from .ops.mapping import map_dims_and_variables from .ops.selection import select_by_kwargs diff --git a/mllam_data_prep/ops/derive_variable/__init__.py b/mllam_data_prep/ops/derive_variable/__init__.py index 2c94709..29e9c65 100644 --- a/mllam_data_prep/ops/derive_variable/__init__.py +++ b/mllam_data_prep/ops/derive_variable/__init__.py @@ -1 +1 @@ -from . import physical_field, time_components +from . import dispatcher, physical_field, time_components diff --git a/mllam_data_prep/ops/derived_variables.py b/mllam_data_prep/ops/derive_variable/dispatcher.py similarity index 99% rename from mllam_data_prep/ops/derived_variables.py rename to mllam_data_prep/ops/derive_variable/dispatcher.py index 9824986..39a3a71 100644 --- a/mllam_data_prep/ops/derived_variables.py +++ b/mllam_data_prep/ops/derive_variable/dispatcher.py @@ -13,7 +13,7 @@ import xarray as xr from loguru import logger -from .chunking import chunk_dataset +from ..chunking import chunk_dataset REQUIRED_FIELD_ATTRIBUTES = ["units", "long_name"] diff --git a/mllam_data_prep/ops/derive_variable/time_components.py b/mllam_data_prep/ops/derive_variable/time_components.py index eeec0da..b99ab51 100644 --- a/mllam_data_prep/ops/derive_variable/time_components.py +++ b/mllam_data_prep/ops/derive_variable/time_components.py @@ -7,7 +7,7 @@ import xarray as xr from loguru import logger -from ..derived_variables import cyclic_encoding +from .dispatcher import cyclic_encoding def calculate_hour_of_day(time): From bdf34669fa116b9445802fd7110fcafe703b46d9 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 15 Jan 2025 06:44:09 +0000 Subject: [PATCH 74/96] Use the __post_init__() method to validate the config --- mllam_data_prep/config.py | 99 
+++++++++++++++---------------- mllam_data_prep/create_dataset.py | 2 +- 2 files changed, 49 insertions(+), 52 deletions(-) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 428ca2c..702dff0 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -9,6 +9,50 @@ class InvalidConfigException(Exception): pass +def validate_config(config_inputs): + """ + Validate that, in the config: + - either `variables` or `derived_variables` are present in the config + - if both `variables` and `derived_variables` are present, that they don't + add the same variables to the dataset + + Parameters + ---------- + config_inputs: Dict[str, InputDataset] + + Returns + ------- + """ + + for input_dataset_name, input_dataset in config_inputs.items(): + if not input_dataset.variables and not input_dataset.derived_variables: + raise InvalidConfigException( + f"Input dataset '{input_dataset_name}' is missing the keys `variables` and/or" + " `derived_variables`. Make sure that you update the config so that the input" + f" dataset '{input_dataset_name}' contains at least either a `variables` or" + " `derived_variables` section." + ) + elif input_dataset.variables and input_dataset.derived_variables: + # Check so that there are no overlapping variables + if isinstance(input_dataset.variables, list): + variable_vars = input_dataset.variables + elif isinstance(input_dataset.variables, dict): + variable_vars = input_dataset.variables.keys() + else: + raise TypeError( + f"Expected an instance of list or dict, but got {type(input_dataset.variables)}." + ) + derived_variable_vars = input_dataset.derived_variables.keys() + common_vars = list(set(variable_vars) & set(derived_variable_vars)) + if len(common_vars) > 0: + raise InvalidConfigException( + "Both `variables` and `derived_variables` include the following variables name(s):" + f" '{', '.join(common_vars)}'. This is not allowed. 
Make sure that there" + " are no overlapping variable names between `variables` and `derived_variables`," + f" either by renaming or removing '{', '.join(common_vars)}' from one of them." + ) + + @dataclass class Range: """ @@ -325,59 +369,12 @@ class Config(dataclass_wizard.JSONWizard, dataclass_wizard.YAMLWizard): dataset_version: str extra: Dict[str, Any] = None + def __post_init__(self): + validate_config(self.inputs) + class _(JSONWizard.Meta): raise_on_unknown_json_key = True - @staticmethod - def load_config(*args, **kwargs): - """ - Wrapper function for `from_yaml_file` to load config file and validate that: - - either `variables` or `derived_variables` are present in the config - - if both `variables` and `derived_variables` are present, that they don't - add the same variables to the dataset - - Parameters - ---------- - *args: Positional arguments for `from_yaml_file` - **kwargs: Keyword arguments for `from_yaml_file` - - Returns - ------- - config: Config - """ - - # Load the config - config = Config.from_yaml_file(*args, **kwargs) - - for input_dataset_name, input_dataset in config.inputs.items(): - if not input_dataset.variables and not input_dataset.derived_variables: - raise InvalidConfigException( - f"Input dataset '{input_dataset_name}' is missing the keys `variables` and/or" - " `derived_variables`. Make sure that you update the config so that the input" - f" dataset '{input_dataset_name}' contains at least either a `variables` or" - " `derived_variables` section." - ) - elif input_dataset.variables and input_dataset.derived_variables: - # Check so that there are no overlapping variables - if isinstance(input_dataset.variables, list): - variable_vars = input_dataset.variables - elif isinstance(input_dataset.variables, dict): - variable_vars = input_dataset.variables.keys() - else: - raise TypeError( - f"Expected an instance of list or dict, but got {type(input_dataset.variables)}." 
- ) - derived_variable_vars = input_dataset.derived_variables.keys() - common_vars = list(set(variable_vars) & set(derived_variable_vars)) - if len(common_vars) > 0: - raise InvalidConfigException( - "Both `variables` and `derived_variables` include the following variables name(s):" - f" '{', '.join(common_vars)}'. This is not allowed. Make sure that there" - " are no overlapping variable names between `variables` and `derived_variables`," - f" either by renaming or removing '{', '.join(common_vars)}' from one of them." - ) - return config - if __name__ == "__main__": import argparse @@ -388,7 +385,7 @@ def load_config(*args, **kwargs): ) args = argparser.parse_args() - config = Config.load_config(args.f) + config = Config.from_yaml_file(args.f) import rich rich.print(config) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 3bac8fc..dcf4c55 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -286,7 +286,7 @@ def create_dataset_zarr(fp_config, fp_zarr: str = None): The path to the zarr file to write the dataset to. If not provided, the zarr file will be written to the same directory as the config file with the extension changed to '.zarr'. 
""" - config = Config.load_config(file=fp_config) + config = Config.from_yaml_file(file=fp_config) ds = create_dataset(config=config) From d3c869343b93bdb3067ef5635a1af40781acb903 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 15 Jan 2025 07:53:48 +0000 Subject: [PATCH 75/96] Loop over 'variables' in 'create_dataset' --- mllam_data_prep/create_dataset.py | 27 +++++++---- mllam_data_prep/ops/subsetting.py | 77 ++++++++++++++++--------------- 2 files changed, 57 insertions(+), 47 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index dcf4c55..b700c09 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -16,7 +16,7 @@ from .ops.mapping import map_dims_and_variables from .ops.selection import select_by_kwargs from .ops.statistics import calc_stats -from .ops.subsetting import subset_dataset +from .ops.subsetting import extract_variable # the `extra` field in the config that was added between v0.2.0 and v0.5.0 is # optional, so we can support both v0.2.0 and v0.5.0 @@ -132,7 +132,7 @@ def create_dataset(config: Config): for dataset_name, input_config in config.inputs.items(): path = input_config.path - variables = input_config.variables + selected_variables = input_config.variables derived_variables = input_config.derived_variables target_output_var = input_config.target_output_variable expected_input_attributes = input_config.attributes @@ -152,13 +152,22 @@ def create_dataset(config: Config): for dim in ds_input.dims: ds = ds.assign_coords({dim: ds_input.coords[dim]}) - if variables: - logger.info(f"Subsetting dataset {dataset_name}") - ds = subset_dataset( - ds_subset=ds, - ds_input=ds_input, - variables=variables, - ) + if selected_variables: + logger.info(f"Extracting selected variables from dataset {dataset_name}") + if isinstance(selected_variables, dict): + for var_name, coords_to_sample in selected_variables.items(): + ds[var_name] = extract_variable( + ds=ds_input, + 
var_name=var_name, + coords_to_sample=coords_to_sample, + ) + elif isinstance(selected_variables, list): + for var_name in selected_variables: + ds[var_name] = extract_variable(ds=ds_input, var_name=var_name) + else: + raise ValueError( + "The `variables` argument should be a list or a dictionary" + ) if derived_variables: logger.info(f"Deriving variables from {dataset_name}") diff --git a/mllam_data_prep/ops/subsetting.py b/mllam_data_prep/ops/subsetting.py index c841533..abdd59a 100644 --- a/mllam_data_prep/ops/subsetting.py +++ b/mllam_data_prep/ops/subsetting.py @@ -1,49 +1,50 @@ -def subset_dataset(ds_subset, ds_input, variables): +def extract_variable(ds, var_name, coords_to_sample=dict()): """ - Select specific variables from the provided the dataset, subset the - variables along the specified coordinates and check coordinate units + Extract specified variable from the provided the input dataset. If + coordinates for subsetting are defines, then subset the variable along + them and check coordinate units Parameters ---------- - ds_subset : xr.Dataset - Subset of ds_input - ds_input : xr.Dataset - Input/source dataset - variables : dict - Dictionary with the variables to subset - with keys as the variable names and values with entries for each - coordinate and coordinate values to extract + ds : xr.Dataset + Input dataset + var_name : Union[Dict, List] + Either a list or dictionary with variables to extract. 
+ If a dictionary the keys are the variable name and the values are + entries for each coordinate and coordinate values to extract + coords_to_sample: Dict + Optional argument for subsetting/sampling along the specified + coordinates + + Returns + ---------- + da: xr.DataArray + Extracted variable (subsetted along the specified coordinates) """ - if isinstance(variables, dict): - for var, coords_to_sample in variables.items(): - da = ds_input[var] - for coord, sampling in coords_to_sample.items(): - coord_values = sampling.values - try: - da = da.sel(**{coord: coord_values}) - except KeyError as ex: - raise KeyError( - f"Could not find the all coordinate values `{coord_values}` in " - f"coordinate `{coord}` in the dataset" - ) from ex - expected_units = sampling.units - coord_units = da[coord].attrs.get("units", None) - if coord_units is not None and coord_units != expected_units: - raise ValueError( - f"Expected units {expected_units} for coordinate {coord}" - f" in variable {var} but got {coord_units}" - ) - ds_subset[var] = da - elif isinstance(variables, list): + try: + da = ds[var_name] + except KeyError as ex: + raise KeyError( + f"Could not find the variable `{var_name}` in the dataset. " + f"The available variables are {list(ds.data_vars)}" + ) from ex + + for coord, sampling in coords_to_sample.items(): + coord_values = sampling.values try: - ds_subset = ds_input[variables] + da = da.sel(**{coord: coord_values}) except KeyError as ex: raise KeyError( - f"Could not find the all variables `{variables}` in the dataset. 
" - f"The available variables are {list(ds_input.data_vars)}" + f"Could not find the all coordinate values `{coord_values}` in " + f"coordinate `{coord}` in the dataset" ) from ex - else: - raise ValueError("The `variables` argument should be a list or a dictionary") + expected_units = sampling.units + coord_units = da[coord].attrs.get("units", None) + if coord_units is not None and coord_units != expected_units: + raise ValueError( + f"Expected units {expected_units} for coordinate {coord}" + f" in variable {var_name} but got {coord_units}" + ) - return ds_subset + return da From 0fc31bf765d13032d184b676b5bb28220fdb088c Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 15 Jan 2025 11:23:33 +0000 Subject: [PATCH 76/96] Update file structure --- mllam_data_prep/create_dataset.py | 4 ++-- mllam_data_prep/ops/derive_variable/__init__.py | 2 +- .../ops/derive_variable/{dispatcher.py => main.py} | 2 +- mllam_data_prep/ops/derive_variable/time_components.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) rename mllam_data_prep/ops/derive_variable/{dispatcher.py => main.py} (99%) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index b700c09..65af312 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -11,7 +11,7 @@ from . 
import __version__ from .config import Config, InvalidConfigException from .ops.chunking import chunk_dataset -from .ops.derive_variable.dispatcher import derive_variables +from .ops.derive_variable import derive_variable from .ops.loading import load_input_dataset from .ops.mapping import map_dims_and_variables from .ops.selection import select_by_kwargs @@ -171,7 +171,7 @@ def create_dataset(config: Config): if derived_variables: logger.info(f"Deriving variables from {dataset_name}") - ds = derive_variables( + ds = derive_variable( ds=ds, ds_input=ds_input, derived_variables=derived_variables, diff --git a/mllam_data_prep/ops/derive_variable/__init__.py b/mllam_data_prep/ops/derive_variable/__init__.py index 29e9c65..f097efa 100644 --- a/mllam_data_prep/ops/derive_variable/__init__.py +++ b/mllam_data_prep/ops/derive_variable/__init__.py @@ -1 +1 @@ -from . import dispatcher, physical_field, time_components +from .main import derive_variable diff --git a/mllam_data_prep/ops/derive_variable/dispatcher.py b/mllam_data_prep/ops/derive_variable/main.py similarity index 99% rename from mllam_data_prep/ops/derive_variable/dispatcher.py rename to mllam_data_prep/ops/derive_variable/main.py index 39a3a71..b34d940 100644 --- a/mllam_data_prep/ops/derive_variable/dispatcher.py +++ b/mllam_data_prep/ops/derive_variable/main.py @@ -18,7 +18,7 @@ REQUIRED_FIELD_ATTRIBUTES = ["units", "long_name"] -def derive_variables(ds, ds_input, derived_variables, chunking): +def derive_variable(ds, ds_input, derived_variables, chunking): """ Load the dataset, and derive the specified variables diff --git a/mllam_data_prep/ops/derive_variable/time_components.py b/mllam_data_prep/ops/derive_variable/time_components.py index b99ab51..b6a23fc 100644 --- a/mllam_data_prep/ops/derive_variable/time_components.py +++ b/mllam_data_prep/ops/derive_variable/time_components.py @@ -7,7 +7,7 @@ import xarray as xr from loguru import logger -from .dispatcher import cyclic_encoding +from .main import 
cyclic_encoding def calculate_hour_of_day(time): From 6a7a1e355af2ba395e40b2c59bca830d0ed12d36 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 15 Jan 2025 11:38:38 +0000 Subject: [PATCH 77/96] Add comment as to why chunking of coordinates is needed --- mllam_data_prep/ops/derive_variable/main.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mllam_data_prep/ops/derive_variable/main.py b/mllam_data_prep/ops/derive_variable/main.py index b34d940..9be0f7b 100644 --- a/mllam_data_prep/ops/derive_variable/main.py +++ b/mllam_data_prep/ops/derive_variable/main.py @@ -62,6 +62,9 @@ def derive_variable(ds, ds_input, derived_variables, chunking): # Get subset of input dataset for calculating derived variables ds_subset = ds_input[required_kwargs.keys()] + # Chunking is needed for coordinates used to derive a variable since they are + # not lazily loaded, as otherwise one might run into memory issues if using a + # large dataset as input. # Any coordinates needed for the derivation, for which chunking should be performed, # should be converted to variables since it is not possible for *indexed* coordinates # to be chunked dask arrays From 92ad379e2889fa59108043036389310aa3def049 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 15 Jan 2025 11:52:56 +0000 Subject: [PATCH 78/96] Loop over 'derived_variables' in 'create_dataset' --- mllam_data_prep/create_dataset.py | 12 +- mllam_data_prep/ops/derive_variable/main.py | 151 +++++++++----------- 2 files changed, 73 insertions(+), 90 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 65af312..2315c96 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -171,12 +171,12 @@ def create_dataset(config: Config): if derived_variables: logger.info(f"Deriving variables from {dataset_name}") - ds = derive_variable( - ds=ds, - ds_input=ds_input, - derived_variables=derived_variables, - chunking=chunking_config, - ) + for var_name, 
derived_variable in derived_variables.items(): + ds[var_name] = derive_variable( + ds=ds_input, + derived_variable=derived_variable, + chunking=chunking_config, + ) _check_dataset_attributes( ds=ds, diff --git a/mllam_data_prep/ops/derive_variable/main.py b/mllam_data_prep/ops/derive_variable/main.py index 9be0f7b..371ef25 100644 --- a/mllam_data_prep/ops/derive_variable/main.py +++ b/mllam_data_prep/ops/derive_variable/main.py @@ -18,17 +18,15 @@ REQUIRED_FIELD_ATTRIBUTES = ["units", "long_name"] -def derive_variable(ds, ds_input, derived_variables, chunking): +def derive_variable(ds, derived_variable, chunking): """ Load the dataset, and derive the specified variables Parameters --------- ds : xr.Dataset - Output dataset - ds_input : xr.Dataset - Input/source dataset - derived_variables : Dict[str, DerivedVariable] + Input dataset + derived_variable : Dict[str, DerivedVariable] Dictionary with the variables to derive with keys as the variable names and values with entries for kwargs and function to use in the calculation @@ -42,89 +40,74 @@ def derive_variable(ds, ds_input, derived_variables, chunking): Dataset with derived variables included """ - target_dims = list(ds_input.sizes.keys()) + target_dims = list(ds.sizes.keys()) - for _, derived_variable in derived_variables.items(): - required_kwargs = derived_variable.kwargs - function_namespace = derived_variable.function - expected_field_attributes = derived_variable.attrs + required_kwargs = derived_variable.kwargs + function_namespace = derived_variable.function + expected_field_attributes = derived_variable.attrs - # Separate the lat,lon from the required variables as these will be derived separately - logger.warning( - "Assuming that the lat/lon coordinates are given as variables called" - " 'lat' and 'lon'." + # Separate the lat,lon from the required variables as these will be derived separately + logger.warning( + "Assuming that the lat/lon coordinates are given as variables called" + " 'lat' and 'lon'." 
+ ) + latlon_coords_to_include = {} + for key in list(required_kwargs.keys()): + if key in ["lat", "lon"]: + latlon_coords_to_include[key] = required_kwargs.pop(key) + + # Get subset of input dataset for calculating derived variables + ds_subset = ds[required_kwargs.keys()] + + # Chunking is needed for coordinates used to derive a variable since they are + # not lazily loaded, as otherwise one might run into memory issues if using a + # large dataset as input. + # Any coordinates needed for the derivation, for which chunking should be performed, + # should be converted to variables since it is not possible for *indexed* coordinates + # to be chunked dask arrays + chunks = { + dim: chunking.get(dim, int(ds_subset[dim].count())) for dim in ds_subset.dims + } + required_coordinates = [ + req_var for req_var in required_kwargs.keys() if req_var in ds_subset.coords + ] + ds_subset = ds_subset.drop_indexes(required_coordinates, errors="ignore") + for req_coord in required_coordinates: + if req_coord in chunks: + ds_subset = ds_subset.reset_coords(req_coord) + + # Chunk the dataset + ds_subset = chunk_dataset(ds_subset, chunks) + + # Add function arguments to kwargs + kwargs = {} + if len(latlon_coords_to_include): + latlon = get_latlon_coords_for_input(ds) + for key, val in latlon_coords_to_include.items(): + kwargs[val] = latlon[key] + kwargs.update({val: ds_subset[key] for key, val in required_kwargs.items()}) + func = _get_derived_variable_function(function_namespace) + # Calculate the derived variable + derived_field = func(**kwargs) + + # Check that the derived field has the necessary attributes (REQUIRED_FIELD_ATTRIBUTES) + # set, return any dropped/reset coordinates, align it to the output dataset dimensions + # (if necessary) and add it to the dataset + if isinstance(derived_field, xr.DataArray): + derived_field = _check_for_required_attributes( + derived_field, expected_field_attributes + ) + derived_field = _return_dropped_coordinates( + derived_field, 
ds_subset, required_coordinates, chunks + ) + derived_field = _align_derived_variable(derived_field, ds, target_dims) + else: + raise TypeError( + "Expected an instance of xr.DataArray or tuple(xr.DataArray)," + f" but got {type(derived_field)}." ) - latlon_coords_to_include = {} - for key in list(required_kwargs.keys()): - if key in ["lat", "lon"]: - latlon_coords_to_include[key] = required_kwargs.pop(key) - - # Get subset of input dataset for calculating derived variables - ds_subset = ds_input[required_kwargs.keys()] - - # Chunking is needed for coordinates used to derive a variable since they are - # not lazily loaded, as otherwise one might run into memory issues if using a - # large dataset as input. - # Any coordinates needed for the derivation, for which chunking should be performed, - # should be converted to variables since it is not possible for *indexed* coordinates - # to be chunked dask arrays - chunks = { - dim: chunking.get(dim, int(ds_subset[dim].count())) - for dim in ds_subset.dims - } - required_coordinates = [ - req_var for req_var in required_kwargs.keys() if req_var in ds_subset.coords - ] - ds_subset = ds_subset.drop_indexes(required_coordinates, errors="ignore") - for req_coord in required_coordinates: - if req_coord in chunks: - ds_subset = ds_subset.reset_coords(req_coord) - - # Chunk the dataset - ds_subset = chunk_dataset(ds_subset, chunks) - - # Add function arguments to kwargs - kwargs = {} - if len(latlon_coords_to_include): - latlon = get_latlon_coords_for_input(ds_input) - for key, val in latlon_coords_to_include.items(): - kwargs[val] = latlon[key] - kwargs.update({val: ds_subset[key] for key, val in required_kwargs.items()}) - func = _get_derived_variable_function(function_namespace) - # Calculate the derived variable - derived_field = func(**kwargs) - - # Check that the derived field has the necessary attributes (REQUIRED_FIELD_ATTRIBUTES) - # set, return any dropped/reset coordinates, align it to the output dataset dimensions - 
# (if necessary) and add it to the dataset - if isinstance(derived_field, xr.DataArray): - derived_field = _check_for_required_attributes( - derived_field, expected_field_attributes - ) - derived_field = _return_dropped_coordinates( - derived_field, ds_subset, required_coordinates, chunks - ) - derived_field = _align_derived_variable( - derived_field, ds_input, target_dims - ) - ds[derived_field.name] = derived_field - elif isinstance(derived_field, tuple) and all( - isinstance(field, xr.DataArray) for field in derived_field - ): - for field in derived_field: - field = _check_for_required_attributes(field, expected_field_attributes) - field = _return_dropped_coordinates( - field, ds_subset, required_coordinates, chunks - ) - field = _align_derived_variable(field, ds_input, target_dims) - ds[field.name] = field - else: - raise TypeError( - "Expected an instance of xr.DataArray or tuple(xr.DataArray)," - f" but got {type(derived_field)}." - ) - return ds + return derived_field def _get_derived_variable_function(function_namespace): From d95c031864889c0e11c59063e19c7d98269a44c6 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 15 Jan 2025 12:49:53 +0000 Subject: [PATCH 79/96] Add 'extra_args' to 'derived_variables' to allow functions to have arguments that are not data extracted from the input dataset. Update the 'hour_of_day' variable so that it is now specified in the config file which cyclically encoded component is to be derived (sin or cos) and make 'calculate_hour_of_day' only return one component, based on the extra_kwargs 'component' supplied. 
--- example.danra.yaml | 10 ++++- mllam_data_prep/config.py | 20 ++++++---- mllam_data_prep/ops/derive_variable/main.py | 20 +++++++--- .../ops/derive_variable/time_components.py | 39 +++++++++---------- 4 files changed, 54 insertions(+), 35 deletions(-) diff --git a/example.danra.yaml b/example.danra.yaml index 59f207c..b8536e4 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -69,9 +69,17 @@ inputs: lat: lat lon: lon function: mllam_data_prep.ops.derive_variable.physical_field.calculate_toa_radiation - hour_of_day: + hour_of_day_sin: kwargs: time: time + extra_kwargs: + component: sin + function: mllam_data_prep.ops.derive_variable.time_components.calculate_hour_of_day + hour_of_day_cos: + kwargs: + time: time + extra_kwargs: + component: cos function: mllam_data_prep.ops.derive_variable.time_components.calculate_hour_of_day dim_mapping: time: diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 702dff0..9735b9b 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -99,22 +99,28 @@ class ValueSelection: @dataclass class DerivedVariable: """ - Defines a derived variables, where the kwargs (variables required - for the calculation) and the function (for calculating the variable) - are specified. Optionally, in case a function does not return an - `xr.DataArray` with the required attributes (`units` and `long_name`) set, - these should be specified in `attrs`, e.g. - {"attrs": "units": "W*m**-2, "long_name": "top-of-the-atmosphere radiation"}. + Defines a derived variables, where the kwargs (variables required for the + calculation, to be extracted from the input dataset) and the function (for + calculating the variable) are specified. Also, if the function has other arguments + which should not be extracted from the dataset (e.g. a string to indicate if the + sine or cosine component should be extracted) these can be specified in the extra_kwargs. 
+ Optionally, in case a function does not return an `xr.DataArray` with the required + attributes (`units` and `long_name`) set, these should be specified in `attrs`, e.g.: + {"attrs": "units": "W*m**-2, "long_name": "top-of-the-atmosphere radiation"}. Additional attributes can also be set if desired. Attributes: - kwargs: Variables required for calculating the derived variable. + kwargs: Variables required for calculating the derived variable, to be extracted + from the input dataset. function: Function used to calculate the derived variable. + extra_kwargs: Extra arguments for `function` that should not be extracted from + the input dataset, such as a string. attrs: Attributes (e.g. `units` and `long_name`) to set for the derived variable. """ kwargs: Dict[str, str] function: str + extra_kwargs: Optional[Dict[str, str]] = field(default_factory=dict) attrs: Optional[Dict[str, str]] = field(default_factory=dict) diff --git a/mllam_data_prep/ops/derive_variable/main.py b/mllam_data_prep/ops/derive_variable/main.py index 371ef25..faf7520 100644 --- a/mllam_data_prep/ops/derive_variable/main.py +++ b/mllam_data_prep/ops/derive_variable/main.py @@ -42,7 +42,8 @@ def derive_variable(ds, derived_variable, chunking): target_dims = list(ds.sizes.keys()) - required_kwargs = derived_variable.kwargs + ds_kwargs = derived_variable.kwargs + extra_kwargs = derived_variable.extra_kwargs function_namespace = derived_variable.function expected_field_attributes = derived_variable.attrs @@ -52,12 +53,12 @@ def derive_variable(ds, derived_variable, chunking): " 'lat' and 'lon'." 
) latlon_coords_to_include = {} - for key in list(required_kwargs.keys()): + for key in list(ds_kwargs.keys()): if key in ["lat", "lon"]: - latlon_coords_to_include[key] = required_kwargs.pop(key) + latlon_coords_to_include[key] = ds_kwargs.pop(key) # Get subset of input dataset for calculating derived variables - ds_subset = ds[required_kwargs.keys()] + ds_subset = ds[ds_kwargs.keys()] # Chunking is needed for coordinates used to derive a variable since they are # not lazily loaded, as otherwise one might run into memory issues if using a @@ -69,7 +70,7 @@ def derive_variable(ds, derived_variable, chunking): dim: chunking.get(dim, int(ds_subset[dim].count())) for dim in ds_subset.dims } required_coordinates = [ - req_var for req_var in required_kwargs.keys() if req_var in ds_subset.coords + ds_var for ds_var in ds_kwargs.keys() if ds_var in ds_subset.coords ] ds_subset = ds_subset.drop_indexes(required_coordinates, errors="ignore") for req_coord in required_coordinates: @@ -81,12 +82,19 @@ def derive_variable(ds, derived_variable, chunking): # Add function arguments to kwargs kwargs = {} + # - Add lat, and lon, if used as arguments if len(latlon_coords_to_include): latlon = get_latlon_coords_for_input(ds) for key, val in latlon_coords_to_include.items(): kwargs[val] = latlon[key] - kwargs.update({val: ds_subset[key] for key, val in required_kwargs.items()}) + # Add variables extracted from the input dataset + kwargs.update({val: ds_subset[key] for key, val in ds_kwargs.items()}) + # Add extra arguments + kwargs.update(extra_kwargs) + + # Get the function func = _get_derived_variable_function(function_namespace) + # Calculate the derived variable derived_field = func(**kwargs) diff --git a/mllam_data_prep/ops/derive_variable/time_components.py b/mllam_data_prep/ops/derive_variable/time_components.py index b6a23fc..7b6a7d3 100644 --- a/mllam_data_prep/ops/derive_variable/time_components.py +++ b/mllam_data_prep/ops/derive_variable/time_components.py @@ -4,27 +4,29 
@@ """ import datetime +import numpy as np import xarray as xr from loguru import logger from .main import cyclic_encoding -def calculate_hour_of_day(time): +def calculate_hour_of_day(time, component): """ Function for calculating hour of day features with a cyclic encoding Parameters ---------- - time : Union[xr.DataArray, datetime.datetime] + time: Union[xr.DataArray, datetime.datetime] Time + component: str + String indicating if the sine or cosine component of the encoding + should be returned Returns ------- - hour_of_day_cos: Union[xr.DataArray, float] - cosine of the hour of day - hour_of_day_sin: Union[xr.DataArray, float] - sine of the hour of day + hour_of_day_encoded: Union[xr.DataArray, float] + sine or cosine of the hour of day """ logger.info("Calculating hour of day") @@ -40,25 +42,20 @@ def calculate_hour_of_day(time): ) # Cyclic encoding of hour of day - hour_of_day_cos, hour_of_day_sin = cyclic_encoding(hour_of_day, 24) - - if isinstance(hour_of_day_cos, xr.DataArray): - # Add attributes - hour_of_day_cos.name = "hour_of_day_cos" - hour_of_day_cos.attrs[ - "long_name" - ] = "Cosine component of cyclically encoded hour of day" - hour_of_day_cos.attrs["units"] = "1" + if component == "sin": + hour_of_day_encoded = np.sin((hour_of_day / 24) * 2 * np.pi) + elif component == "cos": + hour_of_day_encoded = np.cos((hour_of_day / 24) * 2 * np.pi) - if isinstance(hour_of_day_sin, xr.DataArray): + if isinstance(hour_of_day_encoded, xr.DataArray): # Add attributes - hour_of_day_sin.name = "hour_of_day_sin" - hour_of_day_sin.attrs[ + hour_of_day_encoded.name = "hour_of_day_" + component + hour_of_day_encoded.attrs[ "long_name" - ] = "Sine component of cyclically encoded hour of day" - hour_of_day_sin.attrs["units"] = "1" + ] = f"{component.capitalize()} component of cyclically encoded hour of day" + hour_of_day_encoded.attrs["units"] = "1" - return hour_of_day_cos, hour_of_day_sin + return hour_of_day_encoded def calculate_day_of_year(time): From 
e158a6c5db4d33287b1773b3f8fb8bbc9a63a847 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 15 Jan 2025 12:57:43 +0000 Subject: [PATCH 80/96] Update 'calculate_day_of_year' to only return one component (sin or cos) --- mllam_data_prep/ops/derive_variable/main.py | 26 ------------- .../ops/derive_variable/time_components.py | 38 ++++++++----------- 2 files changed, 16 insertions(+), 48 deletions(-) diff --git a/mllam_data_prep/ops/derive_variable/main.py b/mllam_data_prep/ops/derive_variable/main.py index faf7520..9b54227 100644 --- a/mllam_data_prep/ops/derive_variable/main.py +++ b/mllam_data_prep/ops/derive_variable/main.py @@ -9,7 +9,6 @@ import importlib import sys -import numpy as np import xarray as xr from loguru import logger @@ -263,31 +262,6 @@ def _align_derived_variable(field, ds, target_dims): return field -def cyclic_encoding(data, data_max): - """ - Cyclic encoding of data - - Parameters - ---------- - data : Union[xr.DataArray, float, int] - Data that should be cyclically encoded - data_max: Union[int, float] - Maximum possible value of input data. Should be greater than 0. 
- - Returns - ------- - data_cos: Union[xr.DataArray, float, int] - Cosine part of cyclically encoded input data - data_sin: Union[xr.DataArray, float, int] - Sine part of cyclically encoded input data - """ - - data_sin = np.sin((data / data_max) * 2 * np.pi) - data_cos = np.cos((data / data_max) * 2 * np.pi) - - return data_cos, data_sin - - def get_latlon_coords_for_input(ds): """Dummy function for getting lat and lon.""" return ds[["lat", "lon"]].chunk(-1, -1) diff --git a/mllam_data_prep/ops/derive_variable/time_components.py b/mllam_data_prep/ops/derive_variable/time_components.py index 7b6a7d3..90c547d 100644 --- a/mllam_data_prep/ops/derive_variable/time_components.py +++ b/mllam_data_prep/ops/derive_variable/time_components.py @@ -8,8 +8,6 @@ import xarray as xr from loguru import logger -from .main import cyclic_encoding - def calculate_hour_of_day(time, component): """ @@ -58,7 +56,7 @@ def calculate_hour_of_day(time, component): return hour_of_day_encoded -def calculate_day_of_year(time): +def calculate_day_of_year(time, component): """ Function for calculating day of year features with a cyclic encoding @@ -66,13 +64,14 @@ def calculate_day_of_year(time): ---------- time : Union[xr.DataArray, datetime.datetime] Time + component: str + String indicating if the sine or cosine component of the encoding + should be returned Returns ------- - day_of_year_cos: Union[xr.DataArray, float] - cosine of the day of year - day_of_year_sin: Union[xr.DataArray, float] - sine of the day of year + day_of_year_encoded: Union[xr.DataArray, float] + sine or cosine of the day of year """ logger.info("Calculating day of year") @@ -88,22 +87,17 @@ def calculate_day_of_year(time): ) # Cyclic encoding of day of year - use 366 to include leap years! 
- day_of_year_cos, day_of_year_sin = cyclic_encoding(day_of_year, 366) - - if isinstance(day_of_year_cos, xr.DataArray): - # Add attributes - day_of_year_cos.name = "day_of_year_cos" - day_of_year_cos.attrs[ - "long_name" - ] = "Cosine component of cyclically encoded day of year" - day_of_year_cos.attrs["units"] = "1" + if component == "sin": + day_of_year_encoded = np.sin((day_of_year / 366) * 2 * np.pi) + elif component == "cos": + day_of_year_encoded = np.cos((day_of_year / 366) * 2 * np.pi) - if isinstance(day_of_year_sin, xr.DataArray): + if isinstance(day_of_year_encoded, xr.DataArray): # Add attributes - day_of_year_sin.name = "day_of_year_sin" - day_of_year_sin.attrs[ + day_of_year_encoded.name = "day_of_year_" + component + day_of_year_encoded.attrs[ "long_name" - ] = "Sine component of cyclically encoded day of year" - day_of_year_sin.attrs["units"] = "1" + ] = f"{component.capitalize()} component of cyclically encoded day of year" + day_of_year_encoded.attrs["units"] = "1" - return day_of_year_cos, day_of_year_sin + return day_of_year_encoded From ff9acc7e671e09d9da1b0023698aca52afcd68a7 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 15 Jan 2025 14:08:58 +0000 Subject: [PATCH 81/96] Do not modify the arguments in the function for checking (and now getting) attrs for derived variables --- mllam_data_prep/ops/derive_variable/main.py | 29 ++++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/mllam_data_prep/ops/derive_variable/main.py b/mllam_data_prep/ops/derive_variable/main.py index 9b54227..e1f4cbc 100644 --- a/mllam_data_prep/ops/derive_variable/main.py +++ b/mllam_data_prep/ops/derive_variable/main.py @@ -97,16 +97,20 @@ def derive_variable(ds, derived_variable, chunking): # Calculate the derived variable derived_field = func(**kwargs) - # Check that the derived field has the necessary attributes (REQUIRED_FIELD_ATTRIBUTES) - # set, return any dropped/reset coordinates, align it to the output dataset dimensions - # 
(if necessary) and add it to the dataset if isinstance(derived_field, xr.DataArray): - derived_field = _check_for_required_attributes( + # Check that the derived field has the necessary attributes + # (REQUIRED_FIELD_ATTRIBUTES) set, and set them if not + derived_field_attrs = _check_and_get_required_attributes( derived_field, expected_field_attributes ) + derived_field.attrs.update(derived_field_attrs) + + # Return any dropped/reset coordinates derived_field = _return_dropped_coordinates( derived_field, ds_subset, required_coordinates, chunks ) + + # Align the derived field to the output dataset dimensions (if necessary) derived_field = _align_derived_variable(derived_field, ds, target_dims) else: raise TypeError( @@ -147,9 +151,12 @@ def _get_derived_variable_function(function_namespace): return function -def _check_for_required_attributes(field, expected_attributes): +def _check_and_get_required_attributes(field, expected_attributes): """ - Check the attributes of the derived variable. + Check if the required attributes of the derived variable are set. + If not set, get them from the config. + If set and defined in the config, get the attributes from the config + and use them for overwriting the attributes defined in the function. Parameters ---------- @@ -164,10 +171,12 @@ def _check_for_required_attributes(field, expected_attributes): field: xr.DataArray The derived field """ + + attrs = {} for attribute in REQUIRED_FIELD_ATTRIBUTES: if attribute not in field.attrs or field.attrs[attribute] is None: if attribute in expected_attributes.keys(): - field.attrs[attribute] = expected_attributes[attribute] + attrs[attribute] = expected_attributes[attribute] else: # The expected attributes are empty and the attributes have not been # set during the calculation of the derived variable @@ -191,12 +200,12 @@ def _check_for_required_attributes(field, expected_attributes): f" '{expected_attributes[attribute]}' according" " to the specification in the config file." 
) - field.attrs[attribute] = expected_attributes[attribute] + attrs[attribute] = expected_attributes[attribute] else: # Attributes are set in the funciton and nothing has been defined in the config file - pass + attrs[attribute] = field.attrs[attribute] - return field + return attrs def _return_dropped_coordinates(derived_field, ds, required_coordinates, chunks): From dc3f2000c8c05e3bcbccf66e099661d72d03b1e4 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 16 Jan 2025 08:18:32 +0000 Subject: [PATCH 82/96] Update tests for functions for deriving toa_radiation and time components --- .../ops/derive_variable/__init__.py | 2 + tests/test_derived_variables.py | 88 ++++++++----------- 2 files changed, 40 insertions(+), 50 deletions(-) diff --git a/mllam_data_prep/ops/derive_variable/__init__.py b/mllam_data_prep/ops/derive_variable/__init__.py index f097efa..cc455e7 100644 --- a/mllam_data_prep/ops/derive_variable/__init__.py +++ b/mllam_data_prep/ops/derive_variable/__init__.py @@ -1 +1,3 @@ from .main import derive_variable +from .physical_field import calculate_toa_radiation +from .time_components import calculate_day_of_year, calculate_hour_of_day diff --git a/tests/test_derived_variables.py b/tests/test_derived_variables.py index f0ae5bc..0fd6108 100644 --- a/tests/test_derived_variables.py +++ b/tests/test_derived_variables.py @@ -1,6 +1,4 @@ import datetime -import random -from unittest.mock import patch import isodate import numpy as np @@ -49,24 +47,6 @@ ] -def mock_cyclic_encoding(data, data_max): - """Mock the `cyclic_encoding` function from mllam_data_prep.ops.derived_variables.""" - if isinstance(data, xr.DataArray): - data_cos = xr.DataArray( - random.uniform(-1, 1), - coords=data.coords, - dims=data.dims, - ) - data_sin = xr.DataArray( - random.uniform(-1, 1), - coords=data.coords, - dims=data.dims, - ) - return data_cos, data_sin - elif isinstance(data, (float, int)): - return random.uniform(-1, 1), random.uniform(-1, 1) - - 
@pytest.mark.parametrize("lat", LATITUDE) @pytest.mark.parametrize("lon", LONGITUDE) @pytest.mark.parametrize("time", TIME) @@ -74,48 +54,56 @@ def test_toa_radiation(lat, lon, time): """ Test the `calculate_toa_radiation` function from mllam_data_prep.derived_variables """ - with patch( - "mllam_data_prep.ops.derived_variables.cyclic_encoding", - side_effect=mock_cyclic_encoding, - ): - if isinstance(time, (xr.DataArray, datetime.datetime)): + if isinstance(time, (xr.DataArray, datetime.datetime)): + mdp.ops.derive_variable.physical_field.calculate_toa_radiation(lat, lon, time) + else: + with pytest.raises(TypeError): mdp.ops.derive_variable.physical_field.calculate_toa_radiation( lat, lon, time ) - else: - with pytest.raises(TypeError): - mdp.ops.derive_variable.physical_field.calculate_toa_radiation( - lat, lon, time - ) @pytest.mark.parametrize("time", TIME) -def test_hour_of_day(time): +@pytest.mark.parametrize( + "component", + [ + "cos", + "sin", + ], +) +def test_hour_of_day(time, component): """ Test the `calculate_hour_of_day` function from mllam_data_prep.derived_variables """ - with patch( - "mllam_data_prep.ops.derived_variables.cyclic_encoding", - side_effect=mock_cyclic_encoding, - ): - if isinstance(time, (xr.DataArray, datetime.datetime)): - mdp.ops.derive_variable.time_components.calculate_hour_of_day(time) - else: - with pytest.raises(TypeError): - mdp.ops.derive_variable.time_components.calculate_hour_of_day(time) + if isinstance(time, (xr.DataArray, datetime.datetime)): + mdp.ops.derive_variable.time_components.calculate_hour_of_day( + time, component=component + ) + else: + with pytest.raises(TypeError): + mdp.ops.derive_variable.time_components.calculate_hour_of_day( + time, component=component + ) @pytest.mark.parametrize("time", TIME) -def test_day_of_year(time): +@pytest.mark.parametrize( + "component", + [ + "cos", + "sin", + ], +) +def test_day_of_year(time, component): """ Test the `calculate_day_of_year` function from 
mllam_data_prep.derived_variables """ - with patch( - "mllam_data_prep.ops.derived_variables.cyclic_encoding", - side_effect=mock_cyclic_encoding, - ): - if isinstance(time, (xr.DataArray, datetime.datetime)): - mdp.ops.derive_variable.time_components.calculate_day_of_year(time) - else: - with pytest.raises(TypeError): - mdp.ops.derive_variable.time_components.calculate_day_of_year(time) + if isinstance(time, (xr.DataArray, datetime.datetime)): + mdp.ops.derive_variable.time_components.calculate_day_of_year( + time, component=component + ) + else: + with pytest.raises(TypeError): + mdp.ops.derive_variable.time_components.calculate_day_of_year( + time, component=component + ) From 909353485303973976ebc1aacb65c38a0096f5f0 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 16 Jan 2025 08:36:19 +0000 Subject: [PATCH 83/96] Update the config version to v0.6.0 --- example.danra.yaml | 2 +- mllam_data_prep/create_dataset.py | 8 +- .../v0.5.0/example.danra.yaml | 99 +++++++++++++++++++ 3 files changed, 105 insertions(+), 4 deletions(-) create mode 100644 tests/old_config_schema_examples/v0.5.0/example.danra.yaml diff --git a/example.danra.yaml b/example.danra.yaml index b8536e4..5101005 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -1,4 +1,4 @@ -schema_version: v0.5.0 +schema_version: v0.6.0 dataset_version: v0.1.0 output: diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 2315c96..c123e0c 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -18,9 +18,11 @@ from .ops.statistics import calc_stats from .ops.subsetting import extract_variable -# the `extra` field in the config that was added between v0.2.0 and v0.5.0 is -# optional, so we can support both v0.2.0 and v0.5.0 -SUPPORTED_CONFIG_VERSIONS = ["v0.2.0", "v0.5.0"] +# The config versions defined in SUPPORTED_CONFIG_VERSIONS are the ones currently supported. 
+# The `extra` field in the config that was added between v0.2.0 and v0.5.0 is optional, and +# the `derived_variables` field in the config added in v0.6.0 is also optional, so we can +# support v0.2.0, v0.5.0, and v0.6.0 +SUPPORTED_CONFIG_VERSIONS = ["v0.2.0", "v0.5.0", "v0.6.0"] def _check_dataset_attributes(ds, expected_attributes, dataset_name): diff --git a/tests/old_config_schema_examples/v0.5.0/example.danra.yaml b/tests/old_config_schema_examples/v0.5.0/example.danra.yaml new file mode 100644 index 0000000..3edf126 --- /dev/null +++ b/tests/old_config_schema_examples/v0.5.0/example.danra.yaml @@ -0,0 +1,99 @@ +schema_version: v0.5.0 +dataset_version: v0.1.0 + +output: + variables: + static: [grid_index, static_feature] + state: [time, grid_index, state_feature] + forcing: [time, grid_index, forcing_feature] + coord_ranges: + time: + start: 1990-09-03T00:00 + end: 1990-09-09T00:00 + step: PT3H + chunking: + time: 1 + splitting: + dim: time + splits: + train: + start: 1990-09-03T00:00 + end: 1990-09-06T00:00 + compute_statistics: + ops: [mean, std, diff_mean, diff_std] + dims: [grid_index, time] + val: + start: 1990-09-06T00:00 + end: 1990-09-07T00:00 + test: + start: 1990-09-07T00:00 + end: 1990-09-09T00:00 + +inputs: + danra_height_levels: + path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/height_levels.zarr + dims: [time, x, y, altitude] + variables: + u: + altitude: + values: [100,] + units: m + v: + altitude: + values: [100, ] + units: m + dim_mapping: + time: + method: rename + dim: time + state_feature: + method: stack_variables_by_var_name + dims: [altitude] + name_format: "{var_name}{altitude}m" + grid_index: + method: stack + dims: [x, y] + target_output_variable: state + + danra_surface: + path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr + dims: [time, x, y] + variables: + # use surface incoming shortwave radiation as forcing + - swavr0m + dim_mapping: + time: + method: rename + dim: time + grid_index: + method: 
stack + dims: [x, y] + forcing_feature: + method: stack_variables_by_var_name + name_format: "{var_name}" + target_output_variable: forcing + + danra_lsm: + path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/lsm.zarr + dims: [x, y] + variables: + - lsm + dim_mapping: + grid_index: + method: stack + dims: [x, y] + static_feature: + method: stack_variables_by_var_name + name_format: "{var_name}" + target_output_variable: static + +extra: + projection: + class_name: LambertConformal + kwargs: + central_longitude: 25.0 + central_latitude: 56.7 + standard_parallels: [56.7, 56.7] + globe: + semimajor_axis: 6367470.0 + semiminor_axis: 6367470.0 From 233206ca46a36f35f097b40bb3d5fcaec058ce05 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 16 Jan 2025 10:28:17 +0000 Subject: [PATCH 84/96] Raise an error if 'component' is neither 'cos' nor 'sin'. --- mllam_data_prep/ops/derive_variable/time_components.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mllam_data_prep/ops/derive_variable/time_components.py b/mllam_data_prep/ops/derive_variable/time_components.py index 90c547d..5329e12 100644 --- a/mllam_data_prep/ops/derive_variable/time_components.py +++ b/mllam_data_prep/ops/derive_variable/time_components.py @@ -44,6 +44,11 @@ def calculate_hour_of_day(time, component): hour_of_day_encoded = np.sin((hour_of_day / 24) * 2 * np.pi) elif component == "cos": hour_of_day_encoded = np.cos((hour_of_day / 24) * 2 * np.pi) + else: + raise ValueError( + f"Invalid value of `component`: '{component}'. Expected one of: 'cos' or 'sin'." + " Please update the config accordingly." + ) if isinstance(hour_of_day_encoded, xr.DataArray): # Add attributes @@ -91,6 +96,11 @@ def calculate_day_of_year(time, component): day_of_year_encoded = np.sin((day_of_year / 366) * 2 * np.pi) elif component == "cos": day_of_year_encoded = np.cos((day_of_year / 366) * 2 * np.pi) + else: + raise ValueError( + f"Invalid value of `component`: '{component}'. 
Expected one of: 'cos' or 'sin'." + " Please update the config accordingly." + ) if isinstance(day_of_year_encoded, xr.DataArray): # Add attributes From b716c13ed9a77038705ab14ac92947e285304eb9 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 16 Jan 2025 10:46:50 +0000 Subject: [PATCH 85/96] Update docstring and rename variable --- mllam_data_prep/ops/derive_variable/main.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mllam_data_prep/ops/derive_variable/main.py b/mllam_data_prep/ops/derive_variable/main.py index e1f4cbc..027f67b 100644 --- a/mllam_data_prep/ops/derive_variable/main.py +++ b/mllam_data_prep/ops/derive_variable/main.py @@ -208,13 +208,13 @@ def _check_and_get_required_attributes(field, expected_attributes): return attrs -def _return_dropped_coordinates(derived_field, ds, required_coordinates, chunks): +def _return_dropped_coordinates(field, ds, required_coordinates, chunks): """ Return the coordinates that have been dropped/reset. Parameters ---------- - derived_field: xr.Dataset + field: xr.DataArray Derived variable ds: xr.Dataset Dataset with required coordinatwes @@ -226,14 +226,14 @@ def _return_dropped_coordinates(derived_field, ds, required_coordinates, chunks) Returns ------- - derived_field: xr.Dataset + field: xr.DataArray Derived variable, now also with dropped coordinates returned """ for req_coord in required_coordinates: if req_coord in chunks: - derived_field.coords[req_coord] = ds[req_coord] + field.coords[req_coord] = ds[req_coord] - return derived_field + return field def _align_derived_variable(field, ds, target_dims): From 8519da4b5c28667650a3bd176d19a0ba07b380d4 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 17 Jan 2025 10:11:40 +0000 Subject: [PATCH 86/96] Update error message since we now only support xr.DataArrays --- mllam_data_prep/ops/derive_variable/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mllam_data_prep/ops/derive_variable/main.py 
b/mllam_data_prep/ops/derive_variable/main.py index 027f67b..9af736f 100644 --- a/mllam_data_prep/ops/derive_variable/main.py +++ b/mllam_data_prep/ops/derive_variable/main.py @@ -114,8 +114,7 @@ def derive_variable(ds, derived_variable, chunking): derived_field = _align_derived_variable(derived_field, ds, target_dims) else: raise TypeError( - "Expected an instance of xr.DataArray or tuple(xr.DataArray)," - f" but got {type(derived_field)}." + f"Expected an instance of xr.DataArray, but got {type(derived_field)}." ) return derived_field From 79b6e46356dd7ba5b0e585cb5b29d380a1249985 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 17 Jan 2025 11:10:56 +0000 Subject: [PATCH 87/96] Update README --- README.md | 57 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 034aa60..fe19134 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,7 @@ ds = mdp.create_dataset(config=config) A full example configuration file is given in [example.danra.yaml](example.danra.yaml), and reproduced here for completeness: ```yaml -schema_version: v0.5.0 +schema_version: v0.6.0 dataset_version: v0.1.0 output: @@ -182,11 +182,19 @@ inputs: time: time lat: lat lon: lon - function: mllam_data_prep.ops.derived_variables.calculate_toa_radiation - hour_of_day: + function: mllam_data_prep.ops.derive_variable.physical_field.calculate_toa_radiation + hour_of_day_sin: kwargs: time: time - function: mllam_data_prep.ops.derived_variables.calculate_hour_of_day + extra_kwargs: + component: sin + function: mllam_data_prep.ops.derive_variable.time_components.calculate_hour_of_day + hour_of_day_cos: + kwargs: + time: time + extra_kwargs: + component: cos + function: mllam_data_prep.ops.derive_variable.time_components.calculate_hour_of_day dim_mapping: time: method: rename @@ -313,11 +321,19 @@ inputs: time: time lat: lat lon: lon - function: mllam_data_prep.derived_variables.calculate_toa_radiation - 
hour_of_day: + function: mllam_data_prep.derive_variable.physical_field.calculate_toa_radiation + hour_of_day_sin: + kwargs: + time: time + extra_kwargs: + component: sin + function: mllam_data_prep.ops.derive_variable.time_components.calculate_hour_of_day + hour_of_day_cos: kwargs: time: time - function: mllam_data_prep.derived_variables.calculate_hour_of_day + extra_kwargs: + component: cos + function: mllam_data_prep.ops.derive_variable.time_components.calculate_hour_of_day dim_mapping: time: method: rename @@ -343,14 +359,15 @@ The `inputs` section defines the source datasets to extract data from. Each sour - `rename`: simply rename the dimension to the new name - `stack`: stack the listed dimension to create the dimension in the output - `stack_variables_by_var_name`: stack the dimension into the new dimension, and also stack the variable name into the new variable name. This is useful when you have multiple variables with the same dimensions that you want to stack into a single variable. -- `derived_variables`: defines the variables to be derived from the variables available in the source dataset. This should be a dictionary where each key is the variable to be derived and the value defines a dictionary with the following additional information. See the 'Derived Variables' section for more details. - - `function`: the function to be used to derive a variable. This should be a string and may either be the full namespace of the function (e.g. `mllam_data_prep.ops.derived_variables.calculate_toa_radiation`) or in case the function is included in the `mllam_data_prep.ops.derived_variables` module it is enough with the function name only. - - `kwargs`: arguments for the function used to derive a variable. This is a dictionary where each key is the name of the variables to select from the source dataset and each value is the named argument to `function`. +- `derived_variables`: defines the variables to be derived from the variables available in the source dataset. 
This should be a dictionary where each key is the name of the variable to be derived and the value defines a dictionary with the following additional information. See also the 'Derived Variables' section for more details.
+  - `function`: the function used to derive a variable. This should be a string with the full namespace of the function, e.g. `mllam_data_prep.ops.derive_variable.physical_field.calculate_toa_radiation`.
+  - `kwargs`: `function` arguments that should be extracted from the source dataset. This is a dictionary where each key is the name of the variables to select from the source dataset and each value is the named argument to `function`.
+  - `extra_kwargs`: `function` arguments that should not be extracted from the source dataset, such as the extra argument `component` to `mllam_data_prep.ops.derive_variable.time_components.calculate_hour_of_day` which is a string (either "sin" or "cos") that decides if the returned field is the sine or cosine component of the cyclically encoded hour of day variable.
 
 #### Derived Variables
 
 Variables that are not part of the source dataset but can be derived from variables in the source dataset can also be included. They should be defined in their own section, called `derived_variables` as illustrated in the example config above and in the `example.danra.yaml` config file.
 
-To derive the variables, the function to be used to derive the variable (`function`) and the arguments to this function (`kwargs`) need to be specified, as explained above. In addition, an optional section called `attrs` can be added. In this section, the user can add attributes to the derived variable, as illustrated below.
+To derive the variables, the function used to derive the variable (`function`) and the arguments to this function (`kwargs` and `extra_kwargs`) need to be specified, as explained above. In addition, an optional section called `attrs` can be added.
In this section, the user can add attributes to the derived variable, as illustrated below.
 ```yaml
 derived_variables:
   toa_radiation:
     kwargs:
       time: time
       lat: lat
       lon: lon
-    function: mllam_data_prep.derived_variables.calculate_toa_radiation
+    function: mllam_data_prep.ops.derive_variable.physical_field.calculate_toa_radiation
     attrs:
       units: W*m**-2
       long_name: top-of-atmosphere incoming radiation
 ```
 
-Note that the attributes `units` and `long_name` are required. This means that if the function used to derive a variable does not set these attributes they are **required** to be set in the config file. If using a function defined in `mllam_data_prep.ops.derived_variables` the `attrs` section is optional as the attributes should already be defined. In this case, adding the `units` and `long_name` attributes to the `attrs` section of the derived variable in config file will overwrite the already-defined attributes from the function.
+Note that the attributes `units` and `long_name` are required. This means that if the function used to derive a variable does not set these attributes they are **required** to be set in the config file. If using a function defined in `mllam_data_prep.ops.derive_variable` the `attrs` section is optional as the attributes should already be defined. In this case, adding the `units` and `long_name` attributes to the `attrs` section of the derived variable in config file will **overwrite** the already-defined attributes in the function.
Currently, the following derived variables are included as part of `mllam-data-prep`:
 - `toa_radiation`:
   - Top-of-atmosphere incoming radiation
-  - function: `mllam_data_prep.ops.derived_variables.calculate_toa_radiation`
-- `hour_of_day`:
-  - Hour of day (cyclically encoded)
-  - function: `mllam_data_prep.ops.derived_variables.calculate_hour_of_day`
-- `day_of_year`:
-  - Day of year (cyclically encoded)
-  - function: `mllam_data_prep.ops.derived_variables.calculate_day_of_year`
+  - function: `mllam_data_prep.ops.derive_variable.physical_field.calculate_toa_radiation`
+- `hour_of_day_[sin/cos]`:
+  - Sine or cosine part of cyclically encoded hour of day
+  - function: `mllam_data_prep.ops.derive_variable.time_components.calculate_hour_of_day`
+- `day_of_year_[sin/cos]`:
+  - Sine or cosine part of cyclically encoded day of year
+  - function: `mllam_data_prep.ops.derive_variable.time_components.calculate_day_of_year`
 
 ### Config schema versioning

From 3baa1c0261301168c2ff538c6a09bfac41d9b2aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Fr=C3=B8lund?=
Date: Mon, 20 Jan 2025 13:55:58 +0100
Subject: [PATCH 88/96] Tests for _check_and_get_required_attributes and
 _get_derived_variable_function

---
 mllam_data_prep/ops/derive_variable/main.py |   3 +-
 tests/derive_variable/test_main.py          | 131 ++++++++++++++++++++
 2 files changed, 133 insertions(+), 1 deletion(-)
 create mode 100644 tests/derive_variable/test_main.py

diff --git a/mllam_data_prep/ops/derive_variable/main.py b/mllam_data_prep/ops/derive_variable/main.py
index 9af736f..da6ee58 100644
--- a/mllam_data_prep/ops/derive_variable/main.py
+++ b/mllam_data_prep/ops/derive_variable/main.py
@@ -6,6 +6,7 @@
 but also of other physical fields (wind-speed is a function
 of both meridional and zonal wind components).
""" + import importlib import sys @@ -201,7 +202,7 @@ def _check_and_get_required_attributes(field, expected_attributes): ) attrs[attribute] = expected_attributes[attribute] else: - # Attributes are set in the funciton and nothing has been defined in the config file + # Attributes are set in the function and nothing has been defined in the config file attrs[attribute] = field.attrs[attribute] return attrs diff --git a/tests/derive_variable/test_main.py b/tests/derive_variable/test_main.py new file mode 100644 index 0000000..3e8539d --- /dev/null +++ b/tests/derive_variable/test_main.py @@ -0,0 +1,131 @@ +"""Unit tests for the main module of the derive_variable operations.""" + +import pathlib +import sys +from types import ModuleType +from typing import Generator +from unittest.mock import MagicMock, patch + +import pytest +import xarray as xr +from mllam_data_prep.ops.derive_variable.main import ( + _check_and_get_required_attributes, + _get_derived_variable_function, +) + + +@pytest.fixture(name="mock_import_module") +def fixture_mock_import_module() -> Generator[MagicMock, None, None]: + """Fixture to mock importlib.import_module.""" + with patch("importlib.import_module") as mock: + yield mock + + +@pytest.fixture() +def fixture_mock_sys_modules() -> Generator[None, None, None]: + """Fixture to mock sys.modules.""" + with patch.dict("sys.modules", {}): + yield + + +class TestGetDerivedVariableFunction: + """Tests for the _get_derived_variable_function.""" + + @pytest.mark.usefixtures("fixture_mock_sys_modules") + def test_function_in_sys_modules(self, mock_import_module: MagicMock) -> None: + """Test when the function to import is already in sys.modules.""" + # Mock the module and function + mock_module: ModuleType = MagicMock() + mock_function: MagicMock = MagicMock() + sys.modules["mock_module"] = mock_module + mock_module.mock_function = mock_function + + # Call the function + result = _get_derived_variable_function("mock_module.mock_function") + + # Assert 
the function is returned correctly + assert result == mock_function + + # Assert the module was not imported + mock_import_module.assert_not_called() + + def test_function_not_in_sys_modules(self, mock_import_module: MagicMock) -> None: + """Test when the function to import is not in sys.modules.""" + # Mock the module and function + mock_module: ModuleType = MagicMock() + mock_function: MagicMock = MagicMock() + mock_import_module.return_value = mock_module + mock_module.mock_function = mock_function + + # Call the function + result = _get_derived_variable_function("mock_module.mock_function") + + # Assert the function is returned correctly + assert result == mock_function + + @patch("importlib.import_module") + def test_importing_standard_library_module( + self, mock_import_module: MagicMock + ) -> None: + """Test function on standard library module.""" + # Mock the import of pathlib + mock_import_module.return_value = pathlib + + # Remove pathlib from sys.modules if it exists + sys.modules.pop("pathlib", None) + + result = _get_derived_variable_function("pathlib.Path") + + # Assert the function is returned correctly + assert result == pathlib.Path + + +@patch( + "mllam_data_prep.ops.derive_variable.main.REQUIRED_FIELD_ATTRIBUTES", + ["units", "long_name"], +) +class TestCheckAndGetRequiredAttributes: + """Tests for the _check_and_get_required_attributes function.""" + + @pytest.mark.parametrize( + ["field_attrs", "expected_attributes", "expected_result"], + [ + [ + {"units": "m", "long_name": "test"}, + {"units": "m", "long_name": "test"}, + {"units": "m", "long_name": "test"}, + ], + [ + {"units": "m", "long_name": "test"}, + {}, + {"units": "m", "long_name": "test"}, + ], + [ + {"units": "m"}, + {"units": "m", "long_name": "test"}, + {"units": "m", "long_name": "test"}, + ], + [ + {"units": "m", "long_name": "old_name"}, + {"units": "m", "long_name": "new_name"}, + {"units": "m", "long_name": "new_name"}, + ], + ], + ) + def test_valid_input( + self, 
field_attrs, expected_attributes, expected_result + ) -> None: + """Test that the function returns the correct attributes with valid input.""" + field = xr.DataArray([1, 2, 3], attrs=field_attrs) + + result = _check_and_get_required_attributes(field, expected_attributes) + + assert result == expected_result + + def test_missing_attributes_raises_key_error(self) -> None: + """Test when required attributes are missing and not in expected attributes.""" + field = xr.DataArray([1, 2, 3], attrs={"units": "m"}) + expected_attributes = {"units": "m"} + + with pytest.raises(KeyError): + _check_and_get_required_attributes(field, expected_attributes) From 245e97bca0017702431edf03cdc009db3cbbd48e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Fr=C3=B8lund?= Date: Tue, 21 Jan 2025 16:06:47 +0100 Subject: [PATCH 89/96] Restructured test data into fixtures and indirect parametrizations. Split up test_derived_variable.py into test_physical_field.py and test_time_components.py. Removed redundant test from test_main.py --- tests/derive_variable/conftest.py | 32 +++++ tests/derive_variable/test_main.py | 16 --- tests/derive_variable/test_physical_field.py | 75 ++++++++++++ tests/derive_variable/test_time_components.py | 54 +++++++++ tests/test_derived_variables.py | 109 ------------------ 5 files changed, 161 insertions(+), 125 deletions(-) create mode 100644 tests/derive_variable/conftest.py create mode 100644 tests/derive_variable/test_physical_field.py create mode 100644 tests/derive_variable/test_time_components.py delete mode 100644 tests/test_derived_variables.py diff --git a/tests/derive_variable/conftest.py b/tests/derive_variable/conftest.py new file mode 100644 index 0000000..913d0cc --- /dev/null +++ b/tests/derive_variable/conftest.py @@ -0,0 +1,32 @@ +"""Fixtures for the derive_variable module tests.""" + +import datetime +from typing import List + +import isodate +import numpy as np +import pandas as pd +import pytest +import xarray as xr + + 
+@pytest.fixture(name="time") +def fixture_time(request) -> List[np.datetime64 | datetime.datetime | xr.DataArray]: + """Fixture that returns test time data + + The fixture has to be indirectly parametrized with the number of time steps. + """ + ntime = request.param + return [ + np.datetime64("2004-06-11T00:00:00"), # invalid type + isodate.parse_datetime("1999-03-21T00:00"), + xr.DataArray( + pd.date_range( + start=isodate.parse_datetime("1999-03-21T00:00"), + periods=ntime, + freq=isodate.parse_duration("PT1H"), + ), + dims=["time"], + name="time", + ), + ] diff --git a/tests/derive_variable/test_main.py b/tests/derive_variable/test_main.py index 3e8539d..c0bb823 100644 --- a/tests/derive_variable/test_main.py +++ b/tests/derive_variable/test_main.py @@ -63,22 +63,6 @@ def test_function_not_in_sys_modules(self, mock_import_module: MagicMock) -> Non # Assert the function is returned correctly assert result == mock_function - @patch("importlib.import_module") - def test_importing_standard_library_module( - self, mock_import_module: MagicMock - ) -> None: - """Test function on standard library module.""" - # Mock the import of pathlib - mock_import_module.return_value = pathlib - - # Remove pathlib from sys.modules if it exists - sys.modules.pop("pathlib", None) - - result = _get_derived_variable_function("pathlib.Path") - - # Assert the function is returned correctly - assert result == pathlib.Path - @patch( "mllam_data_prep.ops.derive_variable.main.REQUIRED_FIELD_ATTRIBUTES", diff --git a/tests/derive_variable/test_physical_field.py b/tests/derive_variable/test_physical_field.py new file mode 100644 index 0000000..1790bb1 --- /dev/null +++ b/tests/derive_variable/test_physical_field.py @@ -0,0 +1,75 @@ +"""Unit tests for the `mllam_data_prep.ops.derive_variable.physical_field` module.""" + +import datetime +from typing import List + +import numpy as np +import pytest +import xarray as xr +from mllam_data_prep.ops.derive_variable.physical_field import 
calculate_toa_radiation + + +@pytest.fixture(name="lat") +def fixture_lat(request) -> List[float | xr.DataArray]: + """Fixture that returns test latitude data + + The fixture has to be indirectly parametrized with the number of coordinates, + the minimum and maximum latitude values. + """ + ncoord, lat_min, lat_max = request.param + return [ + 55.711, + xr.DataArray( + np.random.uniform(lat_min, lat_max, size=(ncoord, ncoord)), + dims=["x", "y"], + coords={"x": np.arange(ncoord), "y": np.arange(ncoord)}, + name="lat", + ), + ] + + +@pytest.fixture(name="lon") +def fixture_lon(request) -> List[float | xr.DataArray]: + """Fixture that returns test longitude data + + The fixture has to be indirectly parametrized with the number of coordinates, + the minimum and maximum longitude values. + """ + ncoord, lon_min, lon_max = request.param + return [ + 12.564, + xr.DataArray( + np.random.uniform(lon_min, lon_max, size=(ncoord, ncoord)), + dims=["x", "y"], + coords={"x": np.arange(ncoord), "y": np.arange(ncoord)}, + name="lon", + ), + ] + + +@pytest.mark.parametrize( + "lat", + # Format: (ncoord, lat_min, lat_max) + [(10, -90, 90), (10, -40, 40), (10, 40, -40), (10, -10, 10), (1000, -40, 40)], + indirect=True, +) +@pytest.mark.parametrize( + "lon", + # Format: (ncoord, lon_min, lon_max) + [(10, 0, 360), (10, -180, 180), (10, -90, 90), (10, 100, 110), (1000, -180, 180)], + indirect=True, +) +@pytest.mark.parametrize("time", [1, 10, 100], indirect=True) +def test_toa_radiation( + lat: float | xr.DataArray, + lon: float | xr.DataArray, + time: np.datetime64 | datetime.datetime | xr.DataArray, +): + """ + Test the `calculate_toa_radiation` function from mllam_data_prep.derived_variables + """ + if isinstance(time, (xr.DataArray, datetime.datetime)): + calculate_toa_radiation(lat, lon, time) + else: + with pytest.raises(TypeError): + calculate_toa_radiation(lat, lon, time) diff --git a/tests/derive_variable/test_time_components.py b/tests/derive_variable/test_time_components.py 
new file mode 100644 index 0000000..2419e6d --- /dev/null +++ b/tests/derive_variable/test_time_components.py @@ -0,0 +1,54 @@ +"""Unit tests for the `mllam_data_prep.ops.derive_variable.time_components` module.""" + +import datetime + +import numpy as np +import pytest +import xarray as xr +from mllam_data_prep.ops.derive_variable.time_components import ( + calculate_day_of_year, + calculate_hour_of_day, +) + + +@pytest.mark.parametrize("time", [1, 10, 1000], indirect=True) +@pytest.mark.parametrize( + "component", + [ + "cos", + "sin", + ], +) +def test_hour_of_day( + time: np.datetime64 | datetime.datetime | xr.DataArray, component: str +): + """ + Test the `calculate_hour_of_day` function from mllam_data_prep.derived_variables + """ + if isinstance(time, (xr.DataArray, datetime.datetime)): + calculate_hour_of_day(time, component=component) + else: + with pytest.raises(TypeError): + calculate_hour_of_day(time, component=component) + + +@pytest.mark.parametrize("time", [1, 10, 1000], indirect=True) +@pytest.mark.parametrize( + "component", + [ + "cos", + "sin", + ], +) +def test_day_of_year( + time: np.datetime64 | datetime.datetime | xr.DataArray, component: str +): + """ + Test the `calculate_day_of_year` function from mllam_data_prep.derived_variables + """ + + if isinstance(time, (xr.DataArray, datetime.datetime)): + calculate_day_of_year(time, component=component) + else: + with pytest.raises(TypeError): + calculate_day_of_year(time, component=component) diff --git a/tests/test_derived_variables.py b/tests/test_derived_variables.py deleted file mode 100644 index 0fd6108..0000000 --- a/tests/test_derived_variables.py +++ /dev/null @@ -1,109 +0,0 @@ -import datetime - -import isodate -import numpy as np -import pandas as pd -import pytest -import xarray as xr - -import mllam_data_prep as mdp - -NCOORD = 10 -NTIME = 10 -LAT_MIN = -90 -LAT_MAX = 90 -LON_MIN = 0 -LON_MAX = 360 -LATITUDE = [ - 55.711, - xr.DataArray( - np.random.uniform(LAT_MIN, LAT_MAX, 
size=(NCOORD, NCOORD)), - dims=["x", "y"], - coords={"x": np.arange(NCOORD), "y": np.arange(NCOORD)}, - name="lat", - ), -] -LONGITUDE = [ - 12.564, - xr.DataArray( - np.random.uniform(LON_MIN, LON_MAX, size=(NCOORD, NCOORD)), - dims=["x", "y"], - coords={"x": np.arange(NCOORD), "y": np.arange(NCOORD)}, - name="lon", - ), -] -TIME = [ - np.datetime64("2004-06-11T00:00:00"), # invalid type - isodate.parse_datetime("1999-03-21T00:00"), - xr.DataArray( - pd.date_range( - start=isodate.parse_datetime("1999-03-21T00:00"), - periods=NTIME, - freq=isodate.parse_duration("PT1H"), - ), - dims=["time"], - name="time", - ), -] - - -@pytest.mark.parametrize("lat", LATITUDE) -@pytest.mark.parametrize("lon", LONGITUDE) -@pytest.mark.parametrize("time", TIME) -def test_toa_radiation(lat, lon, time): - """ - Test the `calculate_toa_radiation` function from mllam_data_prep.derived_variables - """ - if isinstance(time, (xr.DataArray, datetime.datetime)): - mdp.ops.derive_variable.physical_field.calculate_toa_radiation(lat, lon, time) - else: - with pytest.raises(TypeError): - mdp.ops.derive_variable.physical_field.calculate_toa_radiation( - lat, lon, time - ) - - -@pytest.mark.parametrize("time", TIME) -@pytest.mark.parametrize( - "component", - [ - "cos", - "sin", - ], -) -def test_hour_of_day(time, component): - """ - Test the `calculate_hour_of_day` function from mllam_data_prep.derived_variables - """ - if isinstance(time, (xr.DataArray, datetime.datetime)): - mdp.ops.derive_variable.time_components.calculate_hour_of_day( - time, component=component - ) - else: - with pytest.raises(TypeError): - mdp.ops.derive_variable.time_components.calculate_hour_of_day( - time, component=component - ) - - -@pytest.mark.parametrize("time", TIME) -@pytest.mark.parametrize( - "component", - [ - "cos", - "sin", - ], -) -def test_day_of_year(time, component): - """ - Test the `calculate_day_of_year` function from mllam_data_prep.derived_variables - """ - if isinstance(time, (xr.DataArray, 
datetime.datetime)): - mdp.ops.derive_variable.time_components.calculate_day_of_year( - time, component=component - ) - else: - with pytest.raises(TypeError): - mdp.ops.derive_variable.time_components.calculate_day_of_year( - time, component=component - ) From 325866a5f5f422678384911f4f5d0291eba0c827 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Fr=C3=B8lund?= Date: Wed, 22 Jan 2025 14:10:11 +0100 Subject: [PATCH 90/96] Adjusted docstrings --- tests/derive_variable/test_physical_field.py | 5 +++-- tests/derive_variable/test_time_components.py | 10 ++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/derive_variable/test_physical_field.py b/tests/derive_variable/test_physical_field.py index 1790bb1..f2b346c 100644 --- a/tests/derive_variable/test_physical_field.py +++ b/tests/derive_variable/test_physical_field.py @@ -65,8 +65,9 @@ def test_toa_radiation( lon: float | xr.DataArray, time: np.datetime64 | datetime.datetime | xr.DataArray, ): - """ - Test the `calculate_toa_radiation` function from mllam_data_prep.derived_variables + """Test the `calculate_toa_radiation` function. + + Function from mllam_data_prep.ops.derive_variable.physical_field. """ if isinstance(time, (xr.DataArray, datetime.datetime)): calculate_toa_radiation(lat, lon, time) diff --git a/tests/derive_variable/test_time_components.py b/tests/derive_variable/test_time_components.py index 2419e6d..1e308d5 100644 --- a/tests/derive_variable/test_time_components.py +++ b/tests/derive_variable/test_time_components.py @@ -22,8 +22,9 @@ def test_hour_of_day( time: np.datetime64 | datetime.datetime | xr.DataArray, component: str ): - """ - Test the `calculate_hour_of_day` function from mllam_data_prep.derived_variables + """Test the `calculate_hour_of_day` function. + + Function from mllam_data_prep.ops.derive_variable.time_components. 
""" if isinstance(time, (xr.DataArray, datetime.datetime)): calculate_hour_of_day(time, component=component) @@ -43,8 +44,9 @@ def test_hour_of_day( def test_day_of_year( time: np.datetime64 | datetime.datetime | xr.DataArray, component: str ): - """ - Test the `calculate_day_of_year` function from mllam_data_prep.derived_variables + """Test the `calculate_day_of_year` function. + + Function from mllam_data_prep.ops.derive_variable.time_components. """ if isinstance(time, (xr.DataArray, datetime.datetime)): From c633462fe8a9b588b8f6ff80f81721d591b01e09 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 23 Jan 2025 08:26:17 +0000 Subject: [PATCH 91/96] Add mafdmi as contributor --- CHANGELOG.md | 2 +- pyproject.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 124e544..03159a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- add ability to derive variables from input datasets [\#34](https://github.com/mllam/mllam-data-prep/pull/34), @ealerskans +- add ability to derive variables from input datasets [\#34](https://github.com/mllam/mllam-data-prep/pull/34), @ealerskans, @mafdmi - add github PR template to guide development process on github [\#44](https://github.com/mllam/mllam-data-prep/pull/44), @leifdenby - add support for zarr 3.0.0 and above [\#51](https://github.com/mllam/mllam-data-prep/pull/51), @kashif diff --git a/pyproject.toml b/pyproject.toml index 3edf350..0059e32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ authors = [ {name = "Eleni Briola", email = "elb@dmi.dk"}, {name = "Joel Oskarsson", email = "joel.oskarsson@liu.se"}, {name = "Kashif Rasul", email = "kashif.rasul@gmail.com"}, + {name = "Martin Frølund", email = "maf@dmi.dk"}, ] dependencies = [ "xarray>=2024.2.0", From 
ceb0d21aa6b562fa44e9cebd5dfbeaaac49d9590 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 23 Jan 2025 08:35:11 +0000 Subject: [PATCH 92/96] Linting --- tests/derive_variable/test_main.py | 2 +- tests/derive_variable/test_physical_field.py | 1 + tests/derive_variable/test_time_components.py | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/derive_variable/test_main.py b/tests/derive_variable/test_main.py index c0bb823..804213a 100644 --- a/tests/derive_variable/test_main.py +++ b/tests/derive_variable/test_main.py @@ -1,6 +1,5 @@ """Unit tests for the main module of the derive_variable operations.""" -import pathlib import sys from types import ModuleType from typing import Generator @@ -8,6 +7,7 @@ import pytest import xarray as xr + from mllam_data_prep.ops.derive_variable.main import ( _check_and_get_required_attributes, _get_derived_variable_function, diff --git a/tests/derive_variable/test_physical_field.py b/tests/derive_variable/test_physical_field.py index f2b346c..a3ee7b5 100644 --- a/tests/derive_variable/test_physical_field.py +++ b/tests/derive_variable/test_physical_field.py @@ -6,6 +6,7 @@ import numpy as np import pytest import xarray as xr + from mllam_data_prep.ops.derive_variable.physical_field import calculate_toa_radiation diff --git a/tests/derive_variable/test_time_components.py b/tests/derive_variable/test_time_components.py index 1e308d5..69c8d54 100644 --- a/tests/derive_variable/test_time_components.py +++ b/tests/derive_variable/test_time_components.py @@ -5,6 +5,7 @@ import numpy as np import pytest import xarray as xr + from mllam_data_prep.ops.derive_variable.time_components import ( calculate_day_of_year, calculate_hour_of_day, From 0ecfccaecb04d6918cef66efe2a1b5ef199dd439 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 24 Jan 2025 07:36:56 +0000 Subject: [PATCH 93/96] Prefix 'function' arguments from the dataset with 'ds_input.' 
--- README.md | 64 ++++++++++----------- example.danra.yaml | 12 ++-- mllam_data_prep/config.py | 26 ++++----- mllam_data_prep/ops/derive_variable/main.py | 31 ++++++---- 4 files changed, 70 insertions(+), 63 deletions(-) diff --git a/README.md b/README.md index fe19134..a56bd2f 100644 --- a/README.md +++ b/README.md @@ -179,20 +179,18 @@ inputs: # derive variables to be used as forcings toa_radiation: kwargs: - time: time - lat: lat - lon: lon + time: ds_input.time + lat: ds_input.lat + lon: ds_input.lon function: mllam_data_prep.ops.derive_variable.physical_field.calculate_toa_radiation hour_of_day_sin: kwargs: - time: time - extra_kwargs: + time: ds_input.time component: sin function: mllam_data_prep.ops.derive_variable.time_components.calculate_hour_of_day hour_of_day_cos: kwargs: - time: time - extra_kwargs: + time: ds_input.time component: cos function: mllam_data_prep.ops.derive_variable.time_components.calculate_hour_of_day dim_mapping: @@ -318,20 +316,18 @@ inputs: # derive variables to be used as forcings toa_radiation: kwargs: - time: time - lat: lat - lon: lon - function: mllam_data_prep.derive_variable.physical_field.calculate_toa_radiation + time: ds_input.time + lat: ds_input.lat + lon: ds_input.lon + function: mllam_data_prep.ops.derive_variable.physical_field.calculate_toa_radiation hour_of_day_sin: kwargs: - time: time - extra_kwargs: + time: ds_input.time component: sin function: mllam_data_prep.ops.derive_variable.time_components.calculate_hour_of_day hour_of_day_cos: kwargs: - time: time - extra_kwargs: + time: ds_input.time component: cos function: mllam_data_prep.ops.derive_variable.time_components.calculate_hour_of_day dim_mapping: @@ -360,39 +356,43 @@ The `inputs` section defines the source datasets to extract data from. 
Each sour - `stack`: stack the listed dimension to create the dimension in the output - `stack_variables_by_var_name`: stack the dimension into the new dimension, and also stack the variable name into the new variable name. This is useful when you have multiple variables with the same dimensions that you want to stack into a single variable. - `derived_variables`: defines the variables to be derived from the variables available in the source dataset. This should be a dictionary where each key is the name of the variable to be derived and the value defines a dictionary with the following additional information. See also the 'Derived Variables' section for more details. - - `function`: the function used to derive a variable. This should be a string the full namespace of the function, e.g. `mllam_data_prep.ops.derived_variables.physical_field.calculate_toa_radiation`. - - `kwargs`: `function` arguments that should be extracted from the source dataset. This is a dictionary where each key is the name of the variables to select from the source dataset and each value is the named argument to `function`. - - `extra_kwargs`: `function` arguments that should not be extracted from the source dataset, such as the extra argument `component` to `mllam_data_prep.ops.derived_variables.time_components.calculate_hour_of_day` which is a string (either "sin" or "cos") the decides if the returned field is the sine or cosine component of the cyclically encoded hour of day variable. + - `function`: the function used to derive a variable. This should be a string with the full namespace of the function, e.g. `mllam_data_prep.ops.derived_variables.physical_field.calculate_toa_radiation`. + - `kwargs`: arguments to `function`. This is a dictionary where each key is the named argument to `function` and each value is the input to the function. Here we distinguish between values to be extracted/selected from the input dataset and values supplied by the users themselves. 
Arguments with values to be extracted from the input dataset need to be prefixed with "input_dataset." to distinguish them from other arguments. See the 'Derived Variables' section for more details. #### Derived Variables -Variables that are not part of the source dataset but can be derived from variables in the source dataset can also be included. They should be defined in their own section, called `derived_variables` as illustrated in the example config above and in the `example.danra.yaml` config file. +Variables that are not part of the source dataset but can be derived from variables in the source dataset can also be included. They should be defined in their own section, called `derived_variables` as illustrated in the example config above and in the example config file [example.danra.yaml](example.danra.yaml). + +To derive a variable, the function to be used (`function`) and the arguments to this function (`kwargs`) need to be specified, as explained above. Here we need to distinguish between arguments that should be data from the input dataset and arguments that should be supplied by the users themselves. The example below illustrates how to derive the cosine component of the cyclically encoded hour of day variable -To derive the variables, the function used to derive the variable (`function`) and the arguments to this function (`kwargs` and `extra_kwargs`) need to be specified, as explained above. In addition, an optional section called `attrs` can be added. In this section, the user can add attributes to the derived variable, as illustrated below. 
```yaml derived_variables: - toa_radiation: + hour_of_day_cos: kwargs: - time: time - lat: lat - lon: lon - function: mllam_data_prep.derive_variable.physical_field.calculate_toa_radiation + time: ds_input.time + component: cos + function: mllam_data_prep.ops.derive_variable.time_components.calculate_hour_of_day attrs: - units: W*m**-2 - long_name: top-of-atmosphere incoming radiation + units: 1 + long_name: cos component of cyclically encoded hour of day ``` -Note that the attributes `units` and `long_name` are required. This means that if the function used to derive a variable does not set these attributes they are **required** to be set in the config file. If using a function defined in `mllam_data_prep.ops.derive_variable` the `attrs` section is optional as the attributes should already be defined. In this case, adding the `units` and `long_name` attributes to the `attrs` section of the derived variable in config file will **overwrite** the already-defined attributes in the function. +The function `mllam_data_prep.ops.derive_variable.time_components.calculate_hour_of_day` takes two arguments; `time` and `component`. The `time` argument should extract the `time` variable from the input dataset and has therefore been prefixed with "ds_input." to distinguish it from other arguments that should not be extracted from the source dataset. The `component` argument, on the other hand, is a string (either "sin" or "cos") and decides if the returned derived variable is the sine or cosine component of the cyclically encoded hour of day. + +In addition, an optional section called `attrs` can be added. In this section, the user can add attributes to the derived variable, as illustrated in the example above. Note that the attributes `units` and `long_name` are **required**. This means that if the function used to derive a variable does not set these attributes they are **required** to be set in the config file. 
If using a function defined in `mllam_data_prep.ops.derive_variable` the `attrs` section is optional as the required attributes should already be defined. In this case, adding the `units` and `long_name` attributes to the `attrs` section of the derived variable in config file will **overwrite** the already-defined attributes in the function. It is also possible to set other attributes. This can be done by adding them under the `attrs` section in the same way as shown for `unit` and `long_name` in the example above. Currently, the following derived variables are included as part of `mllam-data-prep`: - `toa_radiation`: - Top-of-atmosphere incoming radiation - function: `mllam_data_prep.ops.derive_variable.physical_field.calculate_toa_radiation` + - arguments: `lat`, `lon`, `time` - `hour_of_day_[sin/cos]`: - - Sine of cosine part of cyclically encoded hour of day - - function: `mllam_data_prep.ops.derive_variable.time_compoents.calculate_hour_of_day` + - Sine or cosine part of cyclically encoded hour of day + - function: `mllam_data_prep.ops.derive_variable.time_components.calculate_hour_of_day` + - arguments: `time`, `component` - `day_of_year_[sin/cos]`: - - Sine of cosine part of cyclically encoded day of year - - function: `mllam_data_prep.ops.derive_variable.time_compoents.calculate_day_of_year` + - Sine or cosine part of cyclically encoded day of year + - function: `mllam_data_prep.ops.derive_variable.time_components.calculate_day_of_year` + - arguments: `time`, `component` ### Config schema versioning diff --git a/example.danra.yaml b/example.danra.yaml index 5101005..4e80a2d 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -65,20 +65,18 @@ inputs: # derive variables to be used as forcings toa_radiation: kwargs: - time: time - lat: lat - lon: lon + time: ds_input.time + lat: ds_input.lat + lon: ds_input.lon function: mllam_data_prep.ops.derive_variable.physical_field.calculate_toa_radiation hour_of_day_sin: kwargs: - time: time - extra_kwargs: + 
time: ds_input.time component: sin function: mllam_data_prep.ops.derive_variable.time_components.calculate_hour_of_day hour_of_day_cos: kwargs: - time: time - extra_kwargs: + time: ds_input.time component: cos function: mllam_data_prep.ops.derive_variable.time_components.calculate_hour_of_day dim_mapping: diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 9735b9b..2246efb 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -99,28 +99,26 @@ class ValueSelection: @dataclass class DerivedVariable: """ - Defines a derived variables, where the kwargs (variables required for the - calculation, to be extracted from the input dataset) and the function (for - calculating the variable) are specified. Also, if the function has other arguments - which should not be extracted from the dataset (e.g. a string to indicate if the - sine or cosine component should be extracted) these can be specified in the extra_kwargs. - Optionally, in case a function does not return an `xr.DataArray` with the required - attributes (`units` and `long_name`) set, these should be specified in `attrs`, e.g.: - {"attrs": "units": "W*m**-2, "long_name": "top-of-the-atmosphere radiation"}. - Additional attributes can also be set if desired. + Defines a derived variables, where the function (for calculating the variable) and + the kwargs (arguments to function) are specified. kwargs can contain both arguments + which should extract/select data from the input dataset, in which case they should + have the "ds_input." prefix to distinguish them from other argument that should not + be extracted from the dataset (e.g. a string to indicate if the sine or cosine + component should be extracted). + + Optionally, attributes to the derived variable can be specified in `attrs`, e.g. + {"attrs": "units": "W*m**-2, "long_name": "top-of-the-atmosphere radiation"}. 
+ In case a function does not return an `xr.DataArray` with the required attributes + (`units` and `long_name`) set, these have to be specified in `attrs`. Attributes: - kwargs: Variables required for calculating the derived variable, to be extracted - from the input dataset. + kwargs: Variables required for calculating the derived variable. function: Function used to calculate the derived variable. - extra_kwargs: Extra arguments for `function` that should not be extracted from - the input dataset, such as a string. attrs: Attributes (e.g. `units` and `long_name`) to set for the derived variable. """ kwargs: Dict[str, str] function: str - extra_kwargs: Optional[Dict[str, str]] = field(default_factory=dict) attrs: Optional[Dict[str, str]] = field(default_factory=dict) diff --git a/mllam_data_prep/ops/derive_variable/main.py b/mllam_data_prep/ops/derive_variable/main.py index da6ee58..789a588 100644 --- a/mllam_data_prep/ops/derive_variable/main.py +++ b/mllam_data_prep/ops/derive_variable/main.py @@ -42,19 +42,30 @@ def derive_variable(ds, derived_variable, chunking): target_dims = list(ds.sizes.keys()) - ds_kwargs = derived_variable.kwargs - extra_kwargs = derived_variable.extra_kwargs function_namespace = derived_variable.function expected_field_attributes = derived_variable.attrs + # Arguments to the function to be selected/extracted from the input dataset + ds_kwargs = { + key: val.rpartition(".")[2] + for key, val in derived_variable.kwargs.items() + if "ds_input" in val + } + # Other arguments that should not be selected/extracted form the input dataset + other_kwargs = { + key: val + for key, val in derived_variable.kwargs.items() + if "ds_input" not in val + } + # Separate the lat,lon from the required variables as these will be derived separately logger.warning( "Assuming that the lat/lon coordinates are given as variables called" " 'lat' and 'lon'." 
) latlon_coords_to_include = {} - for key in list(ds_kwargs.keys()): - if key in ["lat", "lon"]: + for key, val in list(ds_kwargs.items()): + if val in ["lat", "lon"]: latlon_coords_to_include[key] = ds_kwargs.pop(key) # Get subset of input dataset for calculating derived variables @@ -82,15 +93,15 @@ def derive_variable(ds, derived_variable, chunking): # Add function arguments to kwargs kwargs = {} - # - Add lat, and lon, if used as arguments + # - lat, and lon, if used as arguments if len(latlon_coords_to_include): latlon = get_latlon_coords_for_input(ds) for key, val in latlon_coords_to_include.items(): - kwargs[val] = latlon[key] - # Add variables extracted from the input dataset - kwargs.update({val: ds_subset[key] for key, val in ds_kwargs.items()}) - # Add extra arguments - kwargs.update(extra_kwargs) + kwargs[key] = latlon[key] + # - variables selected/extracted from the input dataset + kwargs.update({key: ds_subset[val] for key, val in ds_kwargs.items()}) + # - other arguments + kwargs.update(other_kwargs) # Get the function func = _get_derived_variable_function(function_namespace) From 97ee6dde8b74a71389e8143c6e73ae71f89dc301 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 24 Jan 2025 12:11:22 +0000 Subject: [PATCH 94/96] Minor updates according to review --- README.md | 2 +- mllam_data_prep/ops/chunking.py | 4 +-- mllam_data_prep/ops/derive_variable/main.py | 36 +++------------------ mllam_data_prep/ops/subsetting.py | 6 ++-- 4 files changed, 10 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index a56bd2f..802e62d 100644 --- a/README.md +++ b/README.md @@ -357,7 +357,7 @@ The `inputs` section defines the source datasets to extract data from. Each sour - `stack_variables_by_var_name`: stack the dimension into the new dimension, and also stack the variable name into the new variable name. This is useful when you have multiple variables with the same dimensions that you want to stack into a single variable. 
- `derived_variables`: defines the variables to be derived from the variables available in the source dataset. This should be a dictionary where each key is the name of the variable to be derived and the value defines a dictionary with the following additional information. See also the 'Derived Variables' section for more details. - `function`: the function used to derive a variable. This should be a string with the full namespace of the function, e.g. `mllam_data_prep.ops.derived_variables.physical_field.calculate_toa_radiation`. - - `kwargs`: arguments to `function`. This is a dictionary where each key is the named argument to `function` and each value is the input to the function. Here we distinguish between values to be extracted/selected from the input dataset and values supplied by the users themselves. Arguments with values to be extracted from the input dataset need to be prefixed with "input_dataset." to distinguish them from other arguments. See the 'Derived Variables' section for more details. + - `kwargs`: arguments to `function`. This is a dictionary where each key is the named argument to `function` and each value is the input to the function. Here we distinguish between values to be extracted/selected from the input dataset and values supplied by the users themselves. Arguments with values to be extracted from the input dataset need to be prefixed with "ds_input." to distinguish them from other arguments. See the 'Derived Variables' section for more details. #### Derived Variables Variables that are not part of the source dataset but can be derived from variables in the source dataset can also be included. They should be defined in their own section, called `derived_variables` as illustrated in the example config above and in the example config file [example.danra.yaml](example.danra.yaml). 
diff --git a/mllam_data_prep/ops/chunking.py b/mllam_data_prep/ops/chunking.py index dfac4b1..3d88e51 100644 --- a/mllam_data_prep/ops/chunking.py +++ b/mllam_data_prep/ops/chunking.py @@ -7,7 +7,7 @@ def check_chunk_size(ds, chunks): """ - Check the chunk size and warn if it exceed CHUNK_MAX_SIZE_WARNING. + Check the chunk size and warn if it exceeds CHUNK_MAX_SIZE_WARNING. Parameters ---------- @@ -45,7 +45,7 @@ def check_chunk_size(ds, chunks): def chunk_dataset(ds, chunks): """ - Check the chunk size and chunk dataset. + Check the chunk size and chunk the dataset. Parameters ---------- diff --git a/mllam_data_prep/ops/derive_variable/main.py b/mllam_data_prep/ops/derive_variable/main.py index 789a588..e44a694 100644 --- a/mllam_data_prep/ops/derive_variable/main.py +++ b/mllam_data_prep/ops/derive_variable/main.py @@ -20,7 +20,7 @@ def derive_variable(ds, derived_variable, chunking): """ - Load the dataset, and derive the specified variables + Derive a variable using the `function` and `kwargs` of `derived_variable`. Parameters --------- @@ -118,9 +118,9 @@ def derive_variable(ds, derived_variable, chunking): derived_field.attrs.update(derived_field_attrs) # Return any dropped/reset coordinates - derived_field = _return_dropped_coordinates( - derived_field, ds_subset, required_coordinates, chunks - ) + for req_coord in required_coordinates: + if req_coord in chunks: + derived_field.coords[req_coord] = ds_subset[req_coord] # Align the derived field to the output dataset dimensions (if necessary) derived_field = _align_derived_variable(derived_field, ds, target_dims) @@ -219,34 +219,6 @@ def _check_and_get_required_attributes(field, expected_attributes): return attrs -def _return_dropped_coordinates(field, ds, required_coordinates, chunks): - """ - Return the coordinates that have been dropped/reset. 
- - Parameters - ---------- - field: xr.DataArray - Derived variable - ds: xr.Dataset - Dataset with required coordinatwes - required_coordinates: List[str] - List of coordinates required for the derived variable - chunks: Dict[str, int] - Dictionary with keys as dimensions to be chunked and - chunk sizes as the values - - Returns - ------- - field: xr.DataArray - Derived variable, now also with dropped coordinates returned - """ - for req_coord in required_coordinates: - if req_coord in chunks: - field.coords[req_coord] = ds[req_coord] - - return field - - def _align_derived_variable(field, ds, target_dims): """ Align a derived variable to the target dimensions (ignoring non-dimension coordinates). diff --git a/mllam_data_prep/ops/subsetting.py b/mllam_data_prep/ops/subsetting.py index abdd59a..80f2ce1 100644 --- a/mllam_data_prep/ops/subsetting.py +++ b/mllam_data_prep/ops/subsetting.py @@ -1,8 +1,8 @@ def extract_variable(ds, var_name, coords_to_sample=dict()): """ - Extract specified variable from the provided the input dataset. If - coordinates for subsetting are defines, then subset the variable along - them and check coordinate units + Extract specified variable from the provided input dataset. If + coordinates for subsetting are defined, then subset the variable along + them and check coordinate units. 
Parameters ---------- From 14beca824697f600e259241b63af90e779173dc2 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Mon, 27 Jan 2025 11:52:34 +0000 Subject: [PATCH 95/96] Change back example in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 802e62d..ee97317 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ The package can also be used as a python module to create datasets directly, for import mllam_data_prep as mdp config_path = "example.danra.yaml" -config = mdp.Config.load_config(config_path) +config = mdp.Config.from_yaml_file(config_path) ds = mdp.create_dataset(config=config) ``` From 209a8d871c94da5125ae9bb40c5237a60f6df85d Mon Sep 17 00:00:00 2001 From: ealerskans Date: Mon, 27 Jan 2025 11:56:04 +0000 Subject: [PATCH 96/96] Update docstring for 'get_latlon_coords_for_input' --- mllam_data_prep/ops/derive_variable/main.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mllam_data_prep/ops/derive_variable/main.py b/mllam_data_prep/ops/derive_variable/main.py index e44a694..22a3186 100644 --- a/mllam_data_prep/ops/derive_variable/main.py +++ b/mllam_data_prep/ops/derive_variable/main.py @@ -255,5 +255,9 @@ def _align_derived_variable(field, ds, target_dims): def get_latlon_coords_for_input(ds): - """Dummy function for getting lat and lon.""" + """ + Placeholder function for getting latitude and longitude values. + This will eventually be replaced by routines handling proper projection support + (see https://github.com/mllam/mllam-data-prep/pull/38). + """ return ds[["lat", "lon"]].chunk(-1, -1)