2 changes: 1 addition & 1 deletion Snakefile
@@ -87,7 +87,7 @@ rule prepare_restart_directory:
         ('fv_core.res', open_cubed_sphere(params.core)),
         ('fv_srf_wnd.res', open_cubed_sphere(params.srf_wnd)),
         ('sfc_data', xr.open_dataset(input.sfc_data)),
-        ('grid_spec', xr.open_mfdataset(sorted(input.grid_spec), concat_dim='tiles'))
+        ('grid_spec', xr.open_mfdataset(sorted(input.grid_spec), concat_dim='tile'))
     ]

     make_experiment(
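Note: the renamed dimension shows up downstream wherever these grid files are opened. A minimal sketch, assuming six hypothetical per-tile grid_spec files (the filenames are illustrative, not from this repository):

```python
import xarray as xr

# Hypothetical per-tile grid files; combine='nested' is required alongside
# concat_dim in recent xarray versions.
paths = sorted(f"grid_spec.tile{n}.nc" for n in range(1, 7))
grid_spec = xr.open_mfdataset(paths, concat_dim="tile", combine="nested")
grid_spec.isel(tile=0)  # select the first tile along the renamed dimension
```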
73 changes: 36 additions & 37 deletions src/data/cubedsphere.py
@@ -1,39 +1,43 @@
"""Tools for working with cubedsphere data"""
import xarray as xr
from itertools import product
from toolz import groupby
import numpy as np


-#TODO write a test for this method
-def combine_subtiles(tiles):
-    """Combine subtiles of a cubed-sphere dataset
-
-    In v.12 of xarray, combined_by_coords behaves differently with DataArrays
-    and Datasets. It was broadcasting all the variables to a common set of
-    dimensions, dramatically increasing the size of the dataset.
+NUM_TILES = 6
+SUBTILE_FILE_PATTERN = '{prefix}.tile{tile:d}.nc.{subtile:04d}'
+
+
+# TODO: write a test for this method
+def combine_subtiles(subtiles):
+    """Combine subtiles of a cubed-sphere dataset
+
+    In v.12 of xarray, with data_vars='all' by default, combine_by_coords
+    broadcasts all the variables to a common set of dimensions, dramatically
+    increasing the size of the dataset. In some cases this can be avoided by
+    using data_vars='minimal'; however, it then fails when combining data
+    variables that contain missing values, and it is also slow because it
+    performs equality checks between the variables it combines.
 
     To work around this issue, we combine the data variables of the dataset one
     at a time with the default data_vars='all', and then merge them back
     together.
     """
-    data_vars = list(tiles[0])
+    sample_subtile = subtiles[0]
     output_vars = []
-    for key in data_vars:
-        # the to_dataset is needed to avoid a strange error
-        tiles_for_var = [tile[key].to_dataset() for tile in tiles]
-        combined = xr.combine_by_coords(tiles_for_var)
+    for key in sample_subtile:
+        # combine_by_coords currently does not work on DataArrays; see
+        # https://github.com/pydata/xarray/issues/3248.
+        subtiles_for_var = [subtile[key].to_dataset() for subtile in subtiles]
+        combined = xr.combine_by_coords(subtiles_for_var)
         output_vars.append(combined)
     return xr.merge(output_vars)
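To illustrate the workaround, here is a minimal self-contained sketch; the toy data and the `make_subtile` helper are hypothetical, not part of the repository. Four subtiles tiling a 4x4 grid are combined variable-by-variable and merged back into one dataset:

```python
import numpy as np
import xarray as xr

def make_subtile(x0, y0):
    # Hypothetical 2x2 subtile with monotonic x/y coordinates.
    return xr.Dataset(
        {'t': (('x', 'y'), np.random.rand(2, 2))},
        coords={'x': [x0, x0 + 1], 'y': [y0, y0 + 1]},
    )

subtiles = [make_subtile(x0, y0) for x0 in (0, 2) for y0 in (0, 2)]
combined = combine_subtiles(subtiles)
assert combined['t'].shape == (4, 4)  # the 2x2 layout is reassembled
```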


-def file_names(prefix, num_tiles=6, num_subtiles=16):
-    tiles = list(range(1, num_tiles + 1))
-    subtiles = list(range(num_subtiles))
-
-    for (tile, proc) in product(tiles, subtiles):
-        filename = prefix + f'.tile{tile:d}.nc.{proc:04d}'
-        yield tile, proc, filename
+def subtile_filenames(prefix, tile, pattern=SUBTILE_FILE_PATTERN,
+                      num_subtiles=16):
+    for subtile in range(num_subtiles):
+        yield pattern.format(prefix=prefix, tile=tile, subtile=subtile)
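A quick usage sketch of the generator above (the prefix and subtile count are illustrative):

```python
list(subtile_filenames('fv_core.res', tile=1, num_subtiles=2))
# -> ['fv_core.res.tile1.nc.0000', 'fv_core.res.tile1.nc.0001']
```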

 #TODO test this
 def remove_duplicate_coords(ds):
     deduped_indices = {}
     for dim in ds.dims:
@@ -45,19 +49,14 @@ def remove_duplicate_coords(ds):
 def open_cubed_sphere(prefix: str, **kwargs):
     """Open cubed-sphere data
 
-    Args:
-        prefix: the beginning part of the filename before the `.tile1.nc.0001`
-            part
-    """
-    files = file_names(prefix, **kwargs)
-    datasets = ((tile, xr.open_dataset(path, chunks={}))
-                for tile, proc, path in files)
-    tiles = groupby(lambda x: x[0], datasets)
-    tiles = {tile: [data for tile, data in values]
-             for tile, values in tiles.items()}
-
-    combined_tiles = [combine_subtiles(datasets).assign_coords(tiles=tile)
-                      for tile, datasets in tiles.items()]
-
-    data = xr.concat(combined_tiles, dim='tiles').sortby('tiles')
-    return remove_duplicate_coords(data)
+    Args:
+        prefix: the beginning part of the filename before the `.tile1.nc.0001`
+            part
+    """
+    tiles = []
+    for tile in range(1, NUM_TILES + 1):
+        files = subtile_filenames(prefix, tile, **kwargs)
+        subtiles = [xr.open_dataset(file, chunks={}) for file in files]
+        combined = combine_subtiles(subtiles).assign_coords(tile=tile)
+        tiles.append(remove_duplicate_coords(combined))
+    return xr.concat(tiles, dim='tile')
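A hypothetical usage sketch of `open_cubed_sphere`, assuming restart pieces named like `fv_core.res.tile1.nc.0000` exist on disk:

```python
ds = open_cubed_sphere('fv_core.res')
assert ds.sizes['tile'] == 6
tile3 = ds.sel(tile=3)  # 'tile' is a coordinate with values 1..6
```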
Empty file removed src/data/cubesphere.py
42 changes: 42 additions & 0 deletions src/data/test_cubedsphere.py
@@ -0,0 +1,42 @@
+import pytest
+import xarray as xr
+
+
+from .cubedsphere import remove_duplicate_coords
+
+
+@pytest.mark.parametrize(
+    ('x', 'y', 'data', 'expected_x', 'expected_y', 'expected_data'),
+    [
+        ([1, 1], [3, 4], [[1, 2], [3, 4]], [1], [3, 4], [[1, 2]]),
+        ([1, 2], [3, 3], [[1, 2], [3, 4]], [1, 2], [3], [[1], [3]]),
+        ([1, 1], [3, 3], [[1, 2], [3, 4]], [1], [3], [[1]]),
+        ([1, 2], [3, 4], [[1, 2], [3, 4]], [1, 2], [3, 4], [[1, 2], [3, 4]])
+    ],
+    ids=['duplicate x', 'duplicate y', 'duplicate x and y', 'no duplicates']
+)
+def test_remove_duplicate_coords(
+    x, y, data, expected_x, expected_y, expected_data
+):
+    x = xr.DataArray(x, coords=[x], dims=['x'])
+    y = xr.DataArray(y, coords=[y], dims=['y'])
+    data = xr.DataArray(data, coords=[x, y], dims=['x', 'y'], name='foo')
+
+    expected_x = xr.DataArray(expected_x, coords=[expected_x], dims=['x'])
+    expected_y = xr.DataArray(expected_y, coords=[expected_y], dims=['y'])
+    expected = xr.DataArray(
+        expected_data,
+        coords=[expected_x, expected_y],
+        dims=['x', 'y'],
+        name='foo'
+    )
+
+    # Test the DataArray case
+    result = remove_duplicate_coords(data)
+    xr.testing.assert_identical(result, expected)
+
+    # Test the Dataset case
+    data = data.to_dataset()
+    expected = expected.to_dataset()
+    result = remove_duplicate_coords(data)
+    xr.testing.assert_identical(result, expected)
4 changes: 2 additions & 2 deletions src/fv3/coarsen.py
@@ -73,7 +73,7 @@ def _combine_subtiles(tiles):
 def combine_subtiles(args_list):
     tile, args_list = args_list
     subtiles = [arg[1] for arg in args_list]
-    return tile, _combine_subtiles(subtiles).assign(tiles=tile)
+    return tile, _combine_subtiles(subtiles).assign(tile=tile)
 
 
 def tile(args):
@@ -115,4 +115,4 @@ def process(args):
         .map(lambda x: x[1])
     )
 
-    return xr.concat(procs.compute(), dim='tiles')
+    return xr.concat(procs.compute(), dim='tile')
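For reference, a tiny hypothetical sketch of the concatenation pattern used above: per-tile datasets each carry a scalar tile label and are stacked along the renamed 'tile' dimension (the toy data is illustrative; a coordinate is attached here, analogous to the `assign(tile=tile)` call above):

```python
import xarray as xr

# Hypothetical per-tile datasets with a scalar 'tile' coordinate each.
tiles = [
    xr.Dataset({'t': ('x', [float(i)])}).assign_coords(tile=i)
    for i in range(1, 7)
]
ds = xr.concat(tiles, dim='tile')
assert list(ds['tile'].values) == [1, 2, 3, 4, 5, 6]
```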
2 changes: 1 addition & 1 deletion src/fv3/docker.py
@@ -40,7 +40,7 @@ def save_tiles_separately(sfc_data, prefix, output_directory):
     for i in range(6):
         output_path = join(output_directory, f"{prefix}.tile{i+1}.nc")
         logging.info(f"saving data to {output_path}")
-        sfc_data.isel(tiles=i).to_netcdf(output_path)
+        sfc_data.isel(tile=i).to_netcdf(output_path)
 
 
 def rundir(directory):
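A hypothetical usage sketch of the function above, assuming a dataset with a six-element 'tile' dimension (the variable name, prefix, and output path are illustrative):

```python
import numpy as np
import xarray as xr

# Toy stand-in for surface data on six tiles.
sfc_data = xr.Dataset({'slmsk': (('tile', 'x', 'y'), np.zeros((6, 4, 4)))})
save_tiles_separately(sfc_data, 'sfc_data', output_directory='/tmp')
# writes /tmp/sfc_data.tile1.nc ... /tmp/sfc_data.tile6.nc
```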