Add test of remove_duplicate_coords; simplify open_cubed_sphere #13
`cubedsphere.py` (the module name follows from the test's `from .cubedsphere import ...`):

```diff
@@ -1,39 +1,12 @@
 """Tools for working with cubedsphere data"""
+import pandas as pd
 import xarray as xr
-from itertools import product
-from toolz import groupby
-import numpy as np
 
 
-#TODO write a test for this method
-def combine_subtiles(tiles):
-    """Combine subtiles of a cubed-sphere dataset
+NUM_TILES = 6
 
-    In v0.12 of xarray, combine_by_coords behaves differently with DataArrays
-    and Datasets. It was broadcasting all the variables to a common set of
-    dimensions, dramatically increasing the size of the dataset.
-
-    """
-    data_vars = list(tiles[0])
-    output_vars = []
-    for key in data_vars:
-        # the to_dataset is needed to avoid a strange error
-        tiles_for_var = [tile[key].to_dataset() for tile in tiles]
-        combined = xr.combine_by_coords(tiles_for_var)
-        output_vars.append(combined)
-    return xr.merge(output_vars)
-
-
-def file_names(prefix, num_tiles=6, num_subtiles=16):
-
-    tiles = list(range(1, num_tiles + 1))
-    subtiles = list(range(num_subtiles))
-
-    for (tile, proc) in product(tiles, subtiles):
-        filename = prefix + f'.tile{tile:d}.nc.{proc:04d}'
-        yield tile, proc, filename
 
+
 #TODO test this
 def remove_duplicate_coords(ds):
     deduped_indices = {}
     for dim in ds.dims:
@@ -42,22 +15,26 @@ def remove_duplicate_coords(ds):
     return ds.isel(deduped_indices)
 
 
+# TODO(Spencer): write a test of this function
+def read_tile(prefix, tile, num_subtiles=16):
+    subtiles = range(num_subtiles)
+    filenames = [f'{prefix}.tile{tile:d}.nc.{proc:04d}' for proc in subtiles]
+    return xr.open_mfdataset(
+        filenames,
+        data_vars='minimal',
+        combine='by_coords'
+    )
+
+
+# TODO(Spencer): write a test of this function
 def open_cubed_sphere(prefix: str, **kwargs):
     """Open cubed-sphere data
 
     Args:
         prefix: the beginning part of the filename before the `.tile1.nc.0001`
             part
     """
-    files = file_names(prefix, **kwargs)
-    datasets = ((tile, xr.open_dataset(path, chunks={}))
-                for tile, proc, path in files)
-    tiles = groupby(lambda x: x[0], datasets)
-    tiles = {tile: [data for tile, data in values]
-             for tile, values in tiles.items()}
-
-    combined_tiles = [combine_subtiles(datasets).assign_coords(tiles=tile)
-                      for tile, datasets in tiles.items()]
-
-    data = xr.concat(combined_tiles, dim='tiles').sortby('tiles')
-    return remove_duplicate_coords(data)
+    tile_index = pd.Index(range(1, NUM_TILES + 1), name='tiles')
+    tiles = [read_tile(prefix, tile, **kwargs) for tile in tile_index]
+    combined = xr.concat(tiles, dim=tile_index)
+    return remove_duplicate_coords(combined)
```
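For context, a quick sketch of how the refactored functions might be called. The `sfc_data` prefix and the package-level import path are hypothetical; with the defaults above, `open_cubed_sphere` expects files named like `sfc_data.tile1.nc.0000` through `sfc_data.tile6.nc.0015`:

```python
from cubedsphere import open_cubed_sphere, read_tile

# Hypothetical file prefix; num_subtiles is forwarded to read_tile via **kwargs.
ds = open_cubed_sphere('sfc_data', num_subtiles=16)
print(ds.tiles)  # the concat dimension runs from tile 1 to 6

# A single tile can also be opened on its own:
tile3 = read_tile('sfc_data', tile=3)
```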
New test file:

```diff
@@ -0,0 +1,42 @@
+import pytest
+import xarray as xr
+
+
+from .cubedsphere import remove_duplicate_coords
+
+
+@pytest.mark.parametrize(
+    ('x', 'y', 'data', 'expected_x', 'expected_y', 'expected_data'),
+    [
+        ([1, 1], [3, 4], [[1, 2], [3, 4]], [1], [3, 4], [[1, 2]]),
+        ([1, 2], [3, 3], [[1, 2], [3, 4]], [1, 2], [3], [[1], [3]]),
+        ([1, 1], [3, 3], [[1, 2], [3, 4]], [1], [3], [[1]]),
+        ([1, 2], [3, 4], [[1, 2], [3, 4]], [1, 2], [3, 4], [[1, 2], [3, 4]])
+    ],
+    ids=['duplicate x', 'duplicate y', 'duplicate x and y', 'no duplicates']
+)
+def test_remove_duplicate_coords(
+    x, y, data, expected_x, expected_y, expected_data
+):
+    x = xr.DataArray(x, coords=[x], dims=['x'])
+    y = xr.DataArray(y, coords=[y], dims=['y'])
+    data = xr.DataArray(data, coords=[x, y], dims=['x', 'y'], name='foo')
+
+    expected_x = xr.DataArray(expected_x, coords=[expected_x], dims=['x'])
+    expected_y = xr.DataArray(expected_y, coords=[expected_y], dims=['y'])
+    expected = xr.DataArray(
+        expected_data,
+        coords=[expected_x, expected_y],
+        dims=['x', 'y'],
+        name='foo'
+    )
+
+    # Test the DataArray case
+    result = remove_duplicate_coords(data)
+    xr.testing.assert_identical(result, expected)
+
+    # Test the Dataset case
+    data = data.to_dataset()
+    expected = expected.to_dataset()
+    result = remove_duplicate_coords(data)
+    xr.testing.assert_identical(result, expected)
```
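The body of `remove_duplicate_coords` is collapsed in the diff above (only its first and last lines are visible), so here is a minimal sketch of an implementation that would satisfy these tests. It is an illustration consistent with the visible skeleton, not necessarily the code in this PR:

```python
import numpy as np


def remove_duplicate_coords(ds):
    """Keep only the first occurrence of each index value along every dim."""
    deduped_indices = {}
    for dim in ds.dims:
        if dim not in ds.coords:
            continue  # nothing to deduplicate on an index-free dimension
        # np.unique returns the positions of first occurrences (in value
        # order); sorting restores the original ordering of the kept labels.
        _, positions = np.unique(ds[dim].values, return_index=True)
        deduped_indices[dim] = np.sort(positions)
    return ds.isel(deduped_indices)
```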
Could we make this filename pattern a keyword argument? I get a little worried about having hardcoded strings too far down the call stack.
Are you suggesting something like this?
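The attached suggestion is not preserved in this view; a minimal sketch of the idea (the `pattern` argument name and its default are assumptions, not code from this PR):

```python
def read_tile(prefix, tile, num_subtiles=16,
              pattern='{prefix}.tile{tile:d}.nc.{proc:04d}'):
    # The filename pattern is a keyword argument instead of a hardcoded
    # f-string, so callers can override it from the top of the call stack.
    filenames = [pattern.format(prefix=prefix, tile=tile, proc=proc)
                 for proc in range(num_subtiles)]
    return xr.open_mfdataset(filenames, data_vars='minimal',
                             combine='by_coords')
```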
Exactly. It's not perfect, since I would prefer that the file-name handling code be completely separate from the combining code, but it is okay for now.