Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 18 additions & 41 deletions src/data/cubedsphere.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,12 @@
"""Tools for working with cubedsphere data"""
import pandas as pd
import xarray as xr
from itertools import product
from toolz import groupby
import numpy as np


#TODO write a test for this method
def combine_subtiles(tiles):
    """Combine subtiles of a cubed-sphere dataset.

    Combines the subtiles one variable at a time rather than handing whole
    Datasets to ``xr.combine_by_coords``: in v0.12 of xarray,
    ``combine_by_coords`` behaves differently with DataArrays and Datasets —
    it was broadcasting all the variables to a common set of dimensions,
    dramatically increasing the size of the dataset.

    Args:
        tiles: non-empty sequence of xarray Datasets, each holding one
            subtile.  All subtiles are assumed to share the same set of
            data variables as the first one.

    Returns:
        A single xarray Dataset with each variable combined by coordinates
        across the subtiles.
    """
    data_vars = list(tiles[0])
    output_vars = []
    for key in data_vars:
        # the to_dataset is needed to avoid a strange error when combining
        # bare DataArrays
        tiles_for_var = [tile[key].to_dataset() for tile in tiles]
        combined = xr.combine_by_coords(tiles_for_var)
        output_vars.append(combined)
    return xr.merge(output_vars)


def file_names(prefix, num_tiles=6, num_subtiles=16):
    """Yield ``(tile, subtile, filename)`` triples for a cubed-sphere run.

    Args:
        prefix: path prefix shared by every file of the run.
        num_tiles: number of cubed-sphere tiles (1-based numbering).
        num_subtiles: number of subtile (processor) files per tile,
            numbered from 0.

    Yields:
        Tuples ``(tile, proc, filename)`` where the filename has the form
        ``<prefix>.tile<tile>.nc.<proc:04d>``.
    """
    for tile in range(1, num_tiles + 1):
        for proc in range(num_subtiles):
            yield tile, proc, f'{prefix}.tile{tile:d}.nc.{proc:04d}'

#TODO test this
def remove_duplicate_coords(ds):
deduped_indices = {}
for dim in ds.dims:
Expand All @@ -42,22 +15,26 @@ def remove_duplicate_coords(ds):
return ds.isel(deduped_indices)


# TODO(Spencer): write a test of this function
def read_tile(prefix, tile, num_subtiles=16,
              pattern='{prefix}.tile{tile:d}.nc.{proc:04d}'):
    """Open and lazily combine all subtile files of one cubed-sphere tile.

    Args:
        prefix: path prefix shared by every file of the run.
        tile: 1-based tile number.
        num_subtiles: number of subtile (processor) files for this tile,
            numbered from 0.
        pattern: filename template formatted with ``prefix``, ``tile`` and
            ``proc``.  Exposed as a keyword argument so the filename layout
            is not a hard-coded string deep in the call stack.

    Returns:
        An xarray Dataset combining the subtiles by coordinates.
    """
    filenames = [
        pattern.format(prefix=prefix, tile=tile, proc=proc)
        for proc in range(num_subtiles)
    ]
    # data_vars='minimal' avoids broadcasting every variable to a common
    # set of dimensions when combining.
    return xr.open_mfdataset(
        filenames,
        data_vars='minimal',
        combine='by_coords'
    )


# Number of faces of the cubed sphere.
NUM_TILES = 6


# TODO(Spencer): write a test of this function
def open_cubed_sphere(prefix: str, **kwargs):
    """Open cubed-sphere data

    Args:
        prefix: the beginning part of the filename before the `.tile1.nc.0001`
            part
        **kwargs: forwarded to ``read_tile`` (e.g. ``num_subtiles`` or
            ``pattern``).

    Returns:
        An xarray Dataset with all six tiles concatenated along a ``tiles``
        dimension (1-based) and duplicate coordinate values removed.
    """
    # A named pandas Index both labels the concat dimension and supplies
    # the 1-based tile coordinate values.
    tile_index = pd.Index(range(1, NUM_TILES + 1), name='tiles')
    tiles = [read_tile(prefix, tile, **kwargs) for tile in tile_index]
    combined = xr.concat(tiles, dim=tile_index)
    # Subtile boundaries overlap, so drop repeated coordinate values.
    return remove_duplicate_coords(combined)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would be in favor of calling remove_duplicate_coords in read_tile since that is where the problem arises.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure thing -- I'll change this in my next commit.

Empty file removed src/data/cubesphere.py
Empty file.
42 changes: 42 additions & 0 deletions src/data/test_cubedsphere.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import pytest
import xarray as xr


from .cubedsphere import remove_duplicate_coords


@pytest.mark.parametrize(
    ('x', 'y', 'data', 'expected_x', 'expected_y', 'expected_data'),
    [
        ([1, 1], [3, 4], [[1, 2], [3, 4]], [1], [3, 4], [[1, 2]]),
        ([1, 2], [3, 3], [[1, 2], [3, 4]], [1, 2], [3], [[1], [3]]),
        ([1, 1], [3, 3], [[1, 2], [3, 4]], [1], [3], [[1]]),
        ([1, 2], [3, 4], [[1, 2], [3, 4]], [1, 2], [3, 4], [[1, 2], [3, 4]])
    ],
    ids=['duplicate x', 'duplicate y', 'duplicate x and y', 'no duplicates']
)
def test_remove_duplicate_coords(
    x, y, data, expected_x, expected_y, expected_data
):
    """remove_duplicate_coords keeps only the first occurrence of each
    duplicated coordinate value along every dimension, for both the
    DataArray and the Dataset case."""
    # Rebind the plain lists as self-labeled coordinate DataArrays so
    # `data` below carries x/y coordinates (with duplicates preserved).
    x = xr.DataArray(x, coords=[x], dims=['x'])
    y = xr.DataArray(y, coords=[y], dims=['y'])
    data = xr.DataArray(data, coords=[x, y], dims=['x', 'y'], name='foo')

    # Build the deduplicated DataArray we expect back, with the same name
    # so assert_identical compares names as well as values.
    expected_x = xr.DataArray(expected_x, coords=[expected_x], dims=['x'])
    expected_y = xr.DataArray(expected_y, coords=[expected_y], dims=['y'])
    expected = xr.DataArray(
        expected_data,
        coords=[expected_x, expected_y],
        dims=['x', 'y'],
        name='foo'
    )

    # Test the DataArray case
    result = remove_duplicate_coords(data)
    xr.testing.assert_identical(result, expected)

    # Test the Dataset case
    data = data.to_dataset()
    expected = expected.to_dataset()
    result = remove_duplicate_coords(data)
    xr.testing.assert_identical(result, expected)