Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 18 additions & 41 deletions src/data/cubedsphere.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,12 @@
"""Tools for working with cubedsphere data"""
import pandas as pd
import xarray as xr
from itertools import product
from toolz import groupby
import numpy as np


#TODO write a test for this method
def combine_subtiles(tiles):
    """Combine subtiles of a cubed-sphere dataset.

    Combines the subtiles one variable at a time rather than handing whole
    Datasets to ``xr.combine_by_coords``: in v0.12 of xarray,
    ``combine_by_coords`` behaves differently with DataArrays and Datasets —
    it was broadcasting all the variables to a common set of dimensions,
    dramatically increasing the size of the dataset.

    Args:
        tiles: non-empty sequence of xarray Datasets, each holding one
            subtile.  All subtiles are assumed to share the same set of
            data variables as the first one.

    Returns:
        A single xarray Dataset with each variable combined by coordinates
        across the subtiles.
    """
    data_vars = list(tiles[0])
    output_vars = []
    for key in data_vars:
        # the to_dataset is needed to avoid a strange error when combining
        # bare DataArrays
        tiles_for_var = [tile[key].to_dataset() for tile in tiles]
        combined = xr.combine_by_coords(tiles_for_var)
        output_vars.append(combined)
    return xr.merge(output_vars)


def file_names(prefix, num_tiles=6, num_subtiles=16):
    """Yield ``(tile, subtile, filename)`` triples for a cubed-sphere run.

    Args:
        prefix: path prefix shared by every file of the run.
        num_tiles: number of cubed-sphere tiles (1-based numbering).
        num_subtiles: number of subtile (processor) files per tile,
            numbered from 0.

    Yields:
        Tuples ``(tile, proc, filename)`` where the filename has the form
        ``<prefix>.tile<tile>.nc.<proc:04d>``.
    """
    for tile in range(1, num_tiles + 1):
        for proc in range(num_subtiles):
            yield tile, proc, f'{prefix}.tile{tile:d}.nc.{proc:04d}'

#TODO test this
def remove_duplicate_coords(ds):
deduped_indices = {}
for dim in ds.dims:
Expand All @@ -42,22 +15,26 @@ def remove_duplicate_coords(ds):
return ds.isel(deduped_indices)


# TODO(Spencer): write a test of this function
def read_tile(prefix, tile, num_subtiles=16,
              pattern='{prefix}.tile{tile:d}.nc.{proc:04d}'):
    """Open and lazily combine all subtile files of one cubed-sphere tile.

    Args:
        prefix: path prefix shared by every file of the run.
        tile: 1-based tile number.
        num_subtiles: number of subtile (processor) files for this tile,
            numbered from 0.
        pattern: filename template formatted with ``prefix``, ``tile`` and
            ``proc``.  Exposed as a keyword argument so the filename layout
            is not a hard-coded string deep in the call stack.

    Returns:
        An xarray Dataset combining the subtiles by coordinates.
    """
    filenames = [
        pattern.format(prefix=prefix, tile=tile, proc=proc)
        for proc in range(num_subtiles)
    ]
    # data_vars='minimal' avoids broadcasting every variable to a common
    # set of dimensions when combining.
    return xr.open_mfdataset(
        filenames,
        data_vars='minimal',
        combine='by_coords'
    )


# Number of faces of the cubed sphere.
NUM_TILES = 6


# TODO(Spencer): write a test of this function
def open_cubed_sphere(prefix: str, **kwargs):
    """Open cubed-sphere data

    Args:
        prefix: the beginning part of the filename before the `.tile1.nc.0001`
            part
        **kwargs: forwarded to ``read_tile`` (e.g. ``num_subtiles`` or
            ``pattern``).

    Returns:
        An xarray Dataset with all six tiles concatenated along a ``tiles``
        dimension (1-based) and duplicate coordinate values removed.
    """
    # A named pandas Index both labels the concat dimension and supplies
    # the 1-based tile coordinate values.
    tile_index = pd.Index(range(1, NUM_TILES + 1), name='tiles')
    tiles = [read_tile(prefix, tile, **kwargs) for tile in tile_index]
    combined = xr.concat(tiles, dim=tile_index)
    # Subtile boundaries overlap, so drop repeated coordinate values.
    return remove_duplicate_coords(combined)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would be in favor of calling remove_duplicate_coords in read_tile since that is where the problem arises.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure thing -- I'll change this in my next commit.

Empty file removed src/data/cubesphere.py
Empty file.
42 changes: 42 additions & 0 deletions src/data/test_cubedsphere.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import pytest
import xarray as xr


from .cubedsphere import remove_duplicate_coords


@pytest.mark.parametrize(
    ('x', 'y', 'data', 'expected_x', 'expected_y', 'expected_data'),
    [
        ([1, 1], [3, 4], [[1, 2], [3, 4]], [1], [3, 4], [[1, 2]]),
        ([1, 2], [3, 3], [[1, 2], [3, 4]], [1, 2], [3], [[1], [3]]),
        ([1, 1], [3, 3], [[1, 2], [3, 4]], [1], [3], [[1]]),
        ([1, 2], [3, 4], [[1, 2], [3, 4]], [1, 2], [3, 4], [[1, 2], [3, 4]])
    ],
    ids=['duplicate x', 'duplicate y', 'duplicate x and y', 'no duplicates']
)
def test_remove_duplicate_coords(
    x, y, data, expected_x, expected_y, expected_data
):
    """remove_duplicate_coords keeps only the first occurrence of each
    duplicated coordinate value along every dimension, for both the
    DataArray and the Dataset case."""
    # Rebind the plain lists as self-labeled coordinate DataArrays so
    # `data` below carries x/y coordinates (with duplicates preserved).
    x = xr.DataArray(x, coords=[x], dims=['x'])
    y = xr.DataArray(y, coords=[y], dims=['y'])
    data = xr.DataArray(data, coords=[x, y], dims=['x', 'y'], name='foo')

    # Build the deduplicated DataArray we expect back, with the same name
    # so assert_identical compares names as well as values.
    expected_x = xr.DataArray(expected_x, coords=[expected_x], dims=['x'])
    expected_y = xr.DataArray(expected_y, coords=[expected_y], dims=['y'])
    expected = xr.DataArray(
        expected_data,
        coords=[expected_x, expected_y],
        dims=['x', 'y'],
        name='foo'
    )

    # Test the DataArray case
    result = remove_duplicate_coords(data)
    xr.testing.assert_identical(result, expected)

    # Test the Dataset case
    data = data.to_dataset()
    expected = expected.to_dataset()
    result = remove_duplicate_coords(data)
    xr.testing.assert_identical(result, expected)