2 changes: 1 addition & 1 deletion Snakefile
@@ -87,7 +87,7 @@ rule prepare_restart_directory:
         ('fv_core.res', open_cubed_sphere(params.core)),
         ('fv_srf_wnd.res', open_cubed_sphere(params.srf_wnd)),
         ('sfc_data', xr.open_dataset(input.sfc_data)),
-        ('grid_spec', xr.open_mfdataset(sorted(input.grid_spec), concat_dim='tiles'))
+        ('grid_spec', xr.open_mfdataset(sorted(input.grid_spec), concat_dim='tile'))
     ]

     make_experiment(
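Note: the renamed dimension shows up downstream wherever these grid files are opened. A minimal sketch, assuming six hypothetical per-tile grid_spec files (the filenames are illustrative, not from this repository):

```python
import xarray as xr

# Hypothetical per-tile grid files; combine='nested' is required alongside
# concat_dim in recent xarray versions.
paths = sorted(f"grid_spec.tile{n}.nc" for n in range(1, 7))
grid_spec = xr.open_mfdataset(paths, concat_dim="tile", combine="nested")
grid_spec.isel(tile=0)  # select the first tile along the renamed dimension
```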
73 changes: 36 additions & 37 deletions src/data/cubedsphere.py
@@ -1,39 +1,43 @@
"""Tools for working with cubedsphere data"""
import xarray as xr
from itertools import product
from toolz import groupby
import numpy as np


-#TODO write a test for this method
-def combine_subtiles(tiles):
-    """Combine subtiles of a cubed-sphere dataset
-
-    In v.12 of xarray, combined_by_coords behaves differently with DataArrays
-    and Datasets. It was broadcasting all the variables to a common set of
-    dimensions, dramatically increasing the size of the dataset.
+NUM_TILES = 6
+SUBTILE_FILE_PATTERN = '{prefix}.tile{tile:d}.nc.{subtile:04d}'
+
+
+# TODO: write a test for this method
+def combine_subtiles(subtiles):
+    """Combine subtiles of a cubed-sphere dataset
+
+    In v.12 of xarray, with data_vars='all' by default, combine_by_coords
+    broadcasts all the variables to a common set of dimensions, dramatically
+    increasing the size of the dataset. In some cases this can be avoided by
+    using data_vars='minimal'; however, it then fails when combining data
+    variables that contain missing values, and it is also slow because it
+    performs equality checks between the variables it combines.
 
     To work around this issue, we combine the data variables of the dataset one
     at a time with the default data_vars='all', and then merge them back
     together.
     """
-    data_vars = list(tiles[0])
+    sample_subtile = subtiles[0]
     output_vars = []
-    for key in data_vars:
-        # the to_dataset is needed to avoid a strange error
-        tiles_for_var = [tile[key].to_dataset() for tile in tiles]
-        combined = xr.combine_by_coords(tiles_for_var)
+    for key in sample_subtile:
+        # combine_by_coords currently does not work on DataArrays; see
+        # https://github.com/pydata/xarray/issues/3248.
+        subtiles_for_var = [subtile[key].to_dataset() for subtile in subtiles]
+        combined = xr.combine_by_coords(subtiles_for_var)
         output_vars.append(combined)
     return xr.merge(output_vars)
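To illustrate the workaround, here is a minimal self-contained sketch; the toy data and the `make_subtile` helper are hypothetical, not part of the repository. Four subtiles tiling a 4x4 grid are combined variable-by-variable and merged back into one dataset:

```python
import numpy as np
import xarray as xr

def make_subtile(x0, y0):
    # Hypothetical 2x2 subtile with monotonic x/y coordinates.
    return xr.Dataset(
        {'t': (('x', 'y'), np.random.rand(2, 2))},
        coords={'x': [x0, x0 + 1], 'y': [y0, y0 + 1]},
    )

subtiles = [make_subtile(x0, y0) for x0 in (0, 2) for y0 in (0, 2)]
combined = combine_subtiles(subtiles)
assert combined['t'].shape == (4, 4)  # the 2x2 layout is reassembled
```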


-def file_names(prefix, num_tiles=6, num_subtiles=16):
-    tiles = list(range(1, num_tiles + 1))
-    subtiles = list(range(num_subtiles))
-
-    for (tile, proc) in product(tiles, subtiles):
-        filename = prefix + f'.tile{tile:d}.nc.{proc:04d}'
-        yield tile, proc, filename
+def subtile_filenames(prefix, tile, pattern=SUBTILE_FILE_PATTERN,
+                      num_subtiles=16):
+    for subtile in range(num_subtiles):
+        yield pattern.format(prefix=prefix, tile=tile, subtile=subtile)
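A quick usage sketch of the generator above (the prefix and subtile count are illustrative):

```python
list(subtile_filenames('fv_core.res', tile=1, num_subtiles=2))
# -> ['fv_core.res.tile1.nc.0000', 'fv_core.res.tile1.nc.0001']
```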

 #TODO test this
 def remove_duplicate_coords(ds):
     deduped_indices = {}
     for dim in ds.dims:
@@ -45,19 +49,14 @@ def remove_duplicate_coords(ds):
 def open_cubed_sphere(prefix: str, **kwargs):
     """Open cubed-sphere data
 
-    Args:
-        prefix: the beginning part of the filename before the `.tile1.nc.0001`
-            part
-    """
-    files = file_names(prefix, **kwargs)
-    datasets = ((tile, xr.open_dataset(path, chunks={}))
-                for tile, proc, path in files)
-    tiles = groupby(lambda x: x[0], datasets)
-    tiles = {tile: [data for tile, data in values]
-             for tile, values in tiles.items()}
-
-    combined_tiles = [combine_subtiles(datasets).assign_coords(tiles=tile)
-                      for tile, datasets in tiles.items()]
-
-    data = xr.concat(combined_tiles, dim='tiles').sortby('tiles')
-    return remove_duplicate_coords(data)
+    Args:
+        prefix: the beginning part of the filename before the `.tile1.nc.0001`
+            part
+    """
+    tiles = []
+    for tile in range(1, NUM_TILES + 1):
+        files = subtile_filenames(prefix, tile, **kwargs)
+        subtiles = [xr.open_dataset(file, chunks={}) for file in files]
+        combined = combine_subtiles(subtiles).assign_coords(tile=tile)
+        tiles.append(remove_duplicate_coords(combined))
+    return xr.concat(tiles, dim='tile')
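A hypothetical usage sketch of `open_cubed_sphere`, assuming restart pieces named like `fv_core.res.tile1.nc.0000` exist on disk:

```python
ds = open_cubed_sphere('fv_core.res')
assert ds.sizes['tile'] == 6
tile3 = ds.sel(tile=3)  # 'tile' is a coordinate with values 1..6
```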
Empty file removed src/data/cubesphere.py
42 changes: 42 additions & 0 deletions src/data/test_cubedsphere.py
@@ -0,0 +1,42 @@
+import pytest
+import xarray as xr
+
+
+from .cubedsphere import remove_duplicate_coords
+
+
+@pytest.mark.parametrize(
+    ('x', 'y', 'data', 'expected_x', 'expected_y', 'expected_data'),
+    [
+        ([1, 1], [3, 4], [[1, 2], [3, 4]], [1], [3, 4], [[1, 2]]),
+        ([1, 2], [3, 3], [[1, 2], [3, 4]], [1, 2], [3], [[1], [3]]),
+        ([1, 1], [3, 3], [[1, 2], [3, 4]], [1], [3], [[1]]),
+        ([1, 2], [3, 4], [[1, 2], [3, 4]], [1, 2], [3, 4], [[1, 2], [3, 4]])
+    ],
+    ids=['duplicate x', 'duplicate y', 'duplicate x and y', 'no duplicates']
+)
+def test_remove_duplicate_coords(
+    x, y, data, expected_x, expected_y, expected_data
+):
+    x = xr.DataArray(x, coords=[x], dims=['x'])
+    y = xr.DataArray(y, coords=[y], dims=['y'])
+    data = xr.DataArray(data, coords=[x, y], dims=['x', 'y'], name='foo')
+
+    expected_x = xr.DataArray(expected_x, coords=[expected_x], dims=['x'])
+    expected_y = xr.DataArray(expected_y, coords=[expected_y], dims=['y'])
+    expected = xr.DataArray(
+        expected_data,
+        coords=[expected_x, expected_y],
+        dims=['x', 'y'],
+        name='foo'
+    )
+
+    # Test the DataArray case
+    result = remove_duplicate_coords(data)
+    xr.testing.assert_identical(result, expected)
+
+    # Test the Dataset case
+    data = data.to_dataset()
+    expected = expected.to_dataset()
+    result = remove_duplicate_coords(data)
+    xr.testing.assert_identical(result, expected)
4 changes: 2 additions & 2 deletions src/fv3/coarsen.py
@@ -73,7 +73,7 @@ def _combine_subtiles(tiles):
 def combine_subtiles(args_list):
     tile, args_list = args_list
     subtiles = [arg[1] for arg in args_list]
-    return tile, _combine_subtiles(subtiles).assign(tiles=tile)
+    return tile, _combine_subtiles(subtiles).assign(tile=tile)
 
 
 def tile(args):
@@ -115,4 +115,4 @@ def process(args):
         .map(lambda x: x[1])
     )
 
-    return xr.concat(procs.compute(), dim='tiles')
+    return xr.concat(procs.compute(), dim='tile')
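For reference, a tiny hypothetical sketch of the concatenation pattern used above: per-tile datasets each carry a scalar tile label and are stacked along the renamed 'tile' dimension (the toy data is illustrative; a coordinate is attached here, analogous to the `assign(tile=tile)` call above):

```python
import xarray as xr

# Hypothetical per-tile datasets with a scalar 'tile' coordinate each.
tiles = [
    xr.Dataset({'t': ('x', [float(i)])}).assign_coords(tile=i)
    for i in range(1, 7)
]
ds = xr.concat(tiles, dim='tile')
assert list(ds['tile'].values) == [1, 2, 3, 4, 5, 6]
```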
2 changes: 1 addition & 1 deletion src/fv3/docker.py
@@ -40,7 +40,7 @@ def save_tiles_separately(sfc_data, prefix, output_directory):
     for i in range(6):
         output_path = join(output_directory, f"{prefix}.tile{i+1}.nc")
         logging.info(f"saving data to {output_path}")
-        sfc_data.isel(tiles=i).to_netcdf(output_path)
+        sfc_data.isel(tile=i).to_netcdf(output_path)
 
 
 def rundir(directory):
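A hypothetical usage sketch of the function above, assuming a dataset with a six-element 'tile' dimension (the variable name, prefix, and output path are illustrative):

```python
import numpy as np
import xarray as xr

# Toy stand-in for surface data on six tiles.
sfc_data = xr.Dataset({'slmsk': (('tile', 'x', 'y'), np.zeros((6, 4, 4)))})
save_tiles_separately(sfc_data, 'sfc_data', output_directory='/tmp')
# writes /tmp/sfc_data.tile1.nc ... /tmp/sfc_data.tile6.nc
```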