Merged
45 commits
f231393
Initial version of appending to zarr store
Jan 24, 2019
f14f3b7
Added docs
Jan 24, 2019
928440d
Resolve PEP8 incompliances
Jan 24, 2019
442e938
Added write and append test for mode 'a'
Jan 26, 2019
389ba43
Resolved conflicts with master
Jan 26, 2019
6097da2
Merge branch 'master' of https://github.com/pydata/xarray into append…
Jan 29, 2019
390a792
Merged repaired master
Jan 29, 2019
da9a962
Resolved pep8 issue
Jan 29, 2019
6756b8f
Put target store encoding in appended variable
davidbrochart Apr 3, 2019
95d5782
Merge with master
davidbrochart Apr 4, 2019
a750a92
Rewrite test with appending along time dimension
davidbrochart Apr 4, 2019
295084b
Add chunk_size parameter for rechunking appended coordinate
davidbrochart Apr 22, 2019
cc353e1
Merge remote-tracking branch 'upstream/master' into HEAD
davidbrochart Apr 22, 2019
e56a210
Merge remote-tracking branch 'upstream/master' into HEAD
davidbrochart May 21, 2019
519b398
Add chunk_dim test
davidbrochart May 21, 2019
c85aa98
Merge remote-tracking branch 'upstream/master' into append_zarr
Jun 4, 2019
3adfd49
Add type check and tests for it.
Jun 17, 2019
608813b
Add documentation
Jun 17, 2019
2078838
Add test for compute=False and commented it out
Jun 17, 2019
7a90ce8
Merge master
Jun 17, 2019
b8af5bd
Remove python 3.7 string formatting
Jun 17, 2019
5bee0dc
Fix PEP8 incompliance
Jun 17, 2019
7ed77ad
Merge branch 'append_zarr' of https://github.com/jendrikjoe/xarray in…
Jun 17, 2019
b4ff1c7
Add missing whitespaces
Jun 18, 2019
5b3f8ea
allowed for compute=False when appending to a zarr store
Jun 20, 2019
7564329
Fixed empty array data error
Jun 20, 2019
93be790
flake8 fixes
Jun 20, 2019
58c4b78
removed chunk_dim argument to to_zarr function
Jun 21, 2019
5316593
implemented requested changes
Jun 24, 2019
ad08c73
Update xarray/backends/api.py
shikharsg Jun 25, 2019
4d2122b
added contributors and example of using append to zarr
Jun 25, 2019
105ed39
Merge branch 'append_zarr' of https://github.com/jendrikjoe/xarray in…
Jun 25, 2019
af4a5a5
fixed docs fail
Jun 25, 2019
62d4f52
fixed docs
Jun 25, 2019
9558811
Merge branch 'master' into append_zarr
shikharsg Jun 25, 2019
9d70e02
removed unnecessary condition
Jun 25, 2019
a6ff494
attempt at clean string encoding and variable length strings
Jun 26, 2019
34b700f
implemented suggestions
Jun 26, 2019
3e54cb9
* append_dim does not need to be specified if creating a new array wi…
Jun 27, 2019
97ed25b
Merge remote-tracking branch 'upstream/master' into append_zarr
Jun 27, 2019
beb12e5
raise ValueError when append_dim is not a valid dimension
Jun 27, 2019
41a6ca3
flake8 fix
Jun 27, 2019
321aec1
removed unused comment
Jun 27, 2019
2b130ff
* raise error when appending with encoding provided for existing vari…
Jun 29, 2019
58de86d
Merge remote-tracking branch 'upstream/master' into append_zarr
Jun 29, 2019
8 changes: 8 additions & 0 deletions doc/io.rst
Expand Up @@ -604,6 +604,14 @@ store is already present at that path, an error will be raised, preventing it
from being overwritten. To override this behavior and overwrite an existing
store, add ``mode='w'`` when invoking ``to_zarr``.

It is also possible to append to an existing store. For that, add ``mode='a'``
[Review comment — Contributor] Could we possibly get an example of using append here?

and set ``append_dim`` to the name of the dimension along which to append.
The data must contain only types that are subclasses of ``np.number`` or
``np.string_``. Note also that the size of the ``np.string_`` values in the
first chunk sets the maximum string size for all subsequent chunks. To encode
the data, consider using :py:func:`~xarray.core.api.encode_utf8`.

To read back a zarr dataset that has been created this way, we use the
:py:func:`~xarray.open_zarr` method:

Expand Down
1 change: 1 addition & 0 deletions doc/whats-new.rst
Expand Up @@ -188,6 +188,7 @@ Other enhancements
report showing what exactly differs between the two objects (dimensions /
coordinates / variables / attributes) (:issue:`1507`).
By `Benoit Bovy <https://github.com/benbovy>`_.
- Added append capability to the zarr store.
[Review comment — Contributor] Need to credit all the contributors to this PR.

- Add ``tolerance`` option to ``resample()`` methods ``bfill``, ``pad``,
``nearest``. (:issue:`2695`)
By `Hauke Schulz <https://github.com/observingClouds>`_.
Expand Down
34 changes: 32 additions & 2 deletions xarray/backends/api.py
Expand Up @@ -4,16 +4,21 @@
from io import BytesIO
from numbers import Number
from pathlib import Path
import re

import numpy as np
import pandas as pd

from .. import Dataset, backends, conventions
from ..core import indexing
from ..core.combine import (
_CONCAT_DIM_DEFAULT, _auto_combine, _infer_concat_order_from_positions)
from ..core.utils import close_on_error, is_grib_path, is_remote_uri
from ..core.variable import Variable
from .common import ArrayWriter
from .locks import _get_scheduler
from ..coding.variables import safe_setitem, unpack_for_encoding
from ..coding.strings import encode_string_array

DATAARRAY_NAME = '__xarray_dataarray_name__'
DATAARRAY_VARIABLE = '__xarray_dataarray_variable__'
Expand Down Expand Up @@ -1003,8 +1008,30 @@ def save_mfdataset(datasets, paths, mode='w', format=None, groups=None,
for w, s in zip(writes, stores)])


def encode_utf8(var, string_max_length):
[Review comment — Member] I don't love the idea of publicly exposing this helper function. I understand the motivation for it, but it's inconsistent with how string encoding works in the rest of xarray, via the encoding keyword argument or variable attribute. The implementation itself is also not so user friendly: it stores strings as np.string_/bytes (which is necessary), but that data doesn't get read back into a NumPy array that is compatible with Python's str type.

It would be better to support explicitly setting something like dtype='S123' in encoding to indicate a desired fixed width, and to continue doing string encoding along the lines of encode_zarr_variable().

For now, perhaps it is better to leave this part out...
dims, data, attrs, encoding = unpack_for_encoding(var)
missing = pd.isnull(data)
data[missing] = ""
data = encode_string_array(data, 'utf-8')
data = data.astype(np.dtype("S{}".format(string_max_length * 2)))
return Variable(dims, data, attrs, encoding)
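What the helper above does can be sketched without the xarray internals; the doubling of the width is the same heuristic used here, and the input values are illustrative:

```python
import numpy as np
import pandas as pd

# Fill missing values, encode each string to UTF-8 bytes, then cast to a
# fixed-width bytestring dtype (2x the character length, leaving room for
# multi-byte UTF-8 sequences).
data = np.array(["café", None, "tea"], dtype=object)
string_max_length = 4
data[pd.isnull(data)] = ""
encoded = np.array([s.encode("utf-8") for s in data])
fixed = encoded.astype(np.dtype("S{}".format(string_max_length * 2)))
print(fixed.dtype)  # |S8
```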


def _validate_datatypes_for_zarr_append(dataset):
"""DataArray.name and Dataset keys must be a string or None"""
def check_dtype(var):
if (not np.issubdtype(var.dtype, np.number)
and not np.issubdtype(var.dtype, np.string_)):
# and not re.match('^bytes[1-9]+$', var.dtype.name)):
raise ValueError('Invalid dtype for DataVariable: {} '
'dtype must be a subtype of number or '
'a fixed sized string'.format(var))
for k in dataset.data_vars.values():
check_dtype(k)
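The dtype rule enforced above can be exercised in isolation (a sketch mirroring the check, not the function itself; note that `np.string_` is the NumPy 1.x alias for `np.bytes_`):

```python
import numpy as np

# np.string_ was removed in NumPy 2.0; it was an alias for np.bytes_.
STRING_DTYPE = getattr(np, "string_", np.bytes_)

def is_appendable(dtype):
    # Mirrors _validate_datatypes_for_zarr_append: only numeric and
    # fixed-width bytestring dtypes may be appended to a zarr store.
    return (np.issubdtype(dtype, np.number)
            or np.issubdtype(dtype, STRING_DTYPE))

print(is_appendable(np.dtype("float64")))  # True
print(is_appendable(np.dtype("S10")))      # True  (fixed-width bytes)
print(is_appendable(np.dtype("U10")))      # False (unicode is rejected)
print(is_appendable(np.dtype("O")))        # False (object arrays rejected)
```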


def to_zarr(dataset, store=None, mode='w-', synchronizer=None, group=None,
encoding=None, compute=True, consolidated=False):
encoding=None, compute=True, consolidated=False, append_dim=None):
"""This function creates an appropriate datastore for writing a dataset to
a zarr store

Expand All @@ -1019,11 +1046,14 @@ def to_zarr(dataset, store=None, mode='w-', synchronizer=None, group=None,
_validate_dataset_names(dataset)
_validate_attrs(dataset)

if mode == "a":
[Review comment — Member] Can we raise an error if encoding was passed explicitly into to_zarr() and specifies encoding for any existing variables? I.e., ds.to_zarr(..., mode='a', encoding={'existing_variable': ....}). I think this would always indicate a programming mistake, given that we throw away these variable encodings anyways.

_validate_datatypes_for_zarr_append(dataset)

zstore = backends.ZarrStore.open_group(store=store, mode=mode,
synchronizer=synchronizer,
group=group,
consolidate_on_close=consolidated)

zstore.append_dim = append_dim
writer = ArrayWriter()
# TODO: figure out how to properly handle unlimited_dims
dump_to_store(dataset, zstore, writer, encoding=encoding)
Expand Down
17 changes: 14 additions & 3 deletions xarray/backends/common.py
Expand Up @@ -158,24 +158,35 @@ class ArrayWriter:
def __init__(self, lock=None):
self.sources = []
self.targets = []
self.regions = []
self.lock = lock

def add(self, source, target):
def add(self, source, target, region=None):
if isinstance(source, dask_array_type):
self.sources.append(source)
self.targets.append(target)
if region:
self.regions.append(region)
[Review comment — Member] It would be nice to be able to rule out the possibility of these lists getting out of sync by adding some arrays with a region and others without (e.g., if some variables already exist and some don't). This suggests that it would be better to always append a region, even if it's None.

else:
target[...] = source
if region:
target[region] = source
else:
target[...] = source
[Review comment — Contributor] 👍 This seems like a perfect (and rather general) way to handle appends at the dask level.


def sync(self, compute=True):
if self.sources:
import dask.array as da
# TODO: consider wrapping targets with dask.delayed, if this makes
            # for any discernible difference in performance, e.g.,
# targets = [dask.delayed(t) for t in self.targets]

if not self.regions:
regions = None
else:
regions = self.regions
delayed_store = da.store(self.sources, self.targets,
lock=self.lock, compute=compute,
flush=True)
flush=True, regions=regions)
[Review comment — Member] self.regions is always non-empty, so you should probably just use it directly (and delete the conditional above):

Suggested change:
-                                     flush=True, regions=regions)
+                                     flush=True, regions=self.regions)

self.sources = []
self.targets = []
[Review comment — Member] can we reset self.regions = [] for consistency?

return delayed_store
Expand Down
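The region-based write that `ArrayWriter.sync` delegates to `da.store` can be sketched with dask alone; a plain NumPy array stands in for the resized zarr array, and the shapes are illustrative:

```python
import dask.array as da
import numpy as np

# Pretend this is the on-disk array after zarr_array.resize(): the first
# 4 values already exist, the last 4 are the freshly allocated region.
target = np.zeros(8)
target[:4] = np.arange(4.0)

# Write the appended block only into the new region, as the writer does.
source = da.from_array(np.arange(4.0, 8.0), chunks=(4,))
da.store([source], [target], regions=[(slice(4, None),)], lock=False)

print(target.tolist())  # [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
```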
149 changes: 118 additions & 31 deletions xarray/backends/zarr.py
Expand Up @@ -8,7 +8,8 @@
from ..core import indexing
from ..core.pycompat import integer_types
from ..core.utils import FrozenOrderedDict, HiddenKeyDict
from .common import AbstractWritableDataStore, BackendArray
from .common import AbstractWritableDataStore, BackendArray, \
_encode_variable_name

# need some special secret attributes to tell us the dimensions
_DIMENSION_KEY = '_ARRAY_DIMENSIONS'
Expand Down Expand Up @@ -257,6 +258,7 @@ def __init__(self, zarr_group, consolidate_on_close=False):
self._synchronizer = self.ds.synchronizer
self._group = self.ds.path
self._consolidate_on_close = consolidate_on_close
self.append_dim = None

def open_store_variable(self, name, zarr_array):
data = indexing.LazilyOuterIndexedArray(ZarrArrayWrapper(name, self))
Expand Down Expand Up @@ -313,40 +315,125 @@ def encode_variable(self, variable):
def encode_attribute(self, a):
return _encode_zarr_attr_value(a)

def prepare_variable(self, name, variable, check_encoding=False,
unlimited_dims=None):

attrs = variable.attrs.copy()
dims = variable.dims
dtype = variable.dtype
shape = variable.shape

fill_value = attrs.pop('_FillValue', None)
if variable.encoding == {'_FillValue': None} and fill_value is None:
variable.encoding = {}

encoding = _extract_zarr_variable_encoding(
variable, raise_on_invalid=check_encoding)

encoded_attrs = OrderedDict()
# the magic for storing the hidden dimension data
encoded_attrs[_DIMENSION_KEY] = dims
for k, v in attrs.items():
encoded_attrs[k] = self.encode_attribute(v)

zarr_array = self.ds.create(name, shape=shape, dtype=dtype,
fill_value=fill_value, **encoding)
zarr_array.attrs.put(encoded_attrs)

return zarr_array, variable.data

def store(self, variables, attributes, *args, **kwargs):
AbstractWritableDataStore.store(self, variables, attributes,
*args, **kwargs)
def store(self, variables, attributes, check_encoding_set=frozenset(),
writer=None, unlimited_dims=None):
"""
Top level method for putting data on this store, this method:
- encodes variables/attributes
- sets dimensions
- sets variables

Parameters
----------
variables : dict-like
Dictionary of key/value (variable name / xr.Variable) pairs
attributes : dict-like
Dictionary of key/value (attribute name / attribute) pairs
check_encoding_set : list-like
List of variables that should be checked for invalid encoding
values
writer : ArrayWriter
unlimited_dims : list-like
List of dimension names that should be treated as unlimited
dimensions.
"""

existing_variables = set([vn for vn in variables
if _encode_variable_name(vn) in self.ds])
new_variables = set(variables) - existing_variables
variables_without_encoding = OrderedDict([(vn, variables[vn])
for vn in new_variables])
variables_encoded, attributes = self.encode(
variables_without_encoding, attributes)

if len(existing_variables) > 0:
# there are variables to append
# their encoding must be the same as in the store
[Review comment — Member] Are there any unit tests that verify that encoding is kept consistent? This would be nice to add, if not.

Probably a good example would be a dataset saved with scale/offset encoding, where the new dataset to be appended does not have any encoding provided. We could verify that properly scaled values are read back from disk.

ds = open_zarr(self.ds.store, auto_chunk=False)
variables_with_encoding = OrderedDict()
for vn in existing_variables:
variables_with_encoding[vn] = variables[vn]
variables_with_encoding[vn].encoding = ds[vn].encoding
[Review comment — Member] This modifies an argument that was passed into the function in-place, which in general should be avoided due to unexpected side effects in other parts of the code. It would be better to shallow copy the Variable before overriding encoding, e.g., with variables[vn].copy(deep=False).

variables_with_encoding, _ = self.encode(variables_with_encoding,
{})
variables_encoded.update(variables_with_encoding)

self.set_attributes(attributes)
self.set_dimensions(variables_encoded, unlimited_dims=unlimited_dims)
self.set_variables(variables_encoded, check_encoding_set, writer,
unlimited_dims=unlimited_dims)

def sync(self):
pass

def set_variables(self, variables, check_encoding_set, writer,
unlimited_dims=None):
"""
This provides a centralized method to set the variables on the data
store.

Parameters
----------
variables : dict-like
Dictionary of key/value (variable name / xr.Variable) pairs
check_encoding_set : list-like
List of variables that should be checked for invalid encoding
values
writer :
unlimited_dims : list-like
List of dimension names that should be treated as unlimited
dimensions.
"""
for vn, v in variables.items():
name = _encode_variable_name(vn)
check = vn in check_encoding_set
attrs = v.attrs.copy()
dims = v.dims
dtype = v.dtype
shape = v.shape

fill_value = attrs.pop('_FillValue', None)
if v.encoding == {'_FillValue': None} and fill_value is None:
v.encoding = {}
if name in self.ds:
# append to existing variable
zarr_array = self.ds[name]
if self.append_dim is None:
raise ValueError(
'dimension being appended is unknown; '
'did you forget to call to_zarr with append_dim '
'argument?')
[Review comment — Member] It would be nice to add a little more context here, e.g., "variable X already exists, but append_dim was not set".

if self.append_dim in dims:
# this is the DataArray that has append_dim as a
# dimension
append_axis = dims.index(self.append_dim)
new_shape = list(zarr_array.shape)
new_shape[append_axis] += v.shape[append_axis]
new_region = [slice(None)] * len(new_shape)
new_region[append_axis] = slice(
zarr_array.shape[append_axis],
None
)
zarr_array.resize(new_shape)
writer.add(v.data, zarr_array,
region=tuple(new_region))
else:
# new variable
encoding = _extract_zarr_variable_encoding(
v, raise_on_invalid=check)
encoded_attrs = OrderedDict()
# the magic for storing the hidden dimension data
encoded_attrs[_DIMENSION_KEY] = dims
for k2, v2 in attrs.items():
encoded_attrs[k2] = self.encode_attribute(v2)
[Review comment — Contributor] What if we pulled this attribute encoding out before the try block. Then we check encoded_attrs against zarr_array.attrs before appending.

zarr_array = self.ds.create(name, shape=shape, dtype=dtype,
fill_value=fill_value, **encoding)
zarr_array.attrs.put(encoded_attrs)
writer.add(v.data, zarr_array)

def close(self):
if self._consolidate_on_close:
import zarr
Expand Down
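The resize-and-region arithmetic in the append branch of `set_variables` can be checked on plain shapes (a standalone sketch with illustrative values, not the method itself):

```python
def grow_for_append(existing_shape, append_shape, dims, append_dim):
    # Mirrors the append branch of ZarrStore.set_variables: extend the
    # target along append_dim and compute the region the new data fills.
    append_axis = dims.index(append_dim)
    new_shape = list(existing_shape)
    new_shape[append_axis] += append_shape[append_axis]
    region = [slice(None)] * len(new_shape)
    region[append_axis] = slice(existing_shape[append_axis], None)
    return tuple(new_shape), tuple(region)

# Appending 3 new time steps to a (10, 4) array along 'time':
shape, region = grow_for_append((10, 4), (3, 4), ("time", "x"), "time")
print(shape)   # (13, 4)
print(region)  # (slice(10, None, None), slice(None, None, None))
```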
16 changes: 10 additions & 6 deletions xarray/core/dataset.py
Expand Up @@ -1322,7 +1322,8 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None,
compute=compute)

def to_zarr(self, store=None, mode='w-', synchronizer=None, group=None,
encoding=None, compute=True, consolidated=False):
encoding=None, compute=True, consolidated=False,
append_dim=None):
"""Write dataset contents to a zarr group.

.. note:: Experimental
Expand All @@ -1333,9 +1334,10 @@ def to_zarr(self, store=None, mode='w-', synchronizer=None, group=None,
----------
store : MutableMapping or str, optional
Store or path to directory in file system.
mode : {'w', 'w-'}
mode : {'w', 'w-', 'a'}
Persistence mode: 'w' means create (overwrite if exists);
'w-' means create (fail if exists).
'w-' means create (fail if exists);
'a' means append (create if does not exist).
synchronizer : object, optional
Array synchronizer
        group : str, optional
Expand All @@ -1350,21 +1352,23 @@ def to_zarr(self, store=None, mode='w-', synchronizer=None, group=None,
consolidated: bool, optional
If True, apply zarr's `consolidate_metadata` function to the store
after writing.
append_dim: str, optional
If mode='a', the dimension on which the data will be appended.

References
----------
https://zarr.readthedocs.io/
"""
if encoding is None:
encoding = {}
if mode not in ['w', 'w-']:
# TODO: figure out how to handle 'r+' and 'a'
if mode not in ['w', 'w-', 'a']:
# TODO: figure out how to handle 'r+'
raise ValueError("The only supported options for mode are 'w' "
"and 'w-'.")
[Review comment — Member] and 'a' now!

from ..backends.api import to_zarr
return to_zarr(self, store=store, mode=mode, synchronizer=synchronizer,
group=group, encoding=encoding, compute=compute,
consolidated=consolidated)
consolidated=consolidated, append_dim=append_dim)

def __repr__(self):
return formatting.dataset_repr(self)
Expand Down