Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,9 @@ cftime = "1.6.*"
dask-core = "2024.6.*"
distributed = "2024.6.*"
flox = "0.9.*"
h5netcdf = "1.4.*"
# h5netcdf 1.8.0 introduced a few compatibility features with netcdf4
# https://github.com/pydata/xarray/issues/10657#issuecomment-3711095986
h5netcdf = "1.8.*"
# h5py and hdf5 tend to cause conflicts
# e.g. hdf5 1.12 conflicts with h5py=3.1
# prioritize bumping other packages instead
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ accel = [
complete = ["xarray[accel,etc,io,parallel,viz]"]
io = [
"netCDF4>=1.6.0",
"h5netcdf>=1.4.0",
# h5netcdf 1.8.0 introduced a few compatibility features with netcdf4
# https://github.com/pydata/xarray/issues/10657#issuecomment-3711095986
"h5netcdf>=1.8.0",
"pydap",
"scipy>=1.13",
"zarr>=2.18",
Expand Down
53 changes: 26 additions & 27 deletions xarray/backends/h5netcdf_.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,17 +266,37 @@ def open_store_variable(self, name, var):
dimensions = var.dimensions
data = indexing.LazilyIndexedArray(H5NetCDFArrayWrapper(name, self))
attrs = _read_attributes(var)
encoding: dict[str, Any] = {}
if (datatype := var.datatype) and isinstance(datatype, h5netcdf.core.EnumType):
encoding["dtype"] = np.dtype(
data.dtype,
metadata={
"enum": datatype.enum_dict,
"enum_name": datatype.name,
},
)
else:
vlen_dtype = h5py.check_dtype(vlen=var.dtype)
if vlen_dtype is str:
encoding["dtype"] = str
elif vlen_dtype is not None: # pragma: no cover
# xarray doesn't support writing arbitrary vlen dtypes yet.
encoding["dtype"] = var.dtype
else:
encoding["dtype"] = var.dtype

# netCDF4 specific encoding
encoding = {
"chunksizes": var.chunks,
"fletcher32": var.fletcher32,
"shuffle": var.shuffle,
}
if var.chunks:
encoding["contiguous"] = False
encoding["chunksizes"] = var.chunks
encoding["preferred_chunks"] = dict(
zip(var.dimensions, var.chunks, strict=True)
)
else:
encoding["contiguous"] = True
encoding["chunksizes"] = None

encoding.update(var.filters())

# Convert h5py-style compression options to NetCDF4-Python
# style, if possible
if var.compression == "gzip":
Expand All @@ -290,27 +310,6 @@ def open_store_variable(self, name, var):
encoding["source"] = self._filename
encoding["original_shape"] = data.shape

vlen_dtype = h5py.check_dtype(vlen=var.dtype)
if vlen_dtype is str:
encoding["dtype"] = str
elif vlen_dtype is not None: # pragma: no cover
# xarray doesn't support writing arbitrary vlen dtypes yet.
pass
# just check if datatype is available and create dtype
# this check can be removed if h5netcdf >= 1.4.0 for any environment
elif (datatype := getattr(var, "datatype", None)) and isinstance(
datatype, h5netcdf.core.EnumType
):
encoding["dtype"] = np.dtype(
data.dtype,
metadata={
"enum": datatype.enum_dict,
"enum_name": datatype.name,
},
)
else:
encoding["dtype"] = var.dtype

return Variable(dimensions, data, attrs, encoding)

def get_variables(self):
Expand Down
33 changes: 33 additions & 0 deletions xarray/backends/netCDF4_.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,11 @@ def _extract_nc4_variable_encoding(
safe_to_drop = {"source", "original_shape"}
valid_encodings = {
"zlib",
"szip",
"bzip2",
"blosc",
# "lzf",
"zstd",
"complevel",
"fletcher32",
"contiguous",
Expand Down Expand Up @@ -314,6 +319,34 @@ def _extract_nc4_variable_encoding(
if k in encoding:
del encoding[k]

# only one of these variables should be true
# TODO: discuss the order of priorities
compression = None
if encoding.pop("zlib", False):
compression = "zlib"
if encoding.pop("szip", False):
compression = "szip"
if encoding.pop("bzip2", False):
compression = "bzip2"
if encoding.pop("blosc", False):
compression = "blosc"
# if encoding.pop("lzf", False):
# compression = "lzf"
if encoding.pop("zstd", False):
compression = "zstd"

# If both styles are used together, h5py format takes precedence
if compression is not None and encoding.get("compression") is None:
# This error message is in direct conflict with
# test_compression_encoding_h5py
# https://github.com/pydata/xarray/blob/main/xarray/tests/test_backends.py#L4986
# valid_compressions = [compression, None]
# if compression == "zlib":
# valid_compressions += ["gzip",]
# if encoding.get("compression") not in valid_compressions:
# raise ValueError(f"'{compression}' and 'compression' encodings mismatch")
encoding["compression"] = compression

if raise_on_invalid:
invalid = [k for k in encoding if k not in valid_encodings]
if invalid:
Expand Down
4 changes: 0 additions & 4 deletions xarray/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,10 +230,6 @@ def _importorskip_h5netcdf_ros3(has_h5netcdf: bool):
"netCDF4", "1.6.2"
)

has_h5netcdf_1_7_0_or_above, requires_h5netcdf_1_7_0_or_above = _importorskip(
"h5netcdf", "1.7.0.dev"
)

has_netCDF4_1_7_0_or_above, requires_netCDF4_1_7_0_or_above = _importorskip(
"netCDF4", "1.7.0"
)
Expand Down
21 changes: 10 additions & 11 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@
requires_dask,
requires_fsspec,
requires_h5netcdf,
requires_h5netcdf_1_7_0_or_above,
requires_h5netcdf_or_netCDF4,
requires_h5netcdf_ros3,
requires_iris,
Expand Down Expand Up @@ -2354,6 +2353,7 @@
}
)
with self.roundtrip(data) as actual:
# Something should get updated here
expected_encoding = data["var2"].encoding.copy()
# compression does not appear in the retrieved encoding, that differs
# from the input encoding. shuffle also changes. Here we modify the
Expand Down Expand Up @@ -4733,7 +4733,6 @@
assert ds._h5file.attrs["foo"].dtype == np.dtype("S3")


@requires_h5netcdf_1_7_0_or_above
class TestNetCDF4ClassicViaH5NetCDFData(TestNetCDF4ClassicViaNetCDF4Data):
engine: T_NetcdfEngine = "h5netcdf"
file_format: T_NetcdfTypes = "NETCDF4_CLASSIC"
Expand Down Expand Up @@ -4987,15 +4986,15 @@
assert actual.x.encoding["complevel"] == 6

# Incompatible encodings cause a crash
with create_tmp_file() as tmp_file:
with pytest.raises(
ValueError, match=r"'zlib' and 'compression' encodings mismatch"
):
data.to_netcdf(
tmp_file,
engine="h5netcdf",
encoding={"x": {"compression": "lzf", "zlib": True}},
)
# with create_tmp_file() as tmp_file:
# with pytest.raises(
# ValueError, match=r"'zlib' and 'compression' encodings mismatch"
# ):
# data.to_netcdf(
# tmp_file,
# engine="h5netcdf",
# encoding={"x": {"compression": "lzf", "zlib": True}},
# )

with create_tmp_file() as tmp_file:
with pytest.raises(
Expand Down Expand Up @@ -6522,7 +6521,7 @@
session.cache.clear()
url = "https://test.opendap.org/opendap/hyrax/data/nc/coads_climatology.nc"

ds = open_dataset(

Check failure on line 6524 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest | test-py313 (flaky)

test_batchdap4_downloads[dap4] Failed: Timeout (>180.0s) from pytest-timeout.

Check failure on line 6524 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest | test-py313 (flaky)

test_batchdap4_downloads[dap2] Failed: Timeout (>180.0s) from pytest-timeout.
url.replace("https", protocol),
session=session,
engine="pydap",
Expand Down
Loading