From 740710cae748a8bc5f6bf55fb3ee6d85b311d945 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Thu, 6 Nov 2025 14:35:49 +0100 Subject: [PATCH 1/4] WIP: allow uploading of nifti --- src/datasets/features/nifti.py | 60 +++++++++++++++++++++++++++++++++- src/datasets/features/pdf.py | 12 +++++++ tests/features/test_nifti.py | 22 +++++++++++++ 3 files changed, 93 insertions(+), 1 deletion(-) diff --git a/src/datasets/features/nifti.py b/src/datasets/features/nifti.py index bac91e2af4b..f9a7112c087 100644 --- a/src/datasets/features/nifti.py +++ b/src/datasets/features/nifti.py @@ -10,7 +10,7 @@ from ..download.download_config import DownloadConfig from ..table import array_cast from ..utils.file_utils import is_local_path, xopen -from ..utils.py_utils import string_to_dict +from ..utils.py_utils import no_op_if_value_is_null, string_to_dict if TYPE_CHECKING: @@ -81,6 +81,9 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "nib.Nifti1Im else: nib = None + import pdb + + pdb.set_trace() if isinstance(value, str): return {"path": value, "bytes": None} elif isinstance(value, Path): @@ -137,10 +140,16 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif token_per_repo_id = {} path, bytes_ = value["path"], value["bytes"] + import pdb + + pdb.set_trace() if bytes_ is None: if path is None: raise ValueError(f"A nifti should have one of 'path' or 'bytes' but both are None in {value}.") else: + import pdb + + pdb.set_trace() if is_local_path(path): nifti = nib.load(path) else: @@ -172,6 +181,55 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif return nifti + def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.StructArray: + """Embed NifTI files into the Arrow array. + + Args: + storage (`pa.StructArray`): + PyArrow array to embed. + + Returns: + `pa.StructArray`: Array in the NifTI arrow storage type, that is + `pa.struct({"bytes": pa.binary(), "path": pa.string()})`. + """ + if config.NIBABEL_AVAILABLE: + import nibabel as nib + else: + raise ImportError("To support embedding NIfTI files, please install 'nibabel'.") + + if token_per_repo_id is None: + token_per_repo_id = {} + + @no_op_if_value_is_null + def path_to_bytes(path): + source_url = path.split("::")[-1] + pattern = ( + config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL + ) + source_url_fields = string_to_dict(source_url, pattern) + token = token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None + download_config = DownloadConfig(token=token) + with xopen(path, "rb", download_config=download_config) as f: + bytes_data = f.read() + bio = BytesIO(bytes_data) + fh = nib.FileHolder(fileobj=bio) + nifti = nib.Nifti1Image.from_file_map({"header": fh, "image": fh}) + return nifti.to_bytes() + + bytes_array = pa.array( + [ + (path_to_bytes(x["path"]) if x["bytes"] is None else x["bytes"]) if x is not None else None + for x in storage.to_pylist() + ], + type=pa.binary(), + ) + path_array = pa.array( + [os.path.basename(path) if path is not None else None for path in storage.field("path").to_pylist()], + type=pa.string(), + ) + storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=bytes_array.is_null()) + return array_cast(storage, self.pa_type) + def flatten(self) -> Union["FeatureType", Dict[str, "FeatureType"]]: """If in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary.""" from .features import Value diff --git a/src/datasets/features/pdf.py b/src/datasets/features/pdf.py index 756530554d4..382b11701d8 100644 --- a/src/datasets/features/pdf.py +++ b/src/datasets/features/pdf.py @@ -90,6 +90,9 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "pdfplumber.p else: pdfplumber = None + import pdb + + pdb.set_trace() if isinstance(value, str): return {"path": value, "bytes": None} elif isinstance(value, Path): @@ -141,6 +144,9 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "pdfplumber.pdf token_per_repo_id = {} path, bytes_ = value["path"], value["bytes"] + import pdb + + pdb.set_trace() if bytes_ is None: if path is None: raise ValueError(f"A pdf should have one of 'path' or 'bytes' but both are None in {value}.") @@ -229,6 +235,9 @@ def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.S `pa.StructArray`: Array in the PDF arrow storage type, that is `pa.struct({"bytes": pa.binary(), "path": pa.string()})`. """ + import pdb + + pdb.set_trace() if token_per_repo_id is None: token_per_repo_id = {} @@ -272,6 +281,9 @@ def encode_pdfplumber_pdf(pdf: "pdfplumber.pdf.PDF") -> dict: Returns: dict: A dictionary with "path" or "bytes" field. """ + import pdb + + pdb.set_trace() if hasattr(pdf, "stream") and hasattr(pdf.stream, "name") and pdf.stream.name: # Return the path if the PDF has an associated file path return {"path": pdf.stream.name, "bytes": None} diff --git a/tests/features/test_nifti.py b/tests/features/test_nifti.py index 077a7519431..c908b1dcdb6 100644 --- a/tests/features/test_nifti.py +++ b/tests/features/test_nifti.py @@ -2,6 +2,7 @@ from pathlib import Path +import pyarrow as pa import pytest from datasets import Dataset, Features, Nifti @@ -89,3 +90,24 @@ def test_encode_nibabel_image(shared_datadir): assert isinstance(encoded_example_bytes, dict) assert encoded_example_bytes["bytes"] is not None and encoded_example_bytes["path"] is None # this cannot be converted back from bytes (yet) + + +@require_nibabel +def test_embed_storage(shared_datadir): + import nibabel + + nifti_path = str(shared_datadir / "test_nifti.nii") + img = nibabel.load(nifti_path) + nifti = Nifti() + + bytes_array = pa.array([None], type=pa.binary()) + path_array = pa.array([nifti_path], type=pa.string()) + storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"]) + + embedded_storage = nifti.embed_storage(storage) + + embedded_bytes = embedded_storage[0]["bytes"].as_py() + original_bytes = img.to_bytes() + + assert embedded_bytes is not None + assert embedded_bytes == original_bytes From ecf8d59912474e6cc18ff0046bb608dca5bf972a Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Thu, 6 Nov 2025 14:52:52 +0100 Subject: [PATCH 2/4] remove debug statements and fix test --- src/datasets/features/nifti.py | 22 ++-------------------- tests/features/test_nifti.py | 15 +++++++++++---- 2 files changed, 13 insertions(+), 24 deletions(-) diff --git a/src/datasets/features/nifti.py b/src/datasets/features/nifti.py index f9a7112c087..f9a044b391c 100644 --- a/src/datasets/features/nifti.py +++ b/src/datasets/features/nifti.py @@ -81,9 +81,6 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "nib.Nifti1Im else: nib = None - import pdb - - pdb.set_trace() if isinstance(value, str): return {"path": value, "bytes": None} elif isinstance(value, Path): @@ -131,25 +128,14 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif if not self.decode: raise NotImplementedError("Decoding is disabled for this feature. Please use Nifti(decode=True) instead.") - if config.NIBABEL_AVAILABLE: - import nibabel as nib - else: - raise ImportError("To support decoding NIfTI files, please install 'nibabel'.") - if token_per_repo_id is None: token_per_repo_id = {} path, bytes_ = value["path"], value["bytes"] - import pdb - - pdb.set_trace() if bytes_ is None: if path is None: raise ValueError(f"A nifti should have one of 'path' or 'bytes' but both are None in {value}.") else: - import pdb - - pdb.set_trace() if is_local_path(path): nifti = nib.load(path) else: @@ -193,7 +179,7 @@ def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.S `pa.struct({"bytes": pa.binary(), "path": pa.string()})`. """ if config.NIBABEL_AVAILABLE: - import nibabel as nib + pass else: raise ImportError("To support embedding NIfTI files, please install 'nibabel'.") @@ -210,11 +196,7 @@ def path_to_bytes(path): token = token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None download_config = DownloadConfig(token=token) with xopen(path, "rb", download_config=download_config) as f: - bytes_data = f.read() - bio = BytesIO(bytes_data) - fh = nib.FileHolder(fileobj=bio) - nifti = nib.Nifti1Image.from_file_map({"header": fh, "image": fh}) - return nifti.to_bytes() + return f.read() bytes_array = pa.array( [ diff --git a/tests/features/test_nifti.py b/tests/features/test_nifti.py index c908b1dcdb6..1c1bb36da62 100644 --- a/tests/features/test_nifti.py +++ b/tests/features/test_nifti.py @@ -94,10 +94,12 @@ def test_encode_nibabel_image(shared_datadir): @require_nibabel def test_embed_storage(shared_datadir): - import nibabel + from io import BytesIO + + import nibabel as nib nifti_path = str(shared_datadir / "test_nifti.nii") - img = nibabel.load(nifti_path) + img = nib.load(nifti_path) nifti = Nifti() bytes_array = pa.array([None], type=pa.binary()) @@ -107,7 +109,12 @@ def test_embed_storage(shared_datadir): embedded_storage = nifti.embed_storage(storage) embedded_bytes = embedded_storage[0]["bytes"].as_py() - original_bytes = img.to_bytes() + + bio = BytesIO(embedded_bytes) + fh = nib.FileHolder(fileobj=bio) + nifti_img = nib.Nifti1Image.from_file_map({"header": fh, "image": fh}) assert embedded_bytes is not None - assert embedded_bytes == original_bytes + assert nifti_img.header == img.header + assert (nifti_img.affine == img.affine).all() + assert (nifti_img.get_fdata() == img.get_fdata()).all() From 3d97bb8c450eb507581d76c29a5e1d9f9141911e Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Thu, 6 Nov 2025 16:05:01 +0100 Subject: [PATCH 3/4] remove debug statements --- src/datasets/features/nifti.py | 22 ++++++++++++---------- tests/features/test_nifti.py | 12 +++++++++++- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/src/datasets/features/nifti.py b/src/datasets/features/nifti.py index f9a044b391c..b27d4ad2e3f 100644 --- a/src/datasets/features/nifti.py +++ b/src/datasets/features/nifti.py @@ -125,6 +125,11 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif Returns: `nibabel.Nifti1Image` objects """ + if config.NIBABEL_AVAILABLE: + import nibabel as nib + else: + nib = None + if not self.decode: raise NotImplementedError("Decoding is disabled for this feature. Please use Nifti(decode=True) instead.") @@ -136,6 +141,9 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif if path is None: raise ValueError(f"A nifti should have one of 'path' or 'bytes' but both are None in {value}.") else: + # gzipped files have the structure: 'gzip://T1.nii::' + if path.startswith("gzip://") and is_local_path(path.split("::")[-1]): + path = path.split("::")[-1] if is_local_path(path): nifti = nib.load(path) else: @@ -145,11 +153,10 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL ) - try: - repo_id = string_to_dict(source_url, pattern)["repo_id"] - token = token_per_repo_id.get(repo_id) - except ValueError: - token = None + source_url_fields = string_to_dict(source_url, pattern) + token = ( + token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None + ) download_config = DownloadConfig(token=token) with xopen(path, "rb", download_config=download_config) as f: nifti = nib.load(f) @@ -178,11 +185,6 @@ def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.S `pa.StructArray`: Array in the NifTI arrow storage type, that is `pa.struct({"bytes": pa.binary(), "path": pa.string()})`. """ - if config.NIBABEL_AVAILABLE: - pass - else: - raise ImportError("To support embedding NIfTI files, please install 'nibabel'.") - if token_per_repo_id is None: token_per_repo_id = {} diff --git a/tests/features/test_nifti.py b/tests/features/test_nifti.py index 1c1bb36da62..b5f0be42f3e 100644 --- a/tests/features/test_nifti.py +++ b/tests/features/test_nifti.py @@ -5,7 +5,7 @@ import pyarrow as pa import pytest -from datasets import Dataset, Features, Nifti +from datasets import Dataset, Features, Nifti, load_dataset from src.datasets.features.nifti import encode_nibabel_image from ..utils import require_nibabel @@ -118,3 +118,13 @@ def test_embed_storage(shared_datadir): assert nifti_img.header == img.header assert (nifti_img.affine == img.affine).all() assert (nifti_img.get_fdata() == img.get_fdata()).all() + + +@require_nibabel +def test_load_zipped_file_locally(shared_datadir): + import nibabel as nib + + nifti_path = str(shared_datadir / "test_nifti.nii.gz") + + ds = load_dataset("niftifolder", data_files=nifti_path) + assert isinstance(ds["train"][0]["nifti"], nib.nifti1.Nifti1Image) From 256fc162d669038c398313278e7c30f76989fd96 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Thu, 6 Nov 2025 16:09:29 +0100 Subject: [PATCH 4/4] remove debug statements --- src/datasets/features/nifti.py | 5 +---- src/datasets/features/pdf.py | 12 ------------ 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/src/datasets/features/nifti.py b/src/datasets/features/nifti.py index b27d4ad2e3f..f63b8cf6aa1 100644 --- a/src/datasets/features/nifti.py +++ b/src/datasets/features/nifti.py @@ -128,10 +128,7 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif if config.NIBABEL_AVAILABLE: import nibabel as nib else: - nib = None - - if not self.decode: - raise NotImplementedError("Decoding is disabled for this feature. Please use Nifti(decode=True) instead.") + raise ImportError("To support decoding NIfTI files, please install 'nibabel'.") if token_per_repo_id is None: token_per_repo_id = {} diff --git a/src/datasets/features/pdf.py b/src/datasets/features/pdf.py index 382b11701d8..756530554d4 100644 --- a/src/datasets/features/pdf.py +++ b/src/datasets/features/pdf.py @@ -90,9 +90,6 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "pdfplumber.p else: pdfplumber = None - import pdb - - pdb.set_trace() if isinstance(value, str): return {"path": value, "bytes": None} elif isinstance(value, Path): @@ -144,9 +141,6 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "pdfplumber.pdf token_per_repo_id = {} path, bytes_ = value["path"], value["bytes"] - import pdb - - pdb.set_trace() if bytes_ is None: if path is None: raise ValueError(f"A pdf should have one of 'path' or 'bytes' but both are None in {value}.") @@ -235,9 +229,6 @@ def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.S `pa.StructArray`: Array in the PDF arrow storage type, that is `pa.struct({"bytes": pa.binary(), "path": pa.string()})`. """ - import pdb - - pdb.set_trace() if token_per_repo_id is None: token_per_repo_id = {} @@ -281,9 +272,6 @@ def encode_pdfplumber_pdf(pdf: "pdfplumber.pdf.PDF") -> dict: Returns: dict: A dictionary with "path" or "bytes" field. """ - import pdb - - pdb.set_trace() if hasattr(pdf, "stream") and hasattr(pdf.stream, "name") and pdf.stream.name: # Return the path if the PDF has an associated file path return {"path": pdf.stream.name, "bytes": None}