From 740710cae748a8bc5f6bf55fb3ee6d85b311d945 Mon Sep 17 00:00:00 2001
From: Tobias Pitters <tobias.pitters@gmail.com>
Date: Thu, 6 Nov 2025 14:35:49 +0100
Subject: [PATCH 1/4] WIP: allow uploading of nifti

---
 src/datasets/features/nifti.py | 60 +++++++++++++++++++++++++++++++++-
 src/datasets/features/pdf.py   | 12 +++++++
 tests/features/test_nifti.py   | 22 +++++++++++++
 3 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/src/datasets/features/nifti.py b/src/datasets/features/nifti.py
index bac91e2af4b..f9a7112c087 100644
--- a/src/datasets/features/nifti.py
+++ b/src/datasets/features/nifti.py
@@ -10,7 +10,7 @@
 from ..download.download_config import DownloadConfig
 from ..table import array_cast
 from ..utils.file_utils import is_local_path, xopen
-from ..utils.py_utils import string_to_dict
+from ..utils.py_utils import no_op_if_value_is_null, string_to_dict
 
 
 if TYPE_CHECKING:
@@ -81,6 +81,9 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "nib.Nifti1Im
         else:
             nib = None
 
+        import pdb
+
+        pdb.set_trace()
         if isinstance(value, str):
             return {"path": value, "bytes": None}
         elif isinstance(value, Path):
@@ -137,10 +140,16 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif
             token_per_repo_id = {}
 
         path, bytes_ = value["path"], value["bytes"]
+        import pdb
+
+        pdb.set_trace()
         if bytes_ is None:
             if path is None:
                 raise ValueError(f"A nifti should have one of 'path' or 'bytes' but both are None in {value}.")
             else:
+                import pdb
+
+                pdb.set_trace()
                 if is_local_path(path):
                     nifti = nib.load(path)
                 else:
@@ -172,6 +181,55 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif
 
         return nifti
 
+    def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.StructArray:
+        """Embed NifTI files into the Arrow array.
+
+        Args:
+            storage (`pa.StructArray`):
+                PyArrow array to embed.
+
+        Returns:
+            `pa.StructArray`: Array in the NifTI arrow storage type, that is
+                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
+        """
+        if config.NIBABEL_AVAILABLE:
+            import nibabel as nib
+        else:
+            raise ImportError("To support embedding NIfTI files, please install 'nibabel'.")
+
+        if token_per_repo_id is None:
+            token_per_repo_id = {}
+
+        @no_op_if_value_is_null
+        def path_to_bytes(path):
+            source_url = path.split("::")[-1]
+            pattern = (
+                config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL
+            )
+            source_url_fields = string_to_dict(source_url, pattern)
+            token = token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None
+            download_config = DownloadConfig(token=token)
+            with xopen(path, "rb", download_config=download_config) as f:
+                bytes_data = f.read()
+                bio = BytesIO(bytes_data)
+                fh = nib.FileHolder(fileobj=bio)
+                nifti = nib.Nifti1Image.from_file_map({"header": fh, "image": fh})
+                return nifti.to_bytes()
+
+        bytes_array = pa.array(
+            [
+                (path_to_bytes(x["path"]) if x["bytes"] is None else x["bytes"]) if x is not None else None
+                for x in storage.to_pylist()
+            ],
+            type=pa.binary(),
+        )
+        path_array = pa.array(
+            [os.path.basename(path) if path is not None else None for path in storage.field("path").to_pylist()],
+            type=pa.string(),
+        )
+        storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=bytes_array.is_null())
+        return array_cast(storage, self.pa_type)
+
     def flatten(self) -> Union["FeatureType", Dict[str, "FeatureType"]]:
         """If in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary."""
         from .features import Value
diff --git a/src/datasets/features/pdf.py b/src/datasets/features/pdf.py
index 756530554d4..382b11701d8 100644
--- a/src/datasets/features/pdf.py
+++ b/src/datasets/features/pdf.py
@@ -90,6 +90,9 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "pdfplumber.p
         else:
             pdfplumber = None
 
+        import pdb
+
+        pdb.set_trace()
         if isinstance(value, str):
             return {"path": value, "bytes": None}
         elif isinstance(value, Path):
@@ -141,6 +144,9 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "pdfplumber.pdf
             token_per_repo_id = {}
 
         path, bytes_ = value["path"], value["bytes"]
+        import pdb
+
+        pdb.set_trace()
         if bytes_ is None:
             if path is None:
                 raise ValueError(f"A pdf should have one of 'path' or 'bytes' but both are None in {value}.")
@@ -229,6 +235,9 @@ def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.S
             `pa.StructArray`: Array in the PDF arrow storage type, that is
                 `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
         """
+        import pdb
+
+        pdb.set_trace()
         if token_per_repo_id is None:
             token_per_repo_id = {}
 
@@ -272,6 +281,9 @@ def encode_pdfplumber_pdf(pdf: "pdfplumber.pdf.PDF") -> dict:
     Returns:
         dict: A dictionary with "path" or "bytes" field.
     """
+    import pdb
+
+    pdb.set_trace()
     if hasattr(pdf, "stream") and hasattr(pdf.stream, "name") and pdf.stream.name:
         # Return the path if the PDF has an associated file path
         return {"path": pdf.stream.name, "bytes": None}
diff --git a/tests/features/test_nifti.py b/tests/features/test_nifti.py
index 077a7519431..c908b1dcdb6 100644
--- a/tests/features/test_nifti.py
+++ b/tests/features/test_nifti.py
@@ -2,6 +2,7 @@
 
 from pathlib import Path
 
+import pyarrow as pa
 import pytest
 
 from datasets import Dataset, Features, Nifti
@@ -89,3 +90,24 @@ def test_encode_nibabel_image(shared_datadir):
     assert isinstance(encoded_example_bytes, dict)
     assert encoded_example_bytes["bytes"] is not None and encoded_example_bytes["path"] is None
     # this cannot be converted back from bytes (yet)
+
+
+@require_nibabel
+def test_embed_storage(shared_datadir):
+    import nibabel
+
+    nifti_path = str(shared_datadir / "test_nifti.nii")
+    img = nibabel.load(nifti_path)
+    nifti = Nifti()
+
+    bytes_array = pa.array([None], type=pa.binary())
+    path_array = pa.array([nifti_path], type=pa.string())
+    storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"])
+
+    embedded_storage = nifti.embed_storage(storage)
+
+    embedded_bytes = embedded_storage[0]["bytes"].as_py()
+    original_bytes = img.to_bytes()
+
+    assert embedded_bytes is not None
+    assert embedded_bytes == original_bytes

From ecf8d59912474e6cc18ff0046bb608dca5bf972a Mon Sep 17 00:00:00 2001
From: Tobias Pitters <tobias.pitters@gmail.com>
Date: Thu, 6 Nov 2025 14:52:52 +0100
Subject: [PATCH 2/4] remove debug statements and fix test

---
 src/datasets/features/nifti.py | 22 ++--------------------
 tests/features/test_nifti.py   | 15 +++++++++++----
 2 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/src/datasets/features/nifti.py b/src/datasets/features/nifti.py
index f9a7112c087..f9a044b391c 100644
--- a/src/datasets/features/nifti.py
+++ b/src/datasets/features/nifti.py
@@ -81,9 +81,6 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "nib.Nifti1Im
         else:
             nib = None
 
-        import pdb
-
-        pdb.set_trace()
         if isinstance(value, str):
             return {"path": value, "bytes": None}
         elif isinstance(value, Path):
@@ -131,25 +128,14 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif
         if not self.decode:
             raise NotImplementedError("Decoding is disabled for this feature. Please use Nifti(decode=True) instead.")
 
-        if config.NIBABEL_AVAILABLE:
-            import nibabel as nib
-        else:
-            raise ImportError("To support decoding NIfTI files, please install 'nibabel'.")
-
         if token_per_repo_id is None:
             token_per_repo_id = {}
 
         path, bytes_ = value["path"], value["bytes"]
-        import pdb
-
-        pdb.set_trace()
         if bytes_ is None:
             if path is None:
                 raise ValueError(f"A nifti should have one of 'path' or 'bytes' but both are None in {value}.")
             else:
-                import pdb
-
-                pdb.set_trace()
                 if is_local_path(path):
                     nifti = nib.load(path)
                 else:
@@ -193,7 +179,7 @@ def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.S
                 `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
         """
         if config.NIBABEL_AVAILABLE:
-            import nibabel as nib
+            pass
         else:
             raise ImportError("To support embedding NIfTI files, please install 'nibabel'.")
 
@@ -210,11 +196,7 @@ def path_to_bytes(path):
             token = token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None
             download_config = DownloadConfig(token=token)
             with xopen(path, "rb", download_config=download_config) as f:
-                bytes_data = f.read()
-                bio = BytesIO(bytes_data)
-                fh = nib.FileHolder(fileobj=bio)
-                nifti = nib.Nifti1Image.from_file_map({"header": fh, "image": fh})
-                return nifti.to_bytes()
+                return f.read()
 
         bytes_array = pa.array(
             [
diff --git a/tests/features/test_nifti.py b/tests/features/test_nifti.py
index c908b1dcdb6..1c1bb36da62 100644
--- a/tests/features/test_nifti.py
+++ b/tests/features/test_nifti.py
@@ -94,10 +94,12 @@ def test_encode_nibabel_image(shared_datadir):
 
 @require_nibabel
 def test_embed_storage(shared_datadir):
-    import nibabel
+    from io import BytesIO
+
+    import nibabel as nib
 
     nifti_path = str(shared_datadir / "test_nifti.nii")
-    img = nibabel.load(nifti_path)
+    img = nib.load(nifti_path)
     nifti = Nifti()
 
     bytes_array = pa.array([None], type=pa.binary())
@@ -107,7 +109,12 @@ def test_embed_storage(shared_datadir):
     embedded_storage = nifti.embed_storage(storage)
 
     embedded_bytes = embedded_storage[0]["bytes"].as_py()
-    original_bytes = img.to_bytes()
+
+    bio = BytesIO(embedded_bytes)
+    fh = nib.FileHolder(fileobj=bio)
+    nifti_img = nib.Nifti1Image.from_file_map({"header": fh, "image": fh})
 
     assert embedded_bytes is not None
-    assert embedded_bytes == original_bytes
+    assert nifti_img.header == img.header
+    assert (nifti_img.affine == img.affine).all()
+    assert (nifti_img.get_fdata() == img.get_fdata()).all()

From 3d97bb8c450eb507581d76c29a5e1d9f9141911e Mon Sep 17 00:00:00 2001
From: Tobias Pitters <tobias.pitters@gmail.com>
Date: Thu, 6 Nov 2025 16:05:01 +0100
Subject: [PATCH 3/4] handle local gzipped NIfTI paths when decoding and add test

---
 src/datasets/features/nifti.py | 22 ++++++++++++----------
 tests/features/test_nifti.py   | 12 +++++++++++-
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/src/datasets/features/nifti.py b/src/datasets/features/nifti.py
index f9a044b391c..b27d4ad2e3f 100644
--- a/src/datasets/features/nifti.py
+++ b/src/datasets/features/nifti.py
@@ -125,6 +125,11 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif
         Returns:
             `nibabel.Nifti1Image` objects
         """
+        if config.NIBABEL_AVAILABLE:
+            import nibabel as nib
+        else:
+            nib = None
+
         if not self.decode:
             raise NotImplementedError("Decoding is disabled for this feature. Please use Nifti(decode=True) instead.")
 
@@ -136,6 +141,9 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif
             if path is None:
                 raise ValueError(f"A nifti should have one of 'path' or 'bytes' but both are None in {value}.")
             else:
+                # gzipped files have the structure: 'gzip://T1.nii::<local_path>'
+                if path.startswith("gzip://") and is_local_path(path.split("::")[-1]):
+                    path = path.split("::")[-1]
                 if is_local_path(path):
                     nifti = nib.load(path)
                 else:
@@ -145,11 +153,10 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif
                         if source_url.startswith(config.HF_ENDPOINT)
                         else config.HUB_DATASETS_HFFS_URL
                     )
-                    try:
-                        repo_id = string_to_dict(source_url, pattern)["repo_id"]
-                        token = token_per_repo_id.get(repo_id)
-                    except ValueError:
-                        token = None
+                    source_url_fields = string_to_dict(source_url, pattern)
+                    token = (
+                        token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None
+                    )
                     download_config = DownloadConfig(token=token)
                     with xopen(path, "rb", download_config=download_config) as f:
                         nifti = nib.load(f)
@@ -178,11 +185,6 @@ def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.S
             `pa.StructArray`: Array in the NifTI arrow storage type, that is
                 `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
         """
-        if config.NIBABEL_AVAILABLE:
-            pass
-        else:
-            raise ImportError("To support embedding NIfTI files, please install 'nibabel'.")
-
         if token_per_repo_id is None:
             token_per_repo_id = {}
 
diff --git a/tests/features/test_nifti.py b/tests/features/test_nifti.py
index 1c1bb36da62..b5f0be42f3e 100644
--- a/tests/features/test_nifti.py
+++ b/tests/features/test_nifti.py
@@ -5,7 +5,7 @@
 import pyarrow as pa
 import pytest
 
-from datasets import Dataset, Features, Nifti
+from datasets import Dataset, Features, Nifti, load_dataset
 from src.datasets.features.nifti import encode_nibabel_image
 
 from ..utils import require_nibabel
@@ -118,3 +118,13 @@ def test_embed_storage(shared_datadir):
     assert nifti_img.header == img.header
     assert (nifti_img.affine == img.affine).all()
     assert (nifti_img.get_fdata() == img.get_fdata()).all()
+
+
+@require_nibabel
+def test_load_zipped_file_locally(shared_datadir):
+    import nibabel as nib
+
+    nifti_path = str(shared_datadir / "test_nifti.nii.gz")
+
+    ds = load_dataset("niftifolder", data_files=nifti_path)
+    assert isinstance(ds["train"][0]["nifti"], nib.nifti1.Nifti1Image)

From 256fc162d669038c398313278e7c30f76989fd96 Mon Sep 17 00:00:00 2001
From: Tobias Pitters <tobias.pitters@gmail.com>
Date: Thu, 6 Nov 2025 16:09:29 +0100
Subject: [PATCH 4/4] remove remaining debug statements and raise ImportError if nibabel is missing

---
 src/datasets/features/nifti.py |  5 +----
 src/datasets/features/pdf.py   | 12 ------------
 2 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/src/datasets/features/nifti.py b/src/datasets/features/nifti.py
index b27d4ad2e3f..f63b8cf6aa1 100644
--- a/src/datasets/features/nifti.py
+++ b/src/datasets/features/nifti.py
@@ -128,10 +128,7 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif
         if config.NIBABEL_AVAILABLE:
             import nibabel as nib
         else:
-            nib = None
-
-        if not self.decode:
-            raise NotImplementedError("Decoding is disabled for this feature. Please use Nifti(decode=True) instead.")
+            raise ImportError("To support decoding NIfTI files, please install 'nibabel'.")
 
         if token_per_repo_id is None:
             token_per_repo_id = {}
diff --git a/src/datasets/features/pdf.py b/src/datasets/features/pdf.py
index 382b11701d8..756530554d4 100644
--- a/src/datasets/features/pdf.py
+++ b/src/datasets/features/pdf.py
@@ -90,9 +90,6 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "pdfplumber.p
         else:
             pdfplumber = None
 
-        import pdb
-
-        pdb.set_trace()
         if isinstance(value, str):
             return {"path": value, "bytes": None}
         elif isinstance(value, Path):
@@ -144,9 +141,6 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "pdfplumber.pdf
             token_per_repo_id = {}
 
         path, bytes_ = value["path"], value["bytes"]
-        import pdb
-
-        pdb.set_trace()
         if bytes_ is None:
             if path is None:
                 raise ValueError(f"A pdf should have one of 'path' or 'bytes' but both are None in {value}.")
@@ -235,9 +229,6 @@ def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.S
             `pa.StructArray`: Array in the PDF arrow storage type, that is
                 `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
         """
-        import pdb
-
-        pdb.set_trace()
         if token_per_repo_id is None:
             token_per_repo_id = {}
 
@@ -281,9 +272,6 @@ def encode_pdfplumber_pdf(pdf: "pdfplumber.pdf.PDF") -> dict:
     Returns:
         dict: A dictionary with "path" or "bytes" field.
     """
-    import pdb
-
-    pdb.set_trace()
     if hasattr(pdf, "stream") and hasattr(pdf.stream, "name") and pdf.stream.name:
         # Return the path if the PDF has an associated file path
         return {"path": pdf.stream.name, "bytes": None}