diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index d42fe53f009..eeab56db89b 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -1,6 +1,7 @@ import os from dataclasses import dataclass, field from io import BytesIO +from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union import numpy as np @@ -25,6 +26,7 @@ class Audio: Input: The Audio feature accepts as input: - A `str`: Absolute path to the audio file (i.e. random access is allowed). + - A `pathlib.Path`: path to the audio file (i.e. random access is allowed). - A `dict` with the keys: - `path`: String with relative path of the audio file to the archive file. @@ -112,6 +114,8 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "AudioDecoder if isinstance(value, str): return {"bytes": None, "path": value} + elif isinstance(value, Path): + return {"bytes": None, "path": str(value.absolute())} elif isinstance(value, (bytes, bytearray)): return {"bytes": value, "path": None} elif AudioDecoder is not None and isinstance(value, AudioDecoder): diff --git a/src/datasets/features/image.py b/src/datasets/features/image.py index ad2e6bdfaec..fecc2fc5ccd 100644 --- a/src/datasets/features/image.py +++ b/src/datasets/features/image.py @@ -3,6 +3,7 @@ import warnings from dataclasses import dataclass, field from io import BytesIO +from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union import numpy as np @@ -48,6 +49,7 @@ class Image: Input: The Image feature accepts as input: - A `str`: Absolute path to the image file (i.e. random access is allowed). + - A `pathlib.Path`: path to the image file (i.e. random access is allowed). - A `dict` with the keys: - `path`: String with relative path of the image file to the archive file. @@ -113,6 +115,8 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, np.ndarray, " if isinstance(value, str): return {"path": value, "bytes": None} + elif isinstance(value, Path): + return {"path": str(value.absolute()), "bytes": None} elif isinstance(value, (bytes, bytearray)): return {"path": None, "bytes": value} elif isinstance(value, np.ndarray): diff --git a/src/datasets/features/pdf.py b/src/datasets/features/pdf.py index f88b222d56a..414c497356c 100644 --- a/src/datasets/features/pdf.py +++ b/src/datasets/features/pdf.py @@ -1,6 +1,7 @@ import os from dataclasses import dataclass, field from io import BytesIO +from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Union import pyarrow as pa @@ -34,6 +35,7 @@ class Pdf: Input: The Pdf feature accepts as input: - A `str`: Absolute path to the pdf file (i.e. random access is allowed). + - A `pathlib.Path`: path to the pdf file (i.e. random access is allowed). - A `dict` with the keys: - `path`: String with relative path of the pdf file in a dataset repository. - `bytes`: Bytes of the pdf file. @@ -92,6 +94,8 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "pdfplumber.p if isinstance(value, str): return {"path": value, "bytes": None} + elif isinstance(value, Path): + return {"path": str(value.absolute()), "bytes": None} elif isinstance(value, (bytes, bytearray)): return {"path": None, "bytes": value} elif pdfplumber is not None and isinstance(value, pdfplumber.pdf.PDF): diff --git a/src/datasets/features/video.py b/src/datasets/features/video.py index 6b4bdc97a6c..adbfaaa30f3 100644 --- a/src/datasets/features/video.py +++ b/src/datasets/features/video.py @@ -1,5 +1,6 @@ import os from dataclasses import dataclass, field +from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, TypedDict, Union import numpy as np @@ -31,6 +32,7 @@ class Video: Input: The Video feature accepts as input: - A `str`: Absolute path to the video file (i.e. random access is allowed). + - A `pathlib.Path`: path to the video file (i.e. random access is allowed). - A `dict` with the keys: - `path`: String with relative path of the video file in a dataset repository. @@ -125,6 +127,8 @@ def encode_example(self, value: Union[str, bytes, bytearray, Example, np.ndarray if isinstance(value, str): return {"path": value, "bytes": None} + elif isinstance(value, Path): + return {"path": str(value.absolute()), "bytes": None} elif isinstance(value, (bytes, bytearray)): return {"path": None, "bytes": value} elif isinstance(value, np.ndarray): diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index dae082429ed..b914e191af1 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -1,6 +1,7 @@ import os import tarfile from itertools import product +from pathlib import Path import numpy as np import pyarrow as pa @@ -64,6 +65,7 @@ def test_audio_feature_type_to_arrow(): "build_example", [ lambda audio_path: audio_path, + lambda audio_path: Path(audio_path), lambda audio_path: open(audio_path, "rb").read(), lambda audio_path: {"path": audio_path}, lambda audio_path: {"path": audio_path, "bytes": None}, diff --git a/tests/features/test_image.py b/tests/features/test_image.py index 0b6774330b5..68e6f4b91cc 100644 --- a/tests/features/test_image.py +++ b/tests/features/test_image.py @@ -3,6 +3,7 @@ import tarfile import warnings from io import BytesIO +from pathlib import Path import numpy as np import pandas as pd @@ -54,6 +55,7 @@ def test_image_feature_type_to_arrow(): "build_example", [ lambda image_path: image_path, + lambda image_path: Path(image_path), lambda image_path: open(image_path, "rb").read(), lambda image_path: {"path": image_path}, lambda image_path: {"path": image_path, "bytes": None}, diff --git a/tests/features/test_pdf.py b/tests/features/test_pdf.py index 7365fd8b635..fe0b521c96c 100644 --- a/tests/features/test_pdf.py +++ b/tests/features/test_pdf.py @@ -1,3 +1,5 @@ +from pathlib import Path + import pytest from datasets import Dataset, Features, Pdf @@ -10,6 +12,7 @@ "build_example", [ lambda pdf_path: pdf_path, + lambda pdf_path: Path(pdf_path), lambda pdf_path: open(pdf_path, "rb").read(), lambda pdf_path: {"path": pdf_path}, lambda pdf_path: {"path": pdf_path, "bytes": None}, diff --git a/tests/features/test_video.py b/tests/features/test_video.py index 64c1441227c..131b01be6d2 100644 --- a/tests/features/test_video.py +++ b/tests/features/test_video.py @@ -1,3 +1,5 @@ +from pathlib import Path + import pytest from datasets import Column, Dataset, Features, Value, Video, load_dataset @@ -10,6 +12,7 @@ "build_example", [ lambda video_path: video_path, + lambda video_path: Path(video_path), lambda video_path: open(video_path, "rb").read(), lambda video_path: {"path": video_path}, lambda video_path: {"path": video_path, "bytes": None},