From 944dccfc197275f4cc5c43139726711c5d75adfb Mon Sep 17 00:00:00 2001
From: Andrea Soria
Date: Thu, 22 May 2025 10:05:22 -0400
Subject: [PATCH] Add embed_storage in Pdf feature

---
 src/datasets/features/pdf.py | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/src/datasets/features/pdf.py b/src/datasets/features/pdf.py
index afba910387a..b195fb8192a 100644
--- a/src/datasets/features/pdf.py
+++ b/src/datasets/features/pdf.py
@@ -9,7 +9,7 @@
 from ..download.download_config import DownloadConfig
 from ..table import array_cast
 from ..utils.file_utils import is_local_path, xopen
-from ..utils.py_utils import string_to_dict
+from ..utils.py_utils import no_op_if_value_is_null, string_to_dict
 
 
 if TYPE_CHECKING:
@@ -216,6 +216,40 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArr
             storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=storage.is_null())
         return array_cast(storage, self.pa_type)
 
+    def embed_storage(self, storage: pa.StructArray) -> pa.StructArray:
+        """Inline the bytes of every referenced PDF file into the Arrow array.
+
+        Rows with `bytes` keep them; otherwise the file at `path` is read via `xopen`. Paths become base names.
+
+        Args:
+            storage (`pa.StructArray`):
+                PyArrow array to embed.
+
+        Returns:
+            `pa.StructArray`: Array in the PDF arrow storage type, that is
+                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
+        """
+
+        @no_op_if_value_is_null
+        def read_pdf_bytes(location):
+            with xopen(location, "rb") as pdf_file:
+                return pdf_file.read()
+
+        embedded = []
+        for row in storage.to_pylist():
+            if row is None:
+                embedded.append(None)
+            elif row["bytes"] is None:
+                embedded.append(read_pdf_bytes(row["path"]))
+            else:
+                embedded.append(row["bytes"])
+        bytes_array = pa.array(embedded, type=pa.binary())
+        base_names = [None if p is None else os.path.basename(p) for p in storage.field("path").to_pylist()]
+        path_array = pa.array(base_names, type=pa.string())
+        # Rows left without bytes are masked out as null.
+        result = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=bytes_array.is_null())
+        return array_cast(result, self.pa_type)
+
 
 def encode_pdfplumber_pdf(pdf: "pdfplumber.pdf.PDF") -> dict:
     """