From a2577caf200f7042a90e62a837706945a57c4175 Mon Sep 17 00:00:00 2001 From: Riccardo Bucco Date: Tue, 18 Oct 2022 10:21:12 +0200 Subject: [PATCH 1/5] Make filename matching more robust (#5046) --- .../folder_based_builder/folder_based_builder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py index 6c96d402aa1..7b49a4dfba0 100644 --- a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py +++ b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py @@ -316,7 +316,7 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad pa_metadata_table = pa_metadata_table.drop(["file_name"]) metadata_dir = os.path.dirname(metadata_file) metadata_dict = { - file_name: sample_metadata + os.path.normpath(file_name): sample_metadata for file_name, sample_metadata in zip( pa_file_name_array.to_pylist(), pa_table_to_pylist(pa_metadata_table) ) @@ -385,7 +385,7 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad pa_metadata_table = pa_metadata_table.drop(["file_name"]) metadata_dir = os.path.dirname(downloaded_metadata_file) metadata_dict = { - file_name: sample_metadata + os.path.normpath(file_name): sample_metadata for file_name, sample_metadata in zip( pa_file_name_array.to_pylist(), pa_table_to_pylist(pa_metadata_table) ) From a4174021a009891a03d13844c37f4a01d89bbcd4 Mon Sep 17 00:00:00 2001 From: Riccardo Bucco Date: Wed, 19 Oct 2022 11:06:15 +0200 Subject: [PATCH 2/5] Use '/' on all the platforms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mario Šaško --- .../folder_based_builder/folder_based_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py index 7b49a4dfba0..1ccde6ff107 100644 --- a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py +++ b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py @@ -316,7 +316,7 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad pa_metadata_table = pa_metadata_table.drop(["file_name"]) metadata_dir = os.path.dirname(metadata_file) metadata_dict = { - os.path.normpath(file_name): sample_metadata + os.path.normpath(file_name).replace("\\", "/"): sample_metadata for file_name, sample_metadata in zip( pa_file_name_array.to_pylist(), pa_table_to_pylist(pa_metadata_table) ) From efd14c086d4b08276c7615c76706942e4fd4173a Mon Sep 17 00:00:00 2001 From: Riccardo Bucco Date: Wed, 19 Oct 2022 11:19:13 +0200 Subject: [PATCH 3/5] Use '/' on all the platforms and get rid of useless function call --- .../folder_based_builder/folder_based_builder.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py index 1ccde6ff107..0a562f87a7a 100644 --- a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py +++ b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py @@ -310,9 +310,6 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad ) pa_metadata_table = self._read_metadata(downloaded_metadata_file) pa_file_name_array = pa_metadata_table["file_name"] - pa_file_name_array = pc.replace_substring( - pa_file_name_array, pattern="\\", replacement="/" - ) pa_metadata_table = pa_metadata_table.drop(["file_name"]) metadata_dir = os.path.dirname(metadata_file) metadata_dict = { @@ -379,13 +376,10 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad ) pa_metadata_table = self._read_metadata(downloaded_metadata_file) pa_file_name_array = pa_metadata_table["file_name"] - pa_file_name_array = pc.replace_substring( - pa_file_name_array, pattern="\\", replacement="/" - ) pa_metadata_table = pa_metadata_table.drop(["file_name"]) metadata_dir = os.path.dirname(downloaded_metadata_file) metadata_dict = { - os.path.normpath(file_name): sample_metadata + os.path.normpath(file_name).replace("\\", "/"): sample_metadata for file_name, sample_metadata in zip( pa_file_name_array.to_pylist(), pa_table_to_pylist(pa_metadata_table) ) From 3b3d76df38f1fd30bb918debfe15ef43fcd32c15 Mon Sep 17 00:00:00 2001 From: Riccardo Bucco Date: Wed, 19 Oct 2022 11:22:30 +0200 Subject: [PATCH 4/5] Remove useless import statement --- .../folder_based_builder/folder_based_builder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py index 0a562f87a7a..e2277ba24e4 100644 --- a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py +++ b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py @@ -6,7 +6,6 @@ import pandas as pd import pyarrow as pa -import pyarrow.compute as pc import pyarrow.json as paj import datasets From f96dbd7640cd01914a6e8e336fb7f5c7e8561a4e Mon Sep 17 00:00:00 2001 From: Riccardo Bucco Date: Wed, 19 Oct 2022 18:12:41 +0200 Subject: [PATCH 5/5] make sure ./ is ignored --- tests/packaged_modules/test_folder_based_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/packaged_modules/test_folder_based_builder.py b/tests/packaged_modules/test_folder_based_builder.py index 34fe3a62db7..5dbc59b3fca 100644 --- a/tests/packaged_modules/test_folder_based_builder.py +++ b/tests/packaged_modules/test_folder_based_builder.py @@ -132,7 +132,7 @@ def data_files_with_one_split_and_metadata(tmp_path, auto_text_file): """\ {"file_name": "file.txt", "additional_feature": "Dummy file"} {"file_name": "file2.txt", "additional_feature": "Second dummy file"} - {"file_name": "subdir/file3.txt", "additional_feature": "Third dummy file"} + {"file_name": "./subdir/file3.txt", "additional_feature": "Third dummy file"} """ ) with open(metadata_filename, "w", encoding="utf-8") as f: