Skip to content

Commit f2831a4

Browse files
Make filename matching more robust (#5128)
* Make filename matching more robust (#5046)
* Use '/' on all the platforms
Co-authored-by: Mario Šaško <[email protected]>
* Use '/' on all the platforms and get rid of useless function call
* Remove useless import statement
* make sure ./ is ignored
Co-authored-by: Mario Šaško <[email protected]>
1 parent db617dd commit f2831a4

File tree

2 files changed

+3
-10
lines changed

2 files changed

+3
-10
lines changed

src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py

Lines changed: 2 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66

77
import pandas as pd
88
import pyarrow as pa
9-
import pyarrow.compute as pc
109
import pyarrow.json as paj
1110

1211
import datasets
@@ -310,13 +309,10 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad
310309
)
311310
pa_metadata_table = self._read_metadata(downloaded_metadata_file)
312311
pa_file_name_array = pa_metadata_table["file_name"]
313-
pa_file_name_array = pc.replace_substring(
314-
pa_file_name_array, pattern="\\", replacement="/"
315-
)
316312
pa_metadata_table = pa_metadata_table.drop(["file_name"])
317313
metadata_dir = os.path.dirname(metadata_file)
318314
metadata_dict = {
319-
file_name: sample_metadata
315+
os.path.normpath(file_name).replace("\\", "/"): sample_metadata
320316
for file_name, sample_metadata in zip(
321317
pa_file_name_array.to_pylist(), pa_table_to_pylist(pa_metadata_table)
322318
)
@@ -379,13 +375,10 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad
379375
)
380376
pa_metadata_table = self._read_metadata(downloaded_metadata_file)
381377
pa_file_name_array = pa_metadata_table["file_name"]
382-
pa_file_name_array = pc.replace_substring(
383-
pa_file_name_array, pattern="\\", replacement="/"
384-
)
385378
pa_metadata_table = pa_metadata_table.drop(["file_name"])
386379
metadata_dir = os.path.dirname(downloaded_metadata_file)
387380
metadata_dict = {
388-
file_name: sample_metadata
381+
os.path.normpath(file_name).replace("\\", "/"): sample_metadata
389382
for file_name, sample_metadata in zip(
390383
pa_file_name_array.to_pylist(), pa_table_to_pylist(pa_metadata_table)
391384
)

tests/packaged_modules/test_folder_based_builder.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -132,7 +132,7 @@ def data_files_with_one_split_and_metadata(tmp_path, auto_text_file):
132132
"""\
133133
{"file_name": "file.txt", "additional_feature": "Dummy file"}
134134
{"file_name": "file2.txt", "additional_feature": "Second dummy file"}
135-
{"file_name": "subdir/file3.txt", "additional_feature": "Third dummy file"}
135+
{"file_name": "./subdir/file3.txt", "additional_feature": "Third dummy file"}
136136
"""
137137
)
138138
with open(metadata_filename, "w", encoding="utf-8") as f:

0 commit comments

Comments
 (0)