@@ -164,15 +164,15 @@ def analyze(files_or_archives, downloaded_files_or_dirs, split):
164164
165165 # Check that all metadata files share the same format
166166 metadata_ext = {
167- os .path .splitext (downloaded_metadata_file )[ 1 ][ 1 : ]
168- for _ , downloaded_metadata_file in itertools .chain .from_iterable (metadata_files .values ())
167+ os .path .splitext (original_metadata_file )[ - 1 ]
168+ for original_metadata_file , _ in itertools .chain .from_iterable (metadata_files .values ())
169169 }
170170 if len (metadata_ext ) > 1 :
171171 raise ValueError (f"Found metadata files with different extensions: { list (metadata_ext )} " )
172172 metadata_ext = metadata_ext .pop ()
173173
174174 for _ , downloaded_metadata_file in itertools .chain .from_iterable (metadata_files .values ()):
175- pa_metadata_table = self ._read_metadata (downloaded_metadata_file )
175+ pa_metadata_table = self ._read_metadata (downloaded_metadata_file , metadata_ext = metadata_ext )
176176 features_per_metadata_file .append (
177177 (downloaded_metadata_file , datasets .Features .from_arrow_schema (pa_metadata_table .schema ))
178178 )
@@ -236,9 +236,8 @@ def _split_files_and_archives(self, data_files):
236236 archives .append (data_file )
237237 return files , archives
238238
239- def _read_metadata (self , metadata_file ):
240- metadata_file_ext = os .path .splitext (metadata_file )[1 ][1 :]
241- if metadata_file_ext == "csv" :
239+ def _read_metadata (self , metadata_file , metadata_ext : str = "" ):
240+ if metadata_ext == ".csv" :
242241 # Use `pd.read_csv` (although slower) instead of `pyarrow.csv.read_csv` for reading CSV files for consistency with the CSV packaged module
243242 return pa .Table .from_pandas (pd .read_csv (metadata_file ))
244243 else :
@@ -255,10 +254,10 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad
255254 metadata_dict = None
256255 downloaded_metadata_file = None
257256
257+ metadata_ext = ""
258258 if split_metadata_files :
259259 metadata_ext = {
260- os .path .splitext (downloaded_metadata_file )[1 ][1 :]
261- for _ , downloaded_metadata_file in split_metadata_files
260+ os .path .splitext (original_metadata_file )[- 1 ] for original_metadata_file , _ in split_metadata_files
262261 }
263262 metadata_ext = metadata_ext .pop ()
264263
@@ -290,7 +289,9 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad
290289 _ , metadata_file , downloaded_metadata_file = min (
291290 metadata_file_candidates , key = lambda x : count_path_segments (x [0 ])
292291 )
293- pa_metadata_table = self ._read_metadata (downloaded_metadata_file )
292+ pa_metadata_table = self ._read_metadata (
293+ downloaded_metadata_file , metadata_ext = metadata_ext
294+ )
294295 pa_file_name_array = pa_metadata_table ["file_name" ]
295296 pa_metadata_table = pa_metadata_table .drop (["file_name" ])
296297 metadata_dir = os .path .dirname (metadata_file )
@@ -302,7 +303,7 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad
302303 }
303304 else :
304305 raise ValueError (
305- f"One or several metadata. { metadata_ext } were found, but not in the same directory or in a parent directory of { downloaded_file_or_dir } ."
306+ f"One or several metadata{ metadata_ext } were found, but not in the same directory or in a parent directory of { downloaded_file_or_dir } ."
306307 )
307308 if metadata_dir is not None and downloaded_metadata_file is not None :
308309 file_relpath = os .path .relpath (original_file , metadata_dir )
@@ -314,7 +315,7 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad
314315 sample_metadata = metadata_dict [file_relpath ]
315316 else :
316317 raise ValueError (
317- f"One or several metadata. { metadata_ext } were found, but not in the same directory or in a parent directory of { downloaded_file_or_dir } ."
318+ f"One or several metadata{ metadata_ext } were found, but not in the same directory or in a parent directory of { downloaded_file_or_dir } ."
318319 )
319320 else :
320321 sample_metadata = {}
@@ -356,7 +357,9 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad
356357 _ , metadata_file , downloaded_metadata_file = min (
357358 metadata_file_candidates , key = lambda x : count_path_segments (x [0 ])
358359 )
359- pa_metadata_table = self ._read_metadata (downloaded_metadata_file )
360+ pa_metadata_table = self ._read_metadata (
361+ downloaded_metadata_file , metadata_ext = metadata_ext
362+ )
360363 pa_file_name_array = pa_metadata_table ["file_name" ]
361364 pa_metadata_table = pa_metadata_table .drop (["file_name" ])
362365 metadata_dir = os .path .dirname (downloaded_metadata_file )
@@ -368,7 +371,7 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad
368371 }
369372 else :
370373 raise ValueError (
371- f"One or several metadata. { metadata_ext } were found, but not in the same directory or in a parent directory of { downloaded_dir_file } ."
374+ f"One or several metadata{ metadata_ext } were found, but not in the same directory or in a parent directory of { downloaded_dir_file } ."
372375 )
373376 if metadata_dir is not None and downloaded_metadata_file is not None :
374377 downloaded_dir_file_relpath = os .path .relpath (downloaded_dir_file , metadata_dir )
@@ -380,7 +383,7 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, ad
380383 sample_metadata = metadata_dict [downloaded_dir_file_relpath ]
381384 else :
382385 raise ValueError (
383- f"One or several metadata. { metadata_ext } were found, but not in the same directory or in a parent directory of { downloaded_dir_file } ."
386+ f"One or several metadata{ metadata_ext } were found, but not in the same directory or in a parent directory of { downloaded_dir_file } ."
384387 )
385388 else :
386389 sample_metadata = {}
0 commit comments