Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/datasets/data_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,7 @@ def resolve_pattern(
allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
Returns:
List[Union[Path, Url]]: List of paths or URLs to the local or remote files that match the patterns.
List[str]: List of paths or URLs to the local or remote files that match the patterns.
"""
if is_relative_path(pattern):
pattern = xjoin(base_path, pattern)
Expand Down
24 changes: 16 additions & 8 deletions src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -848,10 +848,15 @@ def get_module(self) -> DatasetModule:
dataset_card_data = DatasetCard.load(readme_path).data if os.path.isfile(readme_path) else DatasetCardData()
metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
# even if metadata_configs_dict is not None (which means that we will resolve files for each config later)
# we cannot skip resolving all files because we need to infer module name by files extensions
# we need a set of data files to find which dataset builder to use
# because we need to infer module name by files extensions
base_path = Path(self.path, self.data_dir or "").expanduser().resolve().as_posix()
patterns = sanitize_patterns(self.data_files) if self.data_files is not None else get_data_patterns(base_path)
if self.data_files is not None:
patterns = sanitize_patterns(self.data_files)
if metadata_configs and "data_files" in next(iter(metadata_configs.values())):
patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what if various configs in the metadata_configs use various type of files? (edge case maybe)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not supported yet, and would require a refactor.

In a subsequent PR we can raise an error if it happens. Currently it raises an error if it fails to load data files if it uses the wrong dataset builder but this is confusing

else:
patterns = get_data_patterns(base_path)
data_files = DataFilesDict.from_patterns(
patterns,
base_path=base_path,
Expand Down Expand Up @@ -1027,11 +1032,14 @@ def get_module(self) -> DatasetModule:
dataset_card_data = DatasetCardData()
metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
patterns = (
sanitize_patterns(self.data_files)
if self.data_files is not None
else get_data_patterns(base_path, download_config=self.download_config)
)
# we need a set of data files to find which dataset builder to use
# because we need to infer module name by files extensions
if self.data_files is not None:
patterns = sanitize_patterns(self.data_files)
if metadata_configs and "data_files" in next(iter(metadata_configs.values())):
patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"])
else:
patterns = get_data_patterns(base_path, download_config=self.download_config)
data_files = DataFilesDict.from_patterns(
patterns,
base_path=base_path,
Expand Down