Skip to content

Commit 9510252

Browse files
authored
Better error handling in dataset_module_factory (#6959)
* Better error handling in dataset_module_factory
* fix test
* Add custom message on GatedRepoError
1 parent 97513be commit 9510252

File tree

2 files changed

+23
-28
lines changed

2 files changed

+23
-28
lines changed

src/datasets/load.py

Lines changed: 21 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@
3737
import yaml
3838
from fsspec.core import url_to_fs
3939
from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFileSystem
40+
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError, RevisionNotFoundError
4041

4142
from . import config
4243
from .arrow_dataset import Dataset
@@ -1836,28 +1837,26 @@ def dataset_module_factory(
18361837
token=download_config.token,
18371838
timeout=100.0,
18381839
)
1839-
except Exception as e: # noqa catch any exception of hf_hub and consider that the dataset doesn't exist
1840-
if isinstance(
1841-
e,
1842-
(
1843-
OfflineModeIsEnabled,
1844-
requests.exceptions.ConnectTimeout,
1845-
requests.exceptions.ConnectionError,
1846-
),
1847-
):
1848-
raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({type(e).__name__})")
1849-
elif "404" in str(e):
1850-
msg = f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed"
1851-
raise DatasetNotFoundError(msg + f" at revision '{revision}'" if revision else msg)
1852-
elif "401" in str(e):
1853-
msg = f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed"
1854-
msg = msg + f" at revision '{revision}'" if revision else msg
1855-
raise DatasetNotFoundError(
1856-
msg
1857-
+ f". If the dataset is private or gated, make sure to log in with `huggingface-cli login` or visit the dataset page at https://huggingface.co/datasets/{path} to ask for access."
1858-
)
1859-
else:
1860-
raise e
1840+
except (
1841+
OfflineModeIsEnabled,
1842+
requests.exceptions.ConnectTimeout,
1843+
requests.exceptions.ConnectionError,
1844+
) as e:
1845+
raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e
1846+
except GatedRepoError as e:
1847+
message = f"Dataset '{path}' is a gated dataset on the Hub."
1848+
if "401 Client Error" in str(e):
1849+
message += " You must be authenticated to access it."
1850+
elif "403 Client Error" in str(e):
1851+
message += f" Visit the dataset page at https://huggingface.co/datasets/{path} to ask for access."
1852+
raise DatasetNotFoundError(message) from e
1853+
except RevisionNotFoundError as e:
1854+
raise DatasetNotFoundError(
1855+
f"Revision '{revision}' doesn't exist for dataset '{path}' on the Hub."
1856+
) from e
1857+
except RepositoryNotFoundError as e:
1858+
raise DatasetNotFoundError(f"Dataset '{path}' doesn't exist on the Hub or cannot be accessed.") from e
1859+
18611860
if filename in [sibling.rfilename for sibling in dataset_info.siblings]: # contains a dataset script
18621861
fs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=download_config.token)
18631862
if _require_custom_configs or (revision and revision != "main"):

tests/test_load.py

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1067,13 +1067,9 @@ def test_load_dataset_from_hub(self):
10671067
str(context.exception),
10681068
)
10691069
with self.assertRaises(DatasetNotFoundError) as context:
1070-
datasets.load_dataset("_dummy", revision="0.0.0")
1070+
datasets.load_dataset("HuggingFaceFW/fineweb-edu", revision="0.0.0")
10711071
self.assertIn(
1072-
"Dataset '_dummy' doesn't exist on the Hub",
1073-
str(context.exception),
1074-
)
1075-
self.assertIn(
1076-
"at revision '0.0.0'",
1072+
"Revision '0.0.0' doesn't exist for dataset 'HuggingFaceFW/fineweb-edu' on the Hub.",
10771073
str(context.exception),
10781074
)
10791075
for offline_simulation_mode in list(OfflineSimulationMode):

0 commit comments

Comments
 (0)