Load datasets without a namespace from the Hub instead of GitHub (remove GithubDatasetModuleFactory)

lhoestq · lhoestq · commit c900ef8e2b14 · 2022-09-13T16:59:50.000+02:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -9,7 +9,6 @@ on:
       - main
 
 env:
-  HF_SCRIPTS_VERSION: main
   HF_ALLOW_CODE_EVAL: 1
 
 jobs:
diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py
@@ -36,8 +36,6 @@
         "If you are running this in a Google Colab, you should probably just restart the runtime to use the right version of `pyarrow`."
     )
 
-SCRIPTS_VERSION = "main" if version.parse(__version__).is_devrelease else __version__
-
 del platform
 del pyarrow
 del version
diff --git a/src/datasets/load.py b/src/datasets/load.py
@@ -424,93 +424,6 @@ def get_module(self) -> MetricModule:
         raise NotImplementedError
 
 
-class GithubDatasetModuleFactory(_DatasetModuleFactory):
-    """
-    Get the module of a dataset from GitHub (legacy).
-    The dataset script is downloaded from GitHub.
-    This class will eventually be removed and a HubDatasetModuleFactory will be used instead.
-    """
-
-    def __init__(
-        self,
-        name: str,
-        revision: Optional[Union[str, Version]] = None,
-        download_config: Optional[DownloadConfig] = None,
-        download_mode: Optional[DownloadMode] = None,
-        dynamic_modules_path: Optional[str] = None,
-    ):
-        self.name = name
-        self.revision = revision
-        self.download_config = download_config.copy() if download_config else DownloadConfig()
-        if self.download_config.max_retries < 3:
-            self.download_config.max_retries = 3
-        self.download_mode = download_mode
-        self.dynamic_modules_path = dynamic_modules_path
-        assert self.name.count("/") == 0
-        increase_load_count(name, resource_type="dataset")
-
-    def download_loading_script(self, revision: Optional[str]) -> str:
-        file_path = hf_github_url(path=self.name, name=self.name + ".py", revision=revision)
-        download_config = self.download_config.copy()
-        if download_config.download_desc is None:
-            download_config.download_desc = "Downloading builder script"
-        return cached_path(file_path, download_config=download_config)
-
-    def download_dataset_infos_file(self, revision: Optional[str]) -> str:
-        dataset_infos = hf_github_url(path=self.name, name=config.DATASETDICT_INFOS_FILENAME, revision=revision)
-        # Download the dataset infos file if available
-        download_config = self.download_config.copy()
-        if download_config.download_desc is None:
-            download_config.download_desc = "Downloading metadata"
-        try:
-            return cached_path(
-                dataset_infos,
-                download_config=download_config,
-            )
-        except (FileNotFoundError, ConnectionError):
-            return None
-
-    def get_module(self) -> DatasetModule:
-        # get script and other files
-        revision = self.revision
-        try:
-            local_path = self.download_loading_script(revision)
-        except FileNotFoundError:
-            if revision is not None or os.getenv("HF_SCRIPTS_VERSION", None) is not None:
-                raise
-            else:
-                revision = "main"
-                local_path = self.download_loading_script(revision)
-                logger.warning(
-                    f"Couldn't find a directory or a dataset named '{self.name}' in this version. "
-                    f"It was picked from the main branch on github instead."
-                )
-        dataset_infos_path = self.download_dataset_infos_file(revision)
-        imports = get_imports(local_path)
-        local_imports = _download_additional_modules(
-            name=self.name,
-            base_path=hf_github_url(path=self.name, name="", revision=revision),
-            imports=imports,
-            download_config=self.download_config,
-        )
-        additional_files = [(config.DATASETDICT_INFOS_FILENAME, dataset_infos_path)] if dataset_infos_path else []
-        # copy the script and the files in an importable directory
-        dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules()
-        module_path, hash = _create_importable_file(
-            local_path=local_path,
-            local_imports=local_imports,
-            additional_files=additional_files,
-            dynamic_modules_path=dynamic_modules_path,
-            module_namespace="datasets",
-            name=self.name,
-            download_mode=self.download_mode,
-        )
-        # make the new module to be noticed by the import system
-        importlib.invalidate_caches()
-        builder_kwargs = {"hash": hash, "base_path": hf_hub_url(self.name, "", revision=self.revision)}
-        return DatasetModule(module_path, hash, builder_kwargs)
-
-
 class GithubMetricModuleFactory(_MetricModuleFactory):
     """Get the module of a metric. The metric script is downloaded from GitHub.
 
@@ -554,7 +467,7 @@ def get_module(self) -> MetricModule:
             local_path = self.download_loading_script(revision)
             revision = self.revision
         except FileNotFoundError:
-            if revision is not None or os.getenv("HF_SCRIPTS_VERSION", None) is not None:
+            if revision is not None:
                 raise
             else:
                 revision = "main"
@@ -917,11 +830,11 @@ def __init__(
         self.download_config = download_config or DownloadConfig()
         self.download_mode = download_mode
         self.dynamic_modules_path = dynamic_modules_path
-        assert self.name.count("/") == 1
+        assert self.name.count("/") <= 1
         increase_load_count(name, resource_type="dataset")
 
     def download_loading_script(self) -> str:
-        file_path = hf_hub_url(repo_id=self.name, path=self.name.split("/")[1] + ".py", revision=self.revision)
+        file_path = hf_hub_url(repo_id=self.name, path=self.name.split("/")[-1] + ".py", revision=self.revision)
         download_config = self.download_config.copy()
         if download_config.download_desc is None:
             download_config.download_desc = "Downloading builder script"
@@ -1197,67 +1110,57 @@ def dataset_module_factory(
     elif is_relative_path(path) and path.count("/") <= 1:
         try:
             _raise_if_offline_mode_is_enabled()
-            if path.count("/") == 0:  # even though the dataset is on the Hub, we get it from GitHub for now
-                # TODO(QL): use a Hub dataset module factory instead of GitHub
-                return GithubDatasetModuleFactory(
+            hf_api = HfApi(config.HF_ENDPOINT)
+            try:
+                if isinstance(download_config.use_auth_token, bool):
+                    token = HfFolder.get_token() if download_config.use_auth_token else None
+                else:
+                    token = download_config.use_auth_token
+                dataset_info = hf_api.dataset_info(
+                    repo_id=path,
+                    revision=revision,
+                    token=token if token else "no-token",
+                    timeout=100.0,
+                )
+            except Exception as e:  # noqa: catch any exception of hf_hub and consider that the dataset doesn't exist
+                if isinstance(
+                    e,
+                    (
+                        OfflineModeIsEnabled,
+                        requests.exceptions.ConnectTimeout,
+                        requests.exceptions.ConnectionError,
+                    ),
+                ):
+                    raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({type(e).__name__})")
+                elif "404" in str(e):
+                    msg = f"Dataset '{path}' doesn't exist on the Hub"
+                    raise FileNotFoundError(msg + f" at revision '{revision}'" if revision else msg)
+                elif "401" in str(e):
+                    msg = f"Dataset '{path}' doesn't exist on the Hub"
+                    msg = msg + f" at revision '{revision}'" if revision else msg
+                    raise FileNotFoundError(
+                        msg
+                        + ". If the repo is private, make sure you are authenticated with `use_auth_token=True` after logging in with `huggingface-cli login`."
+                    )
+                else:
+                    raise e
+            if filename in [sibling.rfilename for sibling in dataset_info.siblings]:
+                return HubDatasetModuleFactoryWithScript(
                     path,
                     revision=revision,
                     download_config=download_config,
                     download_mode=download_mode,
                     dynamic_modules_path=dynamic_modules_path,
                 ).get_module()
-            elif path.count("/") == 1:  # community dataset on the Hub
-                hf_api = HfApi(config.HF_ENDPOINT)
-                try:
-                    if isinstance(download_config.use_auth_token, bool):
-                        token = HfFolder.get_token() if download_config.use_auth_token else None
-                    else:
-                        token = download_config.use_auth_token
-                    dataset_info = hf_api.dataset_info(
-                        repo_id=path,
-                        revision=revision,
-                        token=token if token else "no-token",
-                        timeout=100.0,
-                    )
-                except Exception as e:  # noqa: catch any exception of hf_hub and consider that the dataset doesn't exist
-                    if isinstance(
-                        e,
-                        (
-                            OfflineModeIsEnabled,
-                            requests.exceptions.ConnectTimeout,
-                            requests.exceptions.ConnectionError,
-                        ),
-                    ):
-                        raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({type(e).__name__})")
-                    elif "404" in str(e):
-                        msg = f"Dataset '{path}' doesn't exist on the Hub"
-                        raise FileNotFoundError(msg + f" at revision '{revision}'" if revision else msg)
-                    elif "401" in str(e):
-                        msg = f"Dataset '{path}' doesn't exist on the Hub"
-                        msg = msg + f" at revision '{revision}'" if revision else msg
-                        raise FileNotFoundError(
-                            msg
-                            + ". If the repo is private, make sure you are authenticated with `use_auth_token=True` after logging in with `huggingface-cli login`."
-                        )
-                    else:
-                        raise e
-                if filename in [sibling.rfilename for sibling in dataset_info.siblings]:
-                    return HubDatasetModuleFactoryWithScript(
-                        path,
-                        revision=revision,
-                        download_config=download_config,
-                        download_mode=download_mode,
-                        dynamic_modules_path=dynamic_modules_path,
-                    ).get_module()
-                else:
-                    return HubDatasetModuleFactoryWithoutScript(
-                        path,
-                        revision=revision,
-                        data_dir=data_dir,
-                        data_files=data_files,
-                        download_config=download_config,
-                        download_mode=download_mode,
-                    ).get_module()
+            else:
+                return HubDatasetModuleFactoryWithoutScript(
+                    path,
+                    revision=revision,
+                    data_dir=data_dir,
+                    data_files=data_files,
+                    download_config=download_config,
+                    download_mode=download_mode,
+                ).get_module()
         except Exception as e1:  # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
             try:
                 return CachedDatasetModuleFactory(path, dynamic_modules_path=dynamic_modules_path).get_module()
@@ -1624,7 +1527,6 @@ def load_dataset(
             Dataset scripts are small python scripts that define dataset builders. They define the citation, info and format of the dataset,
             contain the path or URL to the original data files and the code to load examples from the original data files.
 
-            You can find some of the scripts here: https://github.com/huggingface/datasets/tree/main/datasets
             You can find the complete list of datasets in the Datasets Hub at https://huggingface.co/datasets
 
         2. Run the dataset script which will:
diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py
@@ -23,6 +23,7 @@
 from urllib.parse import urljoin, urlparse
 
 import requests
+from packaging import version
 
 from .. import __version__, config
 from ..download.download_config import DownloadConfig
@@ -97,9 +98,9 @@ def head_hf_s3(
 
 
 def hf_github_url(path: str, name: str, dataset=True, revision: Optional[str] = None) -> str:
-    from .. import SCRIPTS_VERSION
 
-    revision = revision or os.getenv("HF_SCRIPTS_VERSION", SCRIPTS_VERSION)
+    default_revision = "main" if version.parse(__version__).is_devrelease else __version__
+    revision = revision or default_revision
     if dataset:
         return config.REPO_DATASETS_URL.format(revision=revision, path=path, name=name)
     else:
diff --git a/tests/test_load.py b/tests/test_load.py
@@ -13,7 +13,7 @@
 import requests
 
 import datasets
-from datasets import SCRIPTS_VERSION, config, load_dataset, load_from_disk
+from datasets import config, load_dataset, load_from_disk
 from datasets.arrow_dataset import Dataset
 from datasets.builder import DatasetBuilder
 from datasets.data_files import DataFilesDict
@@ -24,7 +24,6 @@
 from datasets.load import (
     CachedDatasetModuleFactory,
     CachedMetricModuleFactory,
-    GithubDatasetModuleFactory,
     GithubMetricModuleFactory,
     HubDatasetModuleFactoryWithoutScript,
     HubDatasetModuleFactoryWithScript,
@@ -255,15 +254,6 @@ def setUp(self):
             hf_modules_cache=self.hf_modules_cache,
         )
 
-    def test_GithubDatasetModuleFactory(self):
-        # "wmt_t2t" has additional imports (internal)
-        factory = GithubDatasetModuleFactory(
-            "wmt_t2t", download_config=self.download_config, dynamic_modules_path=self.dynamic_modules_path
-        )
-        module_factory_result = factory.get_module()
-        assert importlib.import_module(module_factory_result.module_path) is not None
-        assert module_factory_result.builder_kwargs["base_path"].startswith(config.HF_ENDPOINT)
-
     def test_GithubMetricModuleFactory_with_internal_import(self):
         # "squad_v2" requires additional imports (internal)
         factory = GithubMetricModuleFactory(
@@ -479,7 +469,6 @@ def test_CachedMetricModuleFactory(self):
     [
         CachedDatasetModuleFactory,
         CachedMetricModuleFactory,
-        GithubDatasetModuleFactory,
         GithubMetricModuleFactory,
         HubDatasetModuleFactoryWithoutScript,
         HubDatasetModuleFactoryWithScript,
@@ -490,10 +479,7 @@ def test_CachedMetricModuleFactory(self):
     ],
 )
 def test_module_factories(factory_class):
-    if issubclass(factory_class, (HubDatasetModuleFactoryWithoutScript, HubDatasetModuleFactoryWithScript)):
-        name = "dummy_org/dummy_name"
-    else:
-        name = "dummy_name"
+    name = "dummy_name"
     factory = factory_class(name)
     assert factory.name == name
 
@@ -576,18 +562,21 @@ def test_offline_dataset_module_factory(self):
                 self.assertNotEqual(dataset_module_1.module_path, dataset_module_3.module_path)
                 self.assertIn("Using the latest cached version of the module", self._caplog.text)
 
-    def test_load_dataset_from_github(self):
-        scripts_version = os.getenv("HF_SCRIPTS_VERSION", SCRIPTS_VERSION)
+    def test_load_dataset_from_hub(self):
         with self.assertRaises(FileNotFoundError) as context:
             datasets.load_dataset("_dummy")
         self.assertIn(
-            "https://raw.githubusercontent.com/huggingface/datasets/main/datasets/_dummy/_dummy.py",
+            "Dataset '_dummy' doesn't exist on the Hub",
             str(context.exception),
         )
         with self.assertRaises(FileNotFoundError) as context:
             datasets.load_dataset("_dummy", revision="0.0.0")
         self.assertIn(
-            "https://raw.githubusercontent.com/huggingface/datasets/0.0.0/datasets/_dummy/_dummy.py",
+            "Dataset '_dummy' doesn't exist on the Hub",
+            str(context.exception),
+        )
+        self.assertIn(
+            "at revision '0.0.0'",
             str(context.exception),
         )
         for offline_simulation_mode in list(OfflineSimulationMode):
@@ -596,7 +585,7 @@ def test_load_dataset_from_github(self):
                     datasets.load_dataset("_dummy")
                 if offline_simulation_mode != OfflineSimulationMode.HF_DATASETS_OFFLINE_SET_TO_1:
                     self.assertIn(
-                        f"https://raw.githubusercontent.com/huggingface/datasets/{scripts_version}/datasets/_dummy/_dummy.py",
+                        "Couldn't reach '_dummy' on the Hub",
                         str(context.exception),
                     )
 

Original file line number	Diff line number	Diff line change
`@@ -36,8 +36,6 @@`
`36`	`36`	"If you are running this in a Google Colab, you should probably just restart the runtime to use the right version of `pyarrow`."
`37`	`37`	`)`
`38`	`38`
`39`		`-SCRIPTS_VERSION = "main" if version.parse(__version__).is_devrelease else __version__`
`40`		`-`
`41`	`39`	`del platform`
`42`	`40`	`del pyarrow`
`43`	`41`	`del version`