huggingface · julien-c · Nov 10, 2020 · Nov 2, 2020 · Nov 2, 2020 · Nov 5, 2020
diff --git a/model_cards/t5-11b-README.md b/model_cards/t5-11b-README.md
@@ -12,8 +12,8 @@ inference: false
 
 ## Disclaimer
 
-Due do it's immense size, `t5-11b` requires some special treatment. 
-First, `t5-11b` should be loaded with flag `use_cdn` set to `False` as follows:
+**Before `transformers` v3.5.0**, due to its immense size, `t5-11b` required some special treatment. 
+If you're using transformers `<= v3.4.0`, `t5-11b` should be loaded with flag `use_cdn` set to `False` as follows:
 
 ```python
 t5 = transformers.T5ForConditionalGeneration.from_pretrained('t5-11b', use_cdn = False)

diff --git a/scripts/fsmt/convert-allenai-wmt16.sh b/scripts/fsmt/convert-allenai-wmt16.sh
@@ -56,7 +56,3 @@ cd -
 perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for ("wmt16-en-de-dist-12-1", "wmt16-en-de-dist-6-1", "wmt16-en-de-12-1")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
 # add/remove files as needed
 
-# Caching note: Unfortunately due to CDN caching the uploaded model may be unavailable for up to 24hs after upload
-# So the only way to start using the new model sooner is either:
-# 1. download it to a local path and use that path as model_name
-# 2. make sure you use: from_pretrained(..., use_cdn=False) everywhere
diff --git a/scripts/fsmt/convert-allenai-wmt19.sh b/scripts/fsmt/convert-allenai-wmt19.sh
@@ -44,7 +44,3 @@ cd -
 perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for ("wmt19-de-en-6-6-base", "wmt19-de-en-6-6-big")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
 # add/remove files as needed
 
-# Caching note: Unfortunately due to CDN caching the uploaded model may be unavailable for up to 24hs after upload
-# So the only way to start using the new model sooner is either:
-# 1. download it to a local path and use that path as model_name
-# 2. make sure you use: from_pretrained(..., use_cdn=False) everywhere
diff --git a/scripts/fsmt/convert-facebook-wmt19.sh b/scripts/fsmt/convert-facebook-wmt19.sh
@@ -55,7 +55,3 @@ cd -
 perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for map { "wmt19-$_" } ("en-ru", "ru-en", "de-en", "en-de")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
 # add/remove files as needed
 
-# Caching note: Unfortunately due to CDN caching the uploaded model may be unavailable for up to 24hs after upload
-# So the only way to start using the new model sooner is either:
-# 1. download it to a local path and use that path as model_name
-# 2. make sure you use: from_pretrained(..., use_cdn=False) everywhere
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
@@ -362,14 +362,15 @@ def get_config_dict(cls, pretrained_model_name_or_path: str, **kwargs) -> Tuple[
         resume_download = kwargs.pop("resume_download", False)
         proxies = kwargs.pop("proxies", None)
         local_files_only = kwargs.pop("local_files_only", False)
+        revision = kwargs.pop("revision", None)
 
         if os.path.isdir(pretrained_model_name_or_path):
             config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
         elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
             config_file = pretrained_model_name_or_path
         else:
             config_file = hf_bucket_url(
-                pretrained_model_name_or_path, filename=CONFIG_NAME, use_cdn=False, mirror=None
+                pretrained_model_name_or_path, filename=CONFIG_NAME, revision=revision, mirror=None
             )
 
         try:
@@ -383,11 +384,10 @@ def get_config_dict(cls, pretrained_model_name_or_path: str, **kwargs) -> Tuple[
                 local_files_only=local_files_only,
             )
             # Load config dict
-            if resolved_config_file is None:
-                raise EnvironmentError
             config_dict = cls._dict_from_json_file(resolved_config_file)
 
-        except EnvironmentError:
+        except EnvironmentError as err:
+            logger.error(err)
             msg = (
                 f"Can't load config for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
                 f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"

diff --git a/src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py
@@ -248,10 +248,6 @@ def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder
     print("\nLast step is to upload the files to s3")
     print(f"cd {data_root}")
     print(f"transformers-cli upload {model_dir}")
-    print(
-        "Note: CDN caches files for up to 24h, so either use a local model path "
-        "or use `from_pretrained(mname, use_cdn=False)` to use the non-cached version."
-    )
 
 
 if __name__ == "__main__":

diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
@@ -4,6 +4,7 @@
 """
 
 import fnmatch
+import io
 import json
 import os
 import re
@@ -17,7 +18,7 @@
 from functools import partial, wraps
 from hashlib import sha256
 from pathlib import Path
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Any, BinaryIO, Dict, Optional, Tuple, Union
 from urllib.parse import urlparse
 from zipfile import ZipFile, is_zipfile
 
@@ -217,6 +218,8 @@
 
 S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
 CLOUDFRONT_DISTRIB_PREFIX = "https://cdn.huggingface.co"
+# HUGGINGFACE_CO_PREFIX = "http://huggingface.test/{model_id}/resolve/{revision}/{filename}"
+HUGGINGFACE_CO_PREFIX = "https://moon-preprod.huggingface.co/{model_id}/resolve/{revision}/{filename}"
 PRESET_MIRROR_DICT = {
     "tuna": "https://mirrors.tuna.tsinghua.edu.cn/hugging-face-models",
     "bfsu": "https://mirrors.bfsu.edu.cn/hugging-face-models",
@@ -825,48 +828,48 @@ def is_remote_url(url_or_filename):
     return parsed.scheme in ("http", "https")
 
 
-def hf_bucket_url(model_id: str, filename: str, use_cdn=True, mirror=None) -> str:
+def hf_bucket_url(model_id: str, filename: str, revision: Optional[str] = None, mirror=None) -> str:
     """
-    Resolve a model identifier, and a file name, to a HF-hosted url on either S3 or Cloudfront (a Content Delivery
-    Network, or CDN).
+    Resolve a model identifier, and a file name, to a huggingface.co-hosted url, potentially redirecting to Cloudfront
+    (a Content Delivery Network, or CDN).
 
     Cloudfront is replicated over the globe so downloads are way faster for the end user (and it also lowers our
     bandwidth costs). However, it is more aggressively cached by default, so may not always reflect the latest changes
     to the underlying file (default TTL is 24 hours).
 
-    In terms of client-side caching from this library, even though Cloudfront relays the ETags from S3, using one or
-    the other (or switching from one to the other) will affect caching: cached files are not shared between the two
-    because the cached file's name contains a hash of the url.
+    This is not an issue here however, because since migrating to git-based model versioning on huggingface.co, we now
+    store the files on S3/Cloudfront in a content-addressable way (i.e., the file name is its hash).
+
+    TODO(update) In terms of client-side caching from this library, even though Cloudfront relays the ETags from S3,
+    using one or the other (or switching from one to the other) will affect caching: cached files are not shared
+    between the two because the cached file's name contains a hash of the url.
     """
-    endpoint = (
-        PRESET_MIRROR_DICT.get(mirror, mirror)
-        if mirror
-        else CLOUDFRONT_DISTRIB_PREFIX
-        if use_cdn
-        else S3_BUCKET_PREFIX
-    )
-    legacy_format = "/" not in model_id
-    if legacy_format:
-        return f"{endpoint}/{model_id}-{filename}"
-    else:
-        return f"{endpoint}/{model_id}/{filename}"
+    if mirror:
+        endpoint = PRESET_MIRROR_DICT.get(mirror, mirror)
+        legacy_format = "/" not in model_id
+        if legacy_format:
+            return f"{endpoint}/{model_id}-{filename}"
+        else:
+            return f"{endpoint}/{model_id}/{filename}"
 
+    if revision is None:
+        revision = "main"
+    return HUGGINGFACE_CO_PREFIX.format(model_id=model_id, revision=revision, filename=filename)
 
-def url_to_filename(url, etag=None):
+
+def url_to_filename(url: str, etag: Optional[str] = None) -> str:
     """
     Convert `url` into a hashed filename in a repeatable way. If `etag` is specified, append its hash to the url's,
     delimited by a period. If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name so that TF 2.0 can
     identify it as a HDF5 file (see
     https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
     """
     url_bytes = url.encode("utf-8")
-    url_hash = sha256(url_bytes)
-    filename = url_hash.hexdigest()
+    filename = sha256(url_bytes).hexdigest()
 
     if etag:
         etag_bytes = etag.encode("utf-8")
-        etag_hash = sha256(etag_bytes)
-        filename += "." + etag_hash.hexdigest()
+        filename += "." + sha256(etag_bytes).hexdigest()
 
     if url.endswith(".h5"):
         filename += ".h5"
@@ -927,8 +930,10 @@ def cached_path(
             re-extract the archive and override the folder where it was extracted.
 
     Return:
-        None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). Local path (string)
-        otherwise
+        Local path (string) of file or if networking is off, last version of file cached on disk.
+
+    Raises:
+        In case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
     """
     if cache_dir is None:
         cache_dir = TRANSFORMERS_CACHE
@@ -992,7 +997,10 @@ def cached_path(
     return output_path
 
 
-def http_get(url, temp_file, proxies=None, resume_size=0, user_agent: Union[Dict, str, None] = None):
+def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
+    """
+    Formats a user-agent string with basic info about a request.
+    """
     ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
     if is_torch_available():
         ua += "; torch/{}".format(torch.__version__)
@@ -1002,13 +1010,19 @@ def http_get(url, temp_file, proxies=None, resume_size=0, user_agent: Union[Dict
         ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items())
     elif isinstance(user_agent, str):
         ua += "; " + user_agent
-    headers = {"user-agent": ua}
+    return ua
+
+
+def http_get(url: str, temp_file: BinaryIO, proxies=None, resume_size=0, user_agent: Union[Dict, str, None] = None):
+    """
+    Download remote file. Do not gobble up errors.
+    """
+    headers = {"user-agent": http_user_agent(user_agent)}
     if resume_size > 0:
         headers["Range"] = "bytes=%d-" % (resume_size,)
-    response = requests.get(url, stream=True, proxies=proxies, headers=headers)
-    if response.status_code == 416:  # Range not satisfiable
-        return
-    content_length = response.headers.get("Content-Length")
+    r = requests.get(url, stream=True, proxies=proxies, headers=headers)
+    r.raise_for_status()
+    content_length = r.headers.get("Content-Length")
     total = resume_size + int(content_length) if content_length is not None else None
     progress = tqdm(
         unit="B",
@@ -1018,15 +1032,15 @@ def http_get(url, temp_file, proxies=None, resume_size=0, user_agent: Union[Dict
         desc="Downloading",
         disable=bool(logging.get_verbosity() == logging.NOTSET),
     )
-    for chunk in response.iter_content(chunk_size=1024):
+    for chunk in r.iter_content(chunk_size=1024):
         if chunk:  # filter out keep-alive new chunks
             progress.update(len(chunk))
             temp_file.write(chunk)
     progress.close()
 
 
 def get_from_cache(
-    url,
+    url: str,
     cache_dir=None,
     force_download=False,
     proxies=None,
@@ -1040,8 +1054,10 @@ def get_from_cache(
     path to the cached file.
 
     Return:
-        None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). Local path (string)
-        otherwise
+        Local path (string) of file or if networking is off, last version of file cached on disk.
+
+    Raises:
+        In case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
     """
     if cache_dir is None:
         cache_dir = TRANSFORMERS_CACHE
@@ -1050,13 +1066,28 @@ def get_from_cache(
 
     os.makedirs(cache_dir, exist_ok=True)
 
+    url_to_download = url
     etag = None
     if not local_files_only:
         try:
-            response = requests.head(url, allow_redirects=True, proxies=proxies, timeout=etag_timeout)
-            if response.status_code == 200:
-                etag = response.headers.get("ETag")
-        except (EnvironmentError, requests.exceptions.Timeout):
+            headers = {"user-agent": http_user_agent(user_agent)}
+            r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=etag_timeout)
+            r.raise_for_status()
+            etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag")
+            # We favor a custom header indicating the etag of the linked resource, and
+            # we fall back to the regular etag header.
+            # If we don't have any of those, raise an error.
+            if etag is None:
+                raise OSError(
+                    "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
+                )
+            # In case of a redirect,
+            # save an extra redirect on the request.get call,
+            # and ensure we download the exact atomic version even if it changed
+            # between the HEAD and the GET (unlikely, but hey).
+            if 300 <= r.status_code <= 399:
+                url_to_download = r.headers["Location"]
+        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
             # etag is already None
             pass
 
@@ -1065,7 +1096,7 @@ def get_from_cache(
     # get cache path to put the file
     cache_path = os.path.join(cache_dir, filename)
 
-    # etag is None = we don't have a connection, or url doesn't exist, or is otherwise inaccessible.
+    # etag is None == we don't have a connection or we passed local_files_only.
     # try to get the last downloaded one
     if etag is None:
         if os.path.exists(cache_path):
@@ -1088,7 +1119,11 @@ def get_from_cache(
                         " disabled. To enable model look-ups and downloads online, set 'local_files_only'"
                         " to False."
                     )
-                return None
+                else:
+                    raise ValueError(
+                        "Connection error, and we cannot find the requested files in the cached path."
+                        " Please try again or make sure your Internet connection is on."
+                    )
 
     # From now on, etag is not None.
     if os.path.exists(cache_path) and not force_download:
@@ -1107,8 +1142,8 @@ def get_from_cache(
             incomplete_path = cache_path + ".incomplete"
 
             @contextmanager
-            def _resumable_file_manager():
-                with open(incomplete_path, "a+b") as f:
+            def _resumable_file_manager() -> "io.BufferedWriter":
+                with open(incomplete_path, "ab") as f:
                     yield f
 
             temp_file_manager = _resumable_file_manager
@@ -1117,15 +1152,15 @@ def _resumable_file_manager():
             else:
                 resume_size = 0
         else:
-            temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False)
+            temp_file_manager = partial(tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False)
             resume_size = 0
 
         # Download to temporary file, then copy to cache dir once finished.
         # Otherwise you get corrupt cache entries if the download gets interrupted.
         with temp_file_manager() as temp_file:
             logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)
 
-            http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent)
+            http_get(url_to_download, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent)
 
         logger.info("storing %s in cache at %s", url, cache_path)
         os.replace(temp_file.name, cache_path)

diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py
@@ -144,9 +144,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
             model_card_file = pretrained_model_name_or_path
         else:
-            model_card_file = hf_bucket_url(
-                pretrained_model_name_or_path, filename=MODEL_CARD_NAME, use_cdn=False, mirror=None
-            )
+            model_card_file = hf_bucket_url(pretrained_model_name_or_path, filename=MODEL_CARD_NAME, mirror=None)
 
         if find_from_standard_name or pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
             model_card_file = model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME)
@@ -156,8 +154,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         try:
             # Load from URL or cache if already cached
             resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, proxies=proxies)
-            if resolved_model_card_file is None:
-                raise EnvironmentError
             if resolved_model_card_file == model_card_file:
                 logger.info("loading model card file {}".format(model_card_file))
             else:

diff --git a/src/transformers/modeling_auto.py b/src/transformers/modeling_auto.py
@@ -523,9 +523,10 @@
                 Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
             local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
                 Whether or not to only look at local files (e.g., not try downloading the model).
-            use_cdn(:obj:`bool`, `optional`, defaults to :obj:`True`):
-                Whether or not to use Cloudfront (a Content Delivery Network, or CDN) when searching for the model on
-                our S3 (faster). Should be set to :obj:`False` for checkpoints larger than 20GB.
+            revision(:obj:`str`, `optional`, defaults to :obj:`main`):
+                Whether to pin to a specific model version (can be a branch name, a tag name, or a commit id). We use a
+                git-based model for storing models and other artefacts on huggingface.co, so ``revision`` is any
+                identifier allowed by git. TODO(if agreed upon, duplicate this doc elsewhere.)
             kwargs (additional keyword arguments, `optional`):
                 Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
                 :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or

diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py
@@ -107,7 +107,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         proxies = kwargs.pop("proxies", None)
         # output_loading_info = kwargs.pop("output_loading_info", False)
         local_files_only = kwargs.pop("local_files_only", False)
-        use_cdn = kwargs.pop("use_cdn", True)
 
         # Load config if we don't provide a configuration
         if not isinstance(config, PretrainedConfig):
@@ -131,7 +130,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
             if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                 archive_file = pretrained_model_name_or_path
             else:
-                archive_file = hf_bucket_url(pretrained_model_name_or_path, filename=WEIGHTS_NAME, use_cdn=use_cdn)
+                archive_file = hf_bucket_url(pretrained_model_name_or_path, filename=WEIGHTS_NAME)
 
             # redirect to the cache, if necessary
             try: