Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions model_cards/t5-11b-README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ inference: false

## Disclaimer

Due do it's immense size, `t5-11b` requires some special treatment.
First, `t5-11b` should be loaded with flag `use_cdn` set to `False` as follows:
**Before `transformers` v3.5.0**, due to its immense size, `t5-11b` required some special treatment.
If you're using transformers `<= v3.4.0`, `t5-11b` should be loaded with flag `use_cdn` set to `False` as follows:

```python
t5 = transformers.T5ForConditionalGeneration.from_pretrained('t5-11b', use_cdn = False)
Expand Down
4 changes: 0 additions & 4 deletions scripts/fsmt/convert-allenai-wmt16.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,3 @@ cd -
perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for ("wmt16-en-de-dist-12-1", "wmt16-en-de-dist-6-1", "wmt16-en-de-12-1")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
# add/remove files as needed

# Caching note: Unfortunately due to CDN caching the uploaded model may be unavailable for up to 24hs after upload
# So the only way to start using the new model sooner is either:
# 1. download it to a local path and use that path as model_name
# 2. make sure you use: from_pretrained(..., use_cdn=False) everywhere
Comment on lines -59 to -62
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also cc'ing @stas00 on this

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for the heads up, @julien-c - that's a wonderful news/change!

4 changes: 0 additions & 4 deletions scripts/fsmt/convert-allenai-wmt19.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,3 @@ cd -
perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for ("wmt19-de-en-6-6-base", "wmt19-de-en-6-6-big")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
# add/remove files as needed

# Caching note: Unfortunately due to CDN caching the uploaded model may be unavailable for up to 24hs after upload
# So the only way to start using the new model sooner is either:
# 1. download it to a local path and use that path as model_name
# 2. make sure you use: from_pretrained(..., use_cdn=False) everywhere
4 changes: 0 additions & 4 deletions scripts/fsmt/convert-facebook-wmt19.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,3 @@ cd -
perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for map { "wmt19-$_" } ("en-ru", "ru-en", "de-en", "en-de")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
# add/remove files as needed

# Caching note: Unfortunately due to CDN caching the uploaded model may be unavailable for up to 24hs after upload
# So the only way to start using the new model sooner is either:
# 1. download it to a local path and use that path as model_name
# 2. make sure you use: from_pretrained(..., use_cdn=False) everywhere
8 changes: 4 additions & 4 deletions src/transformers/configuration_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,14 +362,15 @@ def get_config_dict(cls, pretrained_model_name_or_path: str, **kwargs) -> Tuple[
resume_download = kwargs.pop("resume_download", False)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", False)
revision = kwargs.pop("revision", None)

if os.path.isdir(pretrained_model_name_or_path):
config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
config_file = pretrained_model_name_or_path
else:
config_file = hf_bucket_url(
pretrained_model_name_or_path, filename=CONFIG_NAME, use_cdn=False, mirror=None
pretrained_model_name_or_path, filename=CONFIG_NAME, revision=revision, mirror=None
)

try:
Expand All @@ -383,11 +384,10 @@ def get_config_dict(cls, pretrained_model_name_or_path: str, **kwargs) -> Tuple[
local_files_only=local_files_only,
)
# Load config dict
if resolved_config_file is None:
raise EnvironmentError
config_dict = cls._dict_from_json_file(resolved_config_file)

except EnvironmentError:
except EnvironmentError as err:
logger.error(err)
msg = (
f"Can't load config for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -248,10 +248,6 @@ def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder
print("\nLast step is to upload the files to s3")
print(f"cd {data_root}")
print(f"transformers-cli upload {model_dir}")
print(
"Note: CDN caches files for up to 24h, so either use a local model path "
"or use `from_pretrained(mname, use_cdn=False)` to use the non-cached version."
)


if __name__ == "__main__":
Expand Down
127 changes: 81 additions & 46 deletions src/transformers/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""

import fnmatch
import io
import json
import os
import re
Expand All @@ -17,7 +18,7 @@
from functools import partial, wraps
from hashlib import sha256
from pathlib import Path
from typing import Any, Dict, Optional, Tuple, Union
from typing import Any, BinaryIO, Dict, Optional, Tuple, Union
from urllib.parse import urlparse
from zipfile import ZipFile, is_zipfile

Expand Down Expand Up @@ -217,6 +218,8 @@

S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
CLOUDFRONT_DISTRIB_PREFIX = "https://cdn.huggingface.co"
# HUGGINGFACE_CO_PREFIX = "http://huggingface.test/{model_id}/resolve/{revision}/{filename}"
HUGGINGFACE_CO_PREFIX = "https://moon-preprod.huggingface.co/{model_id}/resolve/{revision}/{filename}"
PRESET_MIRROR_DICT = {
"tuna": "https://mirrors.tuna.tsinghua.edu.cn/hugging-face-models",
"bfsu": "https://mirrors.bfsu.edu.cn/hugging-face-models",
Expand Down Expand Up @@ -825,48 +828,48 @@ def is_remote_url(url_or_filename):
return parsed.scheme in ("http", "https")


def hf_bucket_url(model_id: str, filename: str, use_cdn=True, mirror=None) -> str:
def hf_bucket_url(model_id: str, filename: str, revision: Optional[str] = None, mirror=None) -> str:
"""
Resolve a model identifier, and a file name, to a HF-hosted url on either S3 or Cloudfront (a Content Delivery
Network, or CDN).
Resolve a model identifier, and a file name, to a huggingface.co-hosted url, potentially redirecting to Cloudfront
(a Content Delivery Network, or CDN).

Cloudfront is replicated over the globe so downloads are way faster for the end user (and it also lowers our
bandwidth costs). However, it is more aggressively cached by default, so may not always reflect the latest changes
to the underlying file (default TTL is 24 hours).

In terms of client-side caching from this library, even though Cloudfront relays the ETags from S3, using one or
the other (or switching from one to the other) will affect caching: cached files are not shared between the two
because the cached file's name contains a hash of the url.
This is not an issue here however, because since migrating to git-based model versioning on huggingface.co, we now
store the files on S3/Cloudfront in a content-addressable way (i.e., the file name is its hash).

TODO(update) In terms of client-side caching from this library, even though Cloudfront relays the ETags from S3,
using one or the other (or switching from one to the other) will affect caching: cached files are not shared
between the two because the cached file's name contains a hash of the url.
"""
endpoint = (
PRESET_MIRROR_DICT.get(mirror, mirror)
if mirror
else CLOUDFRONT_DISTRIB_PREFIX
if use_cdn
else S3_BUCKET_PREFIX
)
legacy_format = "/" not in model_id
if legacy_format:
return f"{endpoint}/{model_id}-{filename}"
else:
return f"{endpoint}/{model_id}/{filename}"
if mirror:
endpoint = PRESET_MIRROR_DICT.get(mirror, mirror)
legacy_format = "/" not in model_id
if legacy_format:
return f"{endpoint}/{model_id}-{filename}"
else:
return f"{endpoint}/{model_id}/{filename}"

if revision is None:
revision = "main"
return HUGGINGFACE_CO_PREFIX.format(model_id=model_id, revision=revision, filename=filename)

def url_to_filename(url, etag=None):

def url_to_filename(url: str, etag: Optional[str] = None) -> str:
"""
Convert `url` into a hashed filename in a repeatable way. If `etag` is specified, append its hash to the url's,
delimited by a period. If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name so that TF 2.0 can
identify it as a HDF5 file (see
https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
"""
url_bytes = url.encode("utf-8")
url_hash = sha256(url_bytes)
filename = url_hash.hexdigest()
filename = sha256(url_bytes).hexdigest()

if etag:
etag_bytes = etag.encode("utf-8")
etag_hash = sha256(etag_bytes)
filename += "." + etag_hash.hexdigest()
filename += "." + sha256(etag_bytes).hexdigest()

if url.endswith(".h5"):
filename += ".h5"
Expand Down Expand Up @@ -927,8 +930,10 @@ def cached_path(
re-extract the archive and override the folder where it was extracted.

Return:
None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). Local path (string)
otherwise
Local path (string) of the file or, if networking is off, of the last version of the file cached on disk.

Raises:
In case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
Expand Down Expand Up @@ -992,7 +997,10 @@ def cached_path(
return output_path


def http_get(url, temp_file, proxies=None, resume_size=0, user_agent: Union[Dict, str, None] = None):
def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
"""
Formats a user-agent string with basic info about a request.
"""
ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
if is_torch_available():
ua += "; torch/{}".format(torch.__version__)
Expand All @@ -1002,13 +1010,19 @@ def http_get(url, temp_file, proxies=None, resume_size=0, user_agent: Union[Dict
ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items())
elif isinstance(user_agent, str):
ua += "; " + user_agent
headers = {"user-agent": ua}
return ua


def http_get(url: str, temp_file: BinaryIO, proxies=None, resume_size=0, user_agent: Union[Dict, str, None] = None):
"""
Download remote file. Do not gobble up errors.
"""
headers = {"user-agent": http_user_agent(user_agent)}
if resume_size > 0:
headers["Range"] = "bytes=%d-" % (resume_size,)
response = requests.get(url, stream=True, proxies=proxies, headers=headers)
if response.status_code == 416: # Range not satisfiable
return
content_length = response.headers.get("Content-Length")
r = requests.get(url, stream=True, proxies=proxies, headers=headers)
r.raise_for_status()
content_length = r.headers.get("Content-Length")
total = resume_size + int(content_length) if content_length is not None else None
progress = tqdm(
unit="B",
Expand All @@ -1018,15 +1032,15 @@ def http_get(url, temp_file, proxies=None, resume_size=0, user_agent: Union[Dict
desc="Downloading",
disable=bool(logging.get_verbosity() == logging.NOTSET),
)
for chunk in response.iter_content(chunk_size=1024):
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()


def get_from_cache(
url,
url: str,
cache_dir=None,
force_download=False,
proxies=None,
Expand All @@ -1040,8 +1054,10 @@ def get_from_cache(
path to the cached file.

Return:
None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). Local path (string)
otherwise
Local path (string) of the file or, if networking is off, of the last version of the file cached on disk.

Raises:
In case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
Expand All @@ -1050,13 +1066,28 @@ def get_from_cache(

os.makedirs(cache_dir, exist_ok=True)

url_to_download = url
etag = None
if not local_files_only:
try:
response = requests.head(url, allow_redirects=True, proxies=proxies, timeout=etag_timeout)
if response.status_code == 200:
etag = response.headers.get("ETag")
except (EnvironmentError, requests.exceptions.Timeout):
headers = {"user-agent": http_user_agent(user_agent)}
r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=etag_timeout)
r.raise_for_status()
etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag")
# We favor a custom header indicating the etag of the linked resource, and
# we fallback to the regular etag header.
# If we don't have any of those, raise an error.
if etag is None:
raise OSError(
"Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
)
# In case of a redirect,
# save an extra redirect on the request.get call,
# and ensure we download the exact atomic version even if it changed
# between the HEAD and the GET (unlikely, but hey).
if 300 <= r.status_code <= 399:
url_to_download = r.headers["Location"]
except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
# etag is already None
pass

Expand All @@ -1065,7 +1096,7 @@ def get_from_cache(
# get cache path to put the file
cache_path = os.path.join(cache_dir, filename)

# etag is None = we don't have a connection, or url doesn't exist, or is otherwise inaccessible.
# etag is None == we don't have a connection or we passed local_files_only.
# try to get the last downloaded one
if etag is None:
if os.path.exists(cache_path):
Expand All @@ -1088,7 +1119,11 @@ def get_from_cache(
" disabled. To enable model look-ups and downloads online, set 'local_files_only'"
" to False."
)
return None
else:
raise ValueError(
"Connection error, and we cannot find the requested files in the cached path."
" Please try again or make sure your Internet connection is on."
)

# From now on, etag is not None.
if os.path.exists(cache_path) and not force_download:
Expand All @@ -1107,8 +1142,8 @@ def get_from_cache(
incomplete_path = cache_path + ".incomplete"

@contextmanager
def _resumable_file_manager():
with open(incomplete_path, "a+b") as f:
def _resumable_file_manager() -> "io.BufferedWriter":
with open(incomplete_path, "ab") as f:
yield f

temp_file_manager = _resumable_file_manager
Expand All @@ -1117,15 +1152,15 @@ def _resumable_file_manager():
else:
resume_size = 0
else:
temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False)
temp_file_manager = partial(tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False)
resume_size = 0

# Download to temporary file, then copy to cache dir once finished.
# Otherwise you get corrupt cache entries if the download gets interrupted.
with temp_file_manager() as temp_file:
logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)

http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent)
http_get(url_to_download, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent)

logger.info("storing %s in cache at %s", url, cache_path)
os.replace(temp_file.name, cache_path)
Expand Down
6 changes: 1 addition & 5 deletions src/transformers/modelcard.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,9 +144,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
model_card_file = pretrained_model_name_or_path
else:
model_card_file = hf_bucket_url(
pretrained_model_name_or_path, filename=MODEL_CARD_NAME, use_cdn=False, mirror=None
)
model_card_file = hf_bucket_url(pretrained_model_name_or_path, filename=MODEL_CARD_NAME, mirror=None)

if find_from_standard_name or pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
model_card_file = model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME)
Expand All @@ -156,8 +154,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
try:
# Load from URL or cache if already cached
resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, proxies=proxies)
if resolved_model_card_file is None:
raise EnvironmentError
if resolved_model_card_file == model_card_file:
logger.info("loading model card file {}".format(model_card_file))
else:
Expand Down
7 changes: 4 additions & 3 deletions src/transformers/modeling_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,9 +523,10 @@
Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to only look at local files (e.g., not try downloading the model).
use_cdn(:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to use Cloudfront (a Content Delivery Network, or CDN) when searching for the model on
our S3 (faster). Should be set to :obj:`False` for checkpoints larger than 20GB.
revision(:obj:`str`, `optional`, defaults to :obj:`main`):
The specific model version to use (can be a branch name, a tag name, or a commit id). We use a
git-based model for storing models and other artefacts on huggingface.co, so ``revision`` is any
identifier allowed by git. TODO(if agreed upon, duplicate this doc elsewhere.)
kwargs (additional keyword arguments, `optional`):
Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
:obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or
Expand Down
3 changes: 1 addition & 2 deletions src/transformers/modeling_flax_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
proxies = kwargs.pop("proxies", None)
# output_loading_info = kwargs.pop("output_loading_info", False)
local_files_only = kwargs.pop("local_files_only", False)
use_cdn = kwargs.pop("use_cdn", True)

# Load config if we don't provide a configuration
if not isinstance(config, PretrainedConfig):
Expand All @@ -131,7 +130,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
archive_file = pretrained_model_name_or_path
else:
archive_file = hf_bucket_url(pretrained_model_name_or_path, filename=WEIGHTS_NAME, use_cdn=use_cdn)
archive_file = hf_bucket_url(pretrained_model_name_or_path, filename=WEIGHTS_NAME)

# redirect to the cache, if necessary
try:
Expand Down
Loading