Skip to content

Commit aed1ef8

Browse files
Avoid calling http_head for non-HTTP URLs (#7062)
Avoid calling http_head for non-http URLs
1 parent 943594a commit aed1ef8

File tree

1 file changed

+43
-42
lines changed

1 file changed

+43
-42
lines changed

src/datasets/utils/file_utils.py

Lines changed: 43 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -572,49 +572,50 @@ def get_from_cache(
572572
# s3fs uses "ETag", gcsfs uses "etag"
573573
etag = (response.get("ETag", None) or response.get("etag", None)) if use_etag else None
574574
connected = True
575-
try:
576-
response = http_head(
577-
url,
578-
allow_redirects=True,
579-
proxies=proxies,
580-
timeout=etag_timeout,
581-
max_retries=max_retries,
582-
headers=headers,
583-
)
584-
if response.status_code == 200: # ok
585-
etag = response.headers.get("ETag") if use_etag else None
586-
for k, v in response.cookies.items():
587-
# In some edge cases, we need to get a confirmation token
588-
if k.startswith("download_warning") and "drive.google.com" in url:
589-
url += "&confirm=" + v
590-
cookies = response.cookies
591-
connected = True
592-
# Fix Google Drive URL to avoid Virus scan warning
593-
if "drive.google.com" in url and "confirm=" not in url:
594-
url += "&confirm=t"
595-
# In some edge cases, head request returns 400 but the connection is actually ok
596-
elif (
597-
(response.status_code == 400 and "firebasestorage.googleapis.com" in url)
598-
or (response.status_code == 405 and "drive.google.com" in url)
599-
or (
600-
response.status_code == 403
601-
and (
602-
re.match(r"^https?://github.com/.*?/.*?/releases/download/.*?/.*?$", url)
603-
or re.match(r"^https://.*?s3.*?amazonaws.com/.*?$", response.url)
604-
)
605-
)
606-
or (response.status_code == 403 and "ndownloader.figstatic.com" in url)
607-
):
608-
connected = True
609-
logger.info(f"Couldn't get ETag version for url {url}")
610-
elif response.status_code == 401 and config.HF_ENDPOINT in url and token is None:
611-
raise ConnectionError(
612-
f"Unauthorized for URL {url}. Please use the parameter `token=True` after logging in with `huggingface-cli login`"
575+
else:
576+
try:
577+
response = http_head(
578+
url,
579+
allow_redirects=True,
580+
proxies=proxies,
581+
timeout=etag_timeout,
582+
max_retries=max_retries,
583+
headers=headers,
613584
)
614-
except (OSError, requests.exceptions.Timeout) as e:
615-
# not connected
616-
head_error = e
617-
pass
585+
if response.status_code == 200: # ok
586+
etag = response.headers.get("ETag") if use_etag else None
587+
for k, v in response.cookies.items():
588+
# In some edge cases, we need to get a confirmation token
589+
if k.startswith("download_warning") and "drive.google.com" in url:
590+
url += "&confirm=" + v
591+
cookies = response.cookies
592+
connected = True
593+
# Fix Google Drive URL to avoid Virus scan warning
594+
if "drive.google.com" in url and "confirm=" not in url:
595+
url += "&confirm=t"
596+
# In some edge cases, head request returns 400 but the connection is actually ok
597+
elif (
598+
(response.status_code == 400 and "firebasestorage.googleapis.com" in url)
599+
or (response.status_code == 405 and "drive.google.com" in url)
600+
or (
601+
response.status_code == 403
602+
and (
603+
re.match(r"^https?://github.com/.*?/.*?/releases/download/.*?/.*?$", url)
604+
or re.match(r"^https://.*?s3.*?amazonaws.com/.*?$", response.url)
605+
)
606+
)
607+
or (response.status_code == 403 and "ndownloader.figstatic.com" in url)
608+
):
609+
connected = True
610+
logger.info(f"Couldn't get ETag version for url {url}")
611+
elif response.status_code == 401 and config.HF_ENDPOINT in url and token is None:
612+
raise ConnectionError(
613+
f"Unauthorized for URL {url}. Please use the parameter `token=True` after logging in with `huggingface-cli login`"
614+
)
615+
except (OSError, requests.exceptions.Timeout) as e:
616+
# not connected
617+
head_error = e
618+
pass
618619

619620
# connected == False = we don't have a connection, or url doesn't exist, or is otherwise inaccessible.
620621
# try to get the last downloaded one

0 commit comments

Comments (0)