Skip to content

Commit f28ecfd

Browse files
make FetchResolveCache
- pipe in headers arg - provide full context in Link.comes_from - pull in etag and date and cache the outputs - handle --no-cache-dir - add NEWS - remove quotes from etag and use binary checksum to save a few bytes - parse http modified date to compress the cached representation
1 parent b02915a commit f28ecfd

File tree

7 files changed

+316
-36
lines changed

7 files changed

+316
-36
lines changed

news/12257.feature.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Store HTTP caching headers in ``~/.cache/pip/fetch-resolve`` to reduce bandwidth usage when ``--use-feature=metadata-cache`` is enabled.

src/pip/_internal/cache.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,9 @@ def __init__(self, cache_dir: str) -> None:
9292
assert not cache_dir or os.path.isabs(cache_dir)
9393
self.cache_dir = cache_dir or None
9494

95-
def _get_cache_path_parts(self, link: Link) -> List[str]:
95+
def _get_cache_path_parts(
96+
self, link: Link, *, interpreter_dependent: bool
97+
) -> List[str]:
9698
"""Get parts of path that must be os.path.joined with cache_dir"""
9799

98100
# We want to generate an url to use as our cache key, we don't want to
@@ -104,13 +106,14 @@ def _get_cache_path_parts(self, link: Link) -> List[str]:
104106
if link.subdirectory_fragment:
105107
key_parts["subdirectory"] = link.subdirectory_fragment
106108

107-
# Include interpreter name, major and minor version in cache key
108-
# to cope with ill-behaved sdists that build a different wheel
109-
# depending on the python version their setup.py is being run on,
110-
# and don't encode the difference in compatibility tags.
111-
# https://github.com/pypa/pip/issues/7296
112-
key_parts["interpreter_name"] = interpreter_name()
113-
key_parts["interpreter_version"] = interpreter_version()
109+
if interpreter_dependent:
110+
# Include interpreter name, major and minor version in cache key
111+
# to cope with ill-behaved sdists that build a different wheel
112+
# depending on the python version their setup.py is being run on,
113+
# and don't encode the difference in compatibility tags.
114+
# https://github.com/pypa/pip/issues/7296
115+
key_parts["interpreter_name"] = interpreter_name()
116+
key_parts["interpreter_version"] = interpreter_version()
114117

115118
# Encode our key url with sha224, we'll use this because it has similar
116119
# security properties to sha256, but with a shorter total output (and
@@ -138,11 +141,20 @@ class LinkMetadataCache(Cache):
138141
"""Persistently store the metadata of dists found at each link."""
139142

140143
def get_path_for_link(self, link: Link) -> str:
141-
parts = self._get_cache_path_parts(link)
144+
parts = self._get_cache_path_parts(link, interpreter_dependent=True)
142145
assert self.cache_dir
143146
return os.path.join(self.cache_dir, "link-metadata", *parts)
144147

145148

149+
class FetchResolveCache(Cache):
150+
def get_path_for_link(self, link: Link) -> str:
151+
# We are reading index links to extract other links from, not executing any
152+
# python code, so these caches are interpreter-independent.
153+
parts = self._get_cache_path_parts(link, interpreter_dependent=False)
154+
assert self.cache_dir
155+
return os.path.join(self.cache_dir, "fetch-resolve", *parts)
156+
157+
146158
class WheelCacheBase(Cache):
147159
"""Specializations to the cache concept for wheels."""
148160

@@ -197,7 +209,7 @@ def get_path_for_link(self, link: Link) -> str:
197209
198210
:param link: The link of the sdist for which this will cache wheels.
199211
"""
200-
parts = self._get_cache_path_parts(link)
212+
parts = self._get_cache_path_parts(link, interpreter_dependent=True)
201213
assert self.cache_dir
202214
# Store wheels within the root cache_dir
203215
return os.path.join(self.cache_dir, "wheels", *parts)

src/pip/_internal/cli/req_command.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from optparse import Values
1111
from typing import Any, List, Optional, Tuple
1212

13-
from pip._internal.cache import LinkMetadataCache, WheelCache
13+
from pip._internal.cache import FetchResolveCache, LinkMetadataCache, WheelCache
1414
from pip._internal.cli import cmdoptions
1515
from pip._internal.cli.index_command import IndexGroupCommand
1616
from pip._internal.cli.index_command import SessionCommandMixin as SessionCommandMixin
@@ -333,8 +333,13 @@ def _build_package_finder(
333333
ignore_requires_python=ignore_requires_python,
334334
)
335335

336+
if bool(options.cache_dir) and ("metadata-cache" in options.features_enabled):
337+
fetch_resolve_cache = FetchResolveCache(options.cache_dir)
338+
else:
339+
fetch_resolve_cache = None
336340
return PackageFinder.create(
337341
link_collector=link_collector,
338342
selection_prefs=selection_prefs,
339343
target_python=target_python,
344+
fetch_resolve_cache=fetch_resolve_cache,
340345
)

src/pip/_internal/index/collector.py

Lines changed: 35 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,9 @@ class _NotHTTP(Exception):
9292
pass
9393

9494

95-
def _ensure_api_response(url: str, session: PipSession) -> None:
95+
def _ensure_api_response(
96+
url: str, session: PipSession, headers: Optional[Dict[str, str]] = None
97+
) -> None:
9698
"""
9799
Send a HEAD request to the URL, and ensure the response contains a Simple
98100
API Response.
@@ -104,13 +106,15 @@ def _ensure_api_response(url: str, session: PipSession) -> None:
104106
if scheme not in {"http", "https"}:
105107
raise _NotHTTP()
106108

107-
resp = session.head(url, allow_redirects=True)
109+
resp = session.head(url, allow_redirects=True, headers=headers)
108110
raise_for_status(resp)
109111

110112
_ensure_api_header(resp)
111113

112114

113-
def _get_simple_response(url: str, session: PipSession) -> Response:
115+
def _get_simple_response(
116+
url: str, session: PipSession, headers: Optional[Dict[str, str]] = None
117+
) -> Response:
114118
"""Access a Simple API response with GET, and return the response.
115119
116120
This consists of three parts:
@@ -124,10 +128,13 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
124128
and raise `_NotAPIContent` otherwise.
125129
"""
126130
if is_archive_file(Link(url).filename):
127-
_ensure_api_response(url, session=session)
131+
_ensure_api_response(url, session=session, headers=headers)
128132

129133
logger.debug("Getting page %s", redact_auth_from_url(url))
130134

135+
logger.debug("headers: %s", str(headers))
136+
if headers is None:
137+
headers = {}
131138
resp = session.get(
132139
url,
133140
headers={
@@ -152,6 +159,7 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
152159
# once per 10 minutes.
153160
# For more information, please see pypa/pip#5670.
154161
"Cache-Control": "max-age=0",
162+
**headers,
155163
},
156164
)
157165
raise_for_status(resp)
@@ -230,7 +238,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
230238
if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
231239
data = json.loads(page.content)
232240
for file in data.get("files", []):
233-
link = Link.from_json(file, page.url)
241+
link = Link.from_json(file, page.url, page_content=page)
234242
if link is None:
235243
continue
236244
yield link
@@ -243,7 +251,9 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
243251
url = page.url
244252
base_url = parser.base_url or url
245253
for anchor in parser.anchors:
246-
link = Link.from_element(anchor, page_url=url, base_url=base_url)
254+
link = Link.from_element(
255+
anchor, page_url=url, base_url=base_url, page_content=page
256+
)
247257
if link is None:
248258
continue
249259
yield link
@@ -258,13 +268,17 @@ class IndexContent:
258268
:param cache_link_parsing: whether links parsed from this page's url
259269
should be cached. PyPI index urls should
260270
have this set to False, for example.
271+
:param etag: The ``ETag`` header from an HTTP request against ``url``.
272+
:param date: The ``Date`` header from an HTTP request against ``url``.
261273
"""
262274

263275
content: bytes
264276
content_type: str
265277
encoding: Optional[str]
266278
url: str
267279
cache_link_parsing: bool = True
280+
etag: Optional[str] = None
281+
date: Optional[str] = None
268282

269283
def __str__(self) -> str:
270284
return redact_auth_from_url(self.url)
@@ -309,7 +323,8 @@ def _handle_get_simple_fail(
309323

310324

311325
def _make_index_content(
312-
response: Response, cache_link_parsing: bool = True
326+
response: Response,
327+
cache_link_parsing: bool = True,
313328
) -> IndexContent:
314329
encoding = _get_encoding_from_headers(response.headers)
315330
return IndexContent(
@@ -318,11 +333,15 @@ def _make_index_content(
318333
encoding=encoding,
319334
url=response.url,
320335
cache_link_parsing=cache_link_parsing,
336+
etag=response.headers.get("ETag", None),
337+
date=response.headers.get("Date", None),
321338
)
322339

323340

324-
def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
325-
url = link.url.split("#", 1)[0]
341+
def _get_index_content(
342+
link: Link, *, session: PipSession, headers: Optional[Dict[str, str]] = None
343+
) -> Optional["IndexContent"]:
344+
url = link.url_without_fragment
326345

327346
# Check for VCS schemes that do not support lookup as web pages.
328347
vcs_scheme = _match_vcs_scheme(url)
@@ -349,7 +368,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
349368
logger.debug(" file: URL is directory, getting %s", url)
350369

351370
try:
352-
resp = _get_simple_response(url, session=session)
371+
resp = _get_simple_response(url, session=session, headers=headers)
353372
except _NotHTTP:
354373
logger.warning(
355374
"Skipping page %s because it looks like an archive, and cannot "
@@ -365,9 +384,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
365384
exc.request_desc,
366385
exc.content_type,
367386
)
368-
except NetworkConnectionError as exc:
369-
_handle_get_simple_fail(link, exc)
370-
except RetryError as exc:
387+
except (NetworkConnectionError, RetryError) as exc:
371388
_handle_get_simple_fail(link, exc)
372389
except SSLError as exc:
373390
reason = "There was a problem confirming the ssl certificate: "
@@ -441,11 +458,14 @@ def create(
441458
def find_links(self) -> List[str]:
442459
return self.search_scope.find_links
443460

444-
def fetch_response(self, location: Link) -> Optional[IndexContent]:
461+
def fetch_response(
462+
self, location: Link, headers: Optional[Dict[str, str]] = None
463+
) -> Optional[IndexContent]:
445464
"""
446465
Fetch an HTML page containing package links.
447466
"""
448-
return _get_index_content(location, session=self.session)
467+
logger.debug("headers: %s", str(headers))
468+
return _get_index_content(location, session=self.session, headers=headers)
449469

450470
def collect_sources(
451471
self,

0 commit comments

Comments
 (0)