@@ -92,7 +92,9 @@ class _NotHTTP(Exception):
9292 pass
9393
9494
95- def _ensure_api_response (url : str , session : PipSession ) -> None :
95+ def _ensure_api_response (
96+ url : str , session : PipSession , headers : Optional [Dict [str , str ]] = None
97+ ) -> None :
9698 """
9799 Send a HEAD request to the URL, and ensure the response contains a simple
98100 API Response.
@@ -104,13 +106,15 @@ def _ensure_api_response(url: str, session: PipSession) -> None:
104106 if scheme not in {"http" , "https" }:
105107 raise _NotHTTP ()
106108
107- resp = session .head (url , allow_redirects = True )
109+ resp = session .head (url , allow_redirects = True , headers = headers )
108110 raise_for_status (resp )
109111
110112 _ensure_api_header (resp )
111113
112114
113- def _get_simple_response (url : str , session : PipSession ) -> Response :
115+ def _get_simple_response (
116+ url : str , session : PipSession , headers : Optional [Dict [str , str ]] = None
117+ ) -> Response :
114118 """Access an Simple API response with GET, and return the response.
115119
116120 This consists of three parts:
@@ -124,10 +128,13 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
124128 and raise `_NotAPIContent` otherwise.
125129 """
126130 if is_archive_file (Link (url ).filename ):
127- _ensure_api_response (url , session = session )
131+ _ensure_api_response (url , session = session , headers = headers )
128132
129133 logger .debug ("Getting page %s" , redact_auth_from_url (url ))
130134
135+ logger .debug ("headers: %s" , str (headers ))
136+ if headers is None :
137+ headers = {}
131138 resp = session .get (
132139 url ,
133140 headers = {
@@ -152,6 +159,7 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
152159 # once per 10 minutes.
153160 # For more information, please see pypa/pip#5670.
154161 "Cache-Control" : "max-age=0" ,
162+ ** headers ,
155163 },
156164 )
157165 raise_for_status (resp )
@@ -230,7 +238,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
230238 if content_type_l .startswith ("application/vnd.pypi.simple.v1+json" ):
231239 data = json .loads (page .content )
232240 for file in data .get ("files" , []):
233- link = Link .from_json (file , page .url )
241+ link = Link .from_json (file , page .url , page_content = page )
234242 if link is None :
235243 continue
236244 yield link
@@ -243,7 +251,9 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
243251 url = page .url
244252 base_url = parser .base_url or url
245253 for anchor in parser .anchors :
246- link = Link .from_element (anchor , page_url = url , base_url = base_url )
254+ link = Link .from_element (
255+ anchor , page_url = url , base_url = base_url , page_content = page
256+ )
247257 if link is None :
248258 continue
249259 yield link
@@ -258,13 +268,17 @@ class IndexContent:
258268 :param cache_link_parsing: whether links parsed from this page's url
259269 should be cached. PyPI index urls should
260270 have this set to False, for example.
271+ :param etag: The ``ETag`` header from an HTTP request against ``url``.
272+ :param date: The ``Date`` header from an HTTP request against ``url``.
261273 """
262274
263275 content : bytes
264276 content_type : str
265277 encoding : Optional [str ]
266278 url : str
267279 cache_link_parsing : bool = True
280+ etag : Optional [str ] = None
281+ date : Optional [str ] = None
268282
def __str__(self) -> str:
    # Display form of the page: its URL with any embedded credentials
    # (user:password@) masked out.
    redacted = redact_auth_from_url(self.url)
    return redacted
@@ -309,7 +323,8 @@ def _handle_get_simple_fail(
309323
310324
311325def _make_index_content (
312- response : Response , cache_link_parsing : bool = True
326+ response : Response ,
327+ cache_link_parsing : bool = True ,
313328) -> IndexContent :
314329 encoding = _get_encoding_from_headers (response .headers )
315330 return IndexContent (
@@ -318,11 +333,15 @@ def _make_index_content(
318333 encoding = encoding ,
319334 url = response .url ,
320335 cache_link_parsing = cache_link_parsing ,
336+ etag = response .headers .get ("ETag" , None ),
337+ date = response .headers .get ("Date" , None ),
321338 )
322339
323340
324- def _get_index_content (link : Link , * , session : PipSession ) -> Optional ["IndexContent" ]:
325- url = link .url .split ("#" , 1 )[0 ]
341+ def _get_index_content (
342+ link : Link , * , session : PipSession , headers : Optional [Dict [str , str ]] = None
343+ ) -> Optional ["IndexContent" ]:
344+ url = link .url_without_fragment
326345
327346 # Check for VCS schemes that do not support lookup as web pages.
328347 vcs_scheme = _match_vcs_scheme (url )
@@ -349,7 +368,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
349368 logger .debug (" file: URL is directory, getting %s" , url )
350369
351370 try :
352- resp = _get_simple_response (url , session = session )
371+ resp = _get_simple_response (url , session = session , headers = headers )
353372 except _NotHTTP :
354373 logger .warning (
355374 "Skipping page %s because it looks like an archive, and cannot "
@@ -365,9 +384,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
365384 exc .request_desc ,
366385 exc .content_type ,
367386 )
368- except NetworkConnectionError as exc :
369- _handle_get_simple_fail (link , exc )
370- except RetryError as exc :
387+ except (NetworkConnectionError , RetryError ) as exc :
371388 _handle_get_simple_fail (link , exc )
372389 except SSLError as exc :
373390 reason = "There was a problem confirming the ssl certificate: "
@@ -441,11 +458,14 @@ def create(
441458 def find_links (self ) -> List [str ]:
442459 return self .search_scope .find_links
443460
def fetch_response(
    self, location: Link, headers: Optional[Dict[str, str]] = None
) -> Optional[IndexContent]:
    """
    Fetch an HTML page containing package links.

    :param location: link to the index or find-links page to fetch.
    :param headers: optional extra HTTP headers to send with the request
        (e.g. cache-validation headers); passed through unchanged to the
        underlying session request.
    :return: the fetched index content, or ``None`` when the page could
        not be retrieved or is not a usable Simple API page.
    """
    # Log only the header *names*: header values may carry credentials or
    # cache-validation tokens that must not end up in debug logs. The
    # lower-level fetch already emits its own per-request debug output.
    if headers:
        logger.debug("extra request headers: %s", ", ".join(sorted(headers)))
    return _get_index_content(location, session=self.session, headers=headers)
449469
450470 def collect_sources (
451471 self ,