66import email .message
77import functools
88import itertools
9+ import json
910import logging
1011import os
1112import re
@@ -65,32 +66,44 @@ def _match_vcs_scheme(url: str) -> Optional[str]:
6566 return None
6667
6768
class _NotAPIContent(Exception):
    """Raised when a response's Content-Type is not a Simple API content type.

    Carries the offending ``content_type`` and a description of the request
    (callers pass the HTTP method, e.g. ``response.request.method``) so the
    caller can log a useful warning.
    """

    def __init__(self, content_type: str, request_desc: str) -> None:
        # Forward both values to Exception.__init__ so they appear in
        # args/repr as well as on the named attributes below.
        super().__init__(content_type, request_desc)
        self.content_type = content_type
        self.request_desc = request_desc
7374
7475
75- def _ensure_html_header (response : Response ) -> None :
76- """Check the Content-Type header to ensure the response contains HTML.
76+ def _ensure_api_header (response : Response ) -> None :
77+ """
78+ Check the Content-Type header to ensure the response contains a Simple
79+ API Response.
7780
78- Raises `_NotHTML ` if the content type is not text/html .
81+ Raises `_NotAPIContent ` if the content type is not a valid content-type .
7982 """
8083 content_type = response .headers .get ("Content-Type" , "" )
81- if not content_type .lower ().startswith ("text/html" ):
82- raise _NotHTML (content_type , response .request .method )
84+
85+ content_type_l = content_type .lower ()
86+ if content_type_l .startswith ("text/html" ):
87+ return
88+ elif content_type_l .startswith ("application/vnd.pypi.simple.v1+html" ):
89+ return
90+ elif content_type_l .startswith ("application/vnd.pypi.simple.v1+json" ):
91+ return
92+
93+ raise _NotAPIContent (content_type , response .request .method )
8394
8495
class _NotHTTP(Exception):
    """Raised when a URL cannot be checked with an HTTP HEAD request
    (i.e. its scheme is not http/https); see `_ensure_api_response`.
    """

    pass
8798
8899
89- def _ensure_html_response (url : str , session : PipSession ) -> None :
90- """Send a HEAD request to the URL, and ensure the response contains HTML.
100+ def _ensure_api_response (url : str , session : PipSession ) -> None :
101+ """
102+ Send a HEAD request to the URL, and ensure the response contains a simple
103+ API Response.
91104
92105 Raises `_NotHTTP` if the URL is not available for a HEAD request, or
93- `_NotHTML ` if the content type is not text/html .
106+ `_NotAPIContent ` if the content type is not a valid content type .
94107 """
95108 scheme , netloc , path , query , fragment = urllib .parse .urlsplit (url )
96109 if scheme not in {"http" , "https" }:
@@ -99,31 +112,37 @@ def _ensure_html_response(url: str, session: PipSession) -> None:
99112 resp = session .head (url , allow_redirects = True )
100113 raise_for_status (resp )
101114
102- _ensure_html_header (resp )
115+ _ensure_api_header (resp )
103116
104117
105- def _get_html_response (url : str , session : PipSession ) -> Response :
106- """Access an HTML page with GET, and return the response.
118+ def _get_simple_response (url : str , session : PipSession ) -> Response :
119+ """Access an Simple API response with GET, and return the response.
107120
108121 This consists of three parts:
109122
110123 1. If the URL looks suspiciously like an archive, send a HEAD first to
111- check the Content-Type is HTML, to avoid downloading a large file.
112- Raise `_NotHTTP` if the content type cannot be determined, or
113- `_NotHTML ` if it is not HTML.
124+ check the Content-Type is HTML or Simple API , to avoid downloading a
125+ large file. Raise `_NotHTTP` if the content type cannot be determined, or
126+ `_NotAPIContent ` if it is not HTML or a Simple API .
114127 2. Actually perform the request. Raise HTTP exceptions on network failures.
115- 3. Check the Content-Type header to make sure we got HTML, and raise
116- `_NotHTML ` otherwise.
128+ 3. Check the Content-Type header to make sure we got a Simple API response,
129+ and raise `_NotAPIContent ` otherwise.
117130 """
118131 if is_archive_file (Link (url ).filename ):
119- _ensure_html_response (url , session = session )
132+ _ensure_api_response (url , session = session )
120133
121134 logger .debug ("Getting page %s" , redact_auth_from_url (url ))
122135
123136 resp = session .get (
124137 url ,
125138 headers = {
126- "Accept" : "text/html" ,
139+ "Accept" : ", " .join (
140+ [
141+ "application/vnd.pypi.simple.v1+json" ,
142+ "application/vnd.pypi.simple.v1+html; q=0.2" ,
143+ "text/html; q=0.1" ,
144+ ]
145+ ),
127146 # We don't want to blindly returned cached data for
128147 # /simple/, because authors generally expecting that
129148 # twine upload && pip install will function, but if
@@ -145,9 +164,10 @@ def _get_html_response(url: str, session: PipSession) -> Response:
145164 # The check for archives above only works if the url ends with
146165 # something that looks like an archive. However that is not a
147166 # requirement of an url. Unless we issue a HEAD request on every
148- # url we cannot know ahead of time for sure if something is HTML
149- # or not. However we can check after we've downloaded it.
150- _ensure_html_header (resp )
167+ # url we cannot know ahead of time for sure if something is a
168+ # Simple API response or not. However we can check after we've
169+ # downloaded it.
170+ _ensure_api_header (resp )
151171
152172 return resp
153173
@@ -273,7 +293,7 @@ def _create_link_from_element(
273293
274294
275295class CacheablePageContent :
276- def __init__ (self , page : "HTMLPage " ) -> None :
296+ def __init__ (self , page : "IndexContent " ) -> None :
277297 assert page .cache_link_parsing
278298 self .page = page
279299
@@ -286,15 +306,15 @@ def __hash__(self) -> int:
286306
class ParseLinks(Protocol):
    """Callback protocol describing `parse_links` so its caching decorator
    can be typed as `ParseLinks -> ParseLinks`.
    """

    def __call__(
        self, page: "IndexContent", use_deprecated_html5lib: bool
    ) -> Iterable[Link]:
        ...
292312
293313
294- def with_cached_html_pages (fn : ParseLinks ) -> ParseLinks :
314+ def with_cached_index_content (fn : ParseLinks ) -> ParseLinks :
295315 """
296- Given a function that parses an Iterable[Link] from an HTMLPage , cache the
297- function's result (keyed by CacheablePageContent), unless the HTMLPage
316+ Given a function that parses an Iterable[Link] from an IndexContent , cache the
317+ function's result (keyed by CacheablePageContent), unless the IndexContent
298318 `page` has `page.cache_link_parsing == False`.
299319 """
300320
@@ -305,15 +325,17 @@ def wrapper(
305325 return list (fn (cacheable_page .page , use_deprecated_html5lib ))
306326
307327 @functools .wraps (fn )
308- def wrapper_wrapper (page : "HTMLPage" , use_deprecated_html5lib : bool ) -> List [Link ]:
328+ def wrapper_wrapper (
329+ page : "IndexContent" , use_deprecated_html5lib : bool
330+ ) -> List [Link ]:
309331 if page .cache_link_parsing :
310332 return wrapper (CacheablePageContent (page ), use_deprecated_html5lib )
311333 return list (fn (page , use_deprecated_html5lib ))
312334
313335 return wrapper_wrapper
314336
315337
316- def _parse_links_html5lib (page : "HTMLPage " ) -> Iterable [Link ]:
338+ def _parse_links_html5lib (page : "IndexContent " ) -> Iterable [Link ]:
317339 """
318340 Parse an HTML document, and yield its anchor elements as Link objects.
319341
@@ -338,12 +360,35 @@ def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]:
338360 yield link
339361
340362
341- @with_cached_html_pages
342- def parse_links (page : "HTMLPage " , use_deprecated_html5lib : bool ) -> Iterable [Link ]:
363+ @with_cached_index_content
364+ def parse_links (page : "IndexContent " , use_deprecated_html5lib : bool ) -> Iterable [Link ]:
343365 """
344- Parse an HTML document , and yield its anchor elements as Link objects.
366+ Parse a Simple API's Index Content , and yield its anchor elements as Link objects.
345367 """
346368
369+ content_type_l = page .content_type .lower ()
370+ if content_type_l .startswith ("application/vnd.pypi.simple.v1+json" ):
371+ data = json .loads (page .content )
372+ for file in data .get ("files" , []):
373+ file_url = file .get ("url" )
374+ if file_url is None :
375+ continue
376+
377+ # The Link.yanked_reason expects an empty string instead of a boolean.
378+ yanked_reason = file .get ("yanked" )
379+ if yanked_reason and not isinstance (yanked_reason , str ):
380+ yanked_reason = ""
381+ # The Link.yanked_reason expects None instead of False
382+ elif not yanked_reason :
383+ yanked_reason = None
384+
385+ yield Link (
386+ _clean_link (urllib .parse .urljoin (page .url , file_url )),
387+ comes_from = page .url ,
388+ requires_python = file .get ("requires-python" ),
389+ yanked_reason = yanked_reason ,
390+ )
391+
347392 if use_deprecated_html5lib :
348393 yield from _parse_links_html5lib (page )
349394 return
@@ -365,12 +410,13 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin
365410 yield link
366411
367412
368- class HTMLPage :
369- """Represents one page, along with its URL"""
413+ class IndexContent :
414+ """Represents one response (or page) , along with its URL"""
370415
371416 def __init__ (
372417 self ,
373418 content : bytes ,
419+ content_type : str ,
374420 encoding : Optional [str ],
375421 url : str ,
376422 cache_link_parsing : bool = True ,
@@ -383,6 +429,7 @@ def __init__(
383429 have this set to False, for example.
384430 """
385431 self .content = content
432+ self .content_type = content_type
386433 self .encoding = encoding
387434 self .url = url
388435 self .cache_link_parsing = cache_link_parsing
@@ -419,7 +466,7 @@ def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]:
419466 return None
420467
421468
422- def _handle_get_page_fail (
469+ def _handle_get_simple_fail (
423470 link : Link ,
424471 reason : Union [str , Exception ],
425472 meth : Optional [Callable [..., None ]] = None ,
@@ -429,19 +476,22 @@ def _handle_get_page_fail(
429476 meth ("Could not fetch URL %s: %s - skipping" , link , reason )
430477
431478
432- def _make_html_page (response : Response , cache_link_parsing : bool = True ) -> HTMLPage :
def _make_index_content(
    response: Response, cache_link_parsing: bool = True
) -> IndexContent:
    """Wrap a successful Simple API response in an IndexContent.

    :param response: the HTTP response whose body/headers/url are wrapped.
    :param cache_link_parsing: forwarded to IndexContent's
        cache_link_parsing flag.
    """
    encoding = _get_encoding_from_headers(response.headers)
    # Use .get() with a default rather than indexing: a response (e.g. from
    # a local index) may lack a Content-Type header, and that should not
    # raise a KeyError here.
    return IndexContent(
        response.content,
        response.headers.get("Content-Type", ""),
        encoding=encoding,
        url=response.url,
        cache_link_parsing=cache_link_parsing,
    )
440490
441491
442- def _get_html_page (
492+ def _get_index_content (
443493 link : Link , session : Optional [PipSession ] = None
444- ) -> Optional ["HTMLPage " ]:
494+ ) -> Optional ["IndexContent " ]:
445495 if session is None :
446496 raise TypeError (
447497 "_get_html_page() missing 1 required keyword argument: 'session'"
@@ -468,37 +518,39 @@ def _get_html_page(
468518 url += "/"
469519 url = urllib .parse .urljoin (url , "index.html" )
470520 logger .debug (" file: URL is directory, getting %s" , url )
521+ # TODO: index.json?
471522
472523 try :
473- resp = _get_html_response (url , session = session )
524+ resp = _get_simple_response (url , session = session )
474525 except _NotHTTP :
475526 logger .warning (
476527 "Skipping page %s because it looks like an archive, and cannot "
477528 "be checked by a HTTP HEAD request." ,
478529 link ,
479530 )
480- except _NotHTML as exc :
531+ except _NotAPIContent as exc :
481532 logger .warning (
482- "Skipping page %s because the %s request got Content-Type: %s."
483- "The only supported Content-Type is text/html" ,
533+ "Skipping page %s because the %s request got Content-Type: %s. "
534+ "The only supported Content-Types are application/vnd.pypi.simple.v1+json, "
535+ "application/vnd.pypi.simple.v1+html, and text/html" ,
484536 link ,
485537 exc .request_desc ,
486538 exc .content_type ,
487539 )
488540 except NetworkConnectionError as exc :
489- _handle_get_page_fail (link , exc )
541+ _handle_get_simple_fail (link , exc )
490542 except RetryError as exc :
491- _handle_get_page_fail (link , exc )
543+ _handle_get_simple_fail (link , exc )
492544 except SSLError as exc :
493545 reason = "There was a problem confirming the ssl certificate: "
494546 reason += str (exc )
495- _handle_get_page_fail (link , reason , meth = logger .info )
547+ _handle_get_simple_fail (link , reason , meth = logger .info )
496548 except requests .ConnectionError as exc :
497- _handle_get_page_fail (link , f"connection error: { exc } " )
549+ _handle_get_simple_fail (link , f"connection error: { exc } " )
498550 except requests .Timeout :
499- _handle_get_page_fail (link , "timed out" )
551+ _handle_get_simple_fail (link , "timed out" )
500552 else :
501- return _make_html_page (resp , cache_link_parsing = link .cache_link_parsing )
553+ return _make_index_content (resp , cache_link_parsing = link .cache_link_parsing )
502554 return None
503555
504556
@@ -561,11 +613,11 @@ def create(
561613 def find_links (self ) -> List [str ]:
562614 return self .search_scope .find_links
563615
    def fetch_response(self, location: Link) -> Optional[IndexContent]:
        """
        Fetch a Simple API response (HTML or JSON) containing package links.

        Returns None when the location could not be fetched or did not have
        a supported content type (those failures are logged, not raised).
        """
        return _get_index_content(location, session=self.session)
569621
570622 def collect_sources (
571623 self ,
0 commit comments