66import email .message
77import functools
88import itertools
9+ import json
910import logging
1011import os
1112import re
@@ -65,32 +66,44 @@ def _match_vcs_scheme(url: str) -> Optional[str]:
6566 return None
6667
6768
class _NotAPIContent(Exception):
    """Raised when a response's Content-Type is not a Simple API content type.

    Carries the offending ``content_type`` and a description of the request
    (callers pass the HTTP method, e.g. ``response.request.method``) so the
    caller can log a useful warning.
    """

    def __init__(self, content_type: str, request_desc: str) -> None:
        # Forward both values to Exception.__init__ so they appear in
        # args/repr as well as on the named attributes below.
        super().__init__(content_type, request_desc)
        self.content_type = content_type
        self.request_desc = request_desc
7374
7475
75- def _ensure_html_header (response : Response ) -> None :
76- """Check the Content-Type header to ensure the response contains HTML.
76+ def _ensure_api_header (response : Response ) -> None :
77+ """
78+ Check the Content-Type header to ensure the response contains a Simple
79+ API Response.
7780
78- Raises `_NotHTML ` if the content type is not text/html .
81+ Raises `_NotAPIContent ` if the content type is not a valid content-type .
7982 """
8083 content_type = response .headers .get ("Content-Type" , "" )
81- if not content_type .lower ().startswith ("text/html" ):
82- raise _NotHTML (content_type , response .request .method )
84+
85+ content_type_l = content_type .lower ()
86+ if content_type_l .startswith ("text/html" ):
87+ return
88+ elif content_type_l .startswith ("application/vnd.pypi.simple.v1+html" ):
89+ return
90+ elif content_type_l .startswith ("application/vnd.pypi.simple.v1+json" ):
91+ return
92+
93+ raise _NotAPIContent (content_type , response .request .method )
8394
8495
class _NotHTTP(Exception):
    """Raised when a URL cannot be checked with an HTTP HEAD request
    (i.e. its scheme is not http/https); see `_ensure_api_response`.
    """

    pass
8798
8899
89- def _ensure_html_response (url : str , session : PipSession ) -> None :
90- """Send a HEAD request to the URL, and ensure the response contains HTML.
100+ def _ensure_api_response (url : str , session : PipSession ) -> None :
101+ """
102+ Send a HEAD request to the URL, and ensure the response contains a simple
103+ API Response.
91104
92105 Raises `_NotHTTP` if the URL is not available for a HEAD request, or
93- `_NotHTML ` if the content type is not text/html .
106+ `_NotAPIContent ` if the content type is not a valid content type .
94107 """
95108 scheme , netloc , path , query , fragment = urllib .parse .urlsplit (url )
96109 if scheme not in {"http" , "https" }:
@@ -99,31 +112,37 @@ def _ensure_html_response(url: str, session: PipSession) -> None:
99112 resp = session .head (url , allow_redirects = True )
100113 raise_for_status (resp )
101114
102- _ensure_html_header (resp )
115+ _ensure_api_header (resp )
103116
104117
105- def _get_html_response (url : str , session : PipSession ) -> Response :
106- """Access an HTML page with GET, and return the response.
118+ def _get_simple_response (url : str , session : PipSession ) -> Response :
119+ """Access an Simple API response with GET, and return the response.
107120
108121 This consists of three parts:
109122
110123 1. If the URL looks suspiciously like an archive, send a HEAD first to
111- check the Content-Type is HTML, to avoid downloading a large file.
112- Raise `_NotHTTP` if the content type cannot be determined, or
113- `_NotHTML ` if it is not HTML.
124+ check the Content-Type is HTML or Simple API , to avoid downloading a
125+ large file. Raise `_NotHTTP` if the content type cannot be determined, or
126+ `_NotAPIContent ` if it is not HTML or a Simple API .
114127 2. Actually perform the request. Raise HTTP exceptions on network failures.
115- 3. Check the Content-Type header to make sure we got HTML, and raise
116- `_NotHTML ` otherwise.
128+ 3. Check the Content-Type header to make sure we got a Simple API response,
129+ and raise `_NotAPIContent ` otherwise.
117130 """
118131 if is_archive_file (Link (url ).filename ):
119- _ensure_html_response (url , session = session )
132+ _ensure_api_response (url , session = session )
120133
121134 logger .debug ("Getting page %s" , redact_auth_from_url (url ))
122135
123136 resp = session .get (
124137 url ,
125138 headers = {
126- "Accept" : "text/html" ,
139+ "Accept" : ", " .join (
140+ [
141+ "application/vnd.pypi.simple.v1+json" ,
142+ "application/vnd.pypi.simple.v1+html; q=0.2" ,
143+ "text/html; q=0.1" ,
144+ ]
145+ ),
127146 # We don't want to blindly returned cached data for
128147 # /simple/, because authors generally expecting that
129148 # twine upload && pip install will function, but if
@@ -145,9 +164,10 @@ def _get_html_response(url: str, session: PipSession) -> Response:
145164 # The check for archives above only works if the url ends with
146165 # something that looks like an archive. However that is not a
147166 # requirement of an url. Unless we issue a HEAD request on every
148- # url we cannot know ahead of time for sure if something is HTML
149- # or not. However we can check after we've downloaded it.
150- _ensure_html_header (resp )
167+ # url we cannot know ahead of time for sure if something is a
168+ # Simple API response or not. However we can check after we've
169+ # downloaded it.
170+ _ensure_api_header (resp )
151171
152172 return resp
153173
@@ -273,7 +293,7 @@ def _create_link_from_element(
273293
274294
275295class CacheablePageContent :
276- def __init__ (self , page : "HTMLPage " ) -> None :
296+ def __init__ (self , page : "IndexContent " ) -> None :
277297 assert page .cache_link_parsing
278298 self .page = page
279299
@@ -286,15 +306,15 @@ def __hash__(self) -> int:
286306
class ParseLinks(Protocol):
    """Callback protocol describing `parse_links` so its caching decorator
    can be typed as `ParseLinks -> ParseLinks`.
    """

    def __call__(
        self, page: "IndexContent", use_deprecated_html5lib: bool
    ) -> Iterable[Link]:
        ...
292312
293313
294- def with_cached_html_pages (fn : ParseLinks ) -> ParseLinks :
314+ def with_cached_index_content (fn : ParseLinks ) -> ParseLinks :
295315 """
296- Given a function that parses an Iterable[Link] from an HTMLPage , cache the
297- function's result (keyed by CacheablePageContent), unless the HTMLPage
316+ Given a function that parses an Iterable[Link] from an IndexContent , cache the
317+ function's result (keyed by CacheablePageContent), unless the IndexContent
298318 `page` has `page.cache_link_parsing == False`.
299319 """
300320
@@ -305,15 +325,17 @@ def wrapper(
305325 return list (fn (cacheable_page .page , use_deprecated_html5lib ))
306326
307327 @functools .wraps (fn )
308- def wrapper_wrapper (page : "HTMLPage" , use_deprecated_html5lib : bool ) -> List [Link ]:
328+ def wrapper_wrapper (
329+ page : "IndexContent" , use_deprecated_html5lib : bool
330+ ) -> List [Link ]:
309331 if page .cache_link_parsing :
310332 return wrapper (CacheablePageContent (page ), use_deprecated_html5lib )
311333 return list (fn (page , use_deprecated_html5lib ))
312334
313335 return wrapper_wrapper
314336
315337
316- def _parse_links_html5lib (page : "HTMLPage " ) -> Iterable [Link ]:
338+ def _parse_links_html5lib (page : "IndexContent " ) -> Iterable [Link ]:
317339 """
318340 Parse an HTML document, and yield its anchor elements as Link objects.
319341
@@ -338,12 +360,35 @@ def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]:
338360 yield link
339361
340362
341- @with_cached_html_pages
342- def parse_links (page : "HTMLPage " , use_deprecated_html5lib : bool ) -> Iterable [Link ]:
363+ @with_cached_index_content
364+ def parse_links (page : "IndexContent " , use_deprecated_html5lib : bool ) -> Iterable [Link ]:
343365 """
344- Parse an HTML document , and yield its anchor elements as Link objects.
366+ Parse a Simple API's Index Content , and yield its anchor elements as Link objects.
345367 """
346368
369+ content_type_l = page .content_type .lower ()
370+ if content_type_l .startswith ("application/vnd.pypi.simple.v1+json" ):
371+ data = json .loads (page .content )
372+ for file in data .get ("files" , []):
373+ file_url = file .get ("url" )
374+ if file_url is None :
375+ continue
376+
377+ # The Link.yanked_reason expects an empty string instead of a boolean.
378+ yanked_reason = file .get ("yanked" )
379+ if yanked_reason and not isinstance (yanked_reason , str ):
380+ yanked_reason = ""
381+ # The Link.yanked_reason expects None instead of False
382+ elif not yanked_reason :
383+ yanked_reason = None
384+
385+ yield Link (
386+ _clean_link (urllib .parse .urljoin (page .url , file_url )),
387+ comes_from = page .url ,
388+ requires_python = file .get ("requires-python" ),
389+ yanked_reason = yanked_reason ,
390+ )
391+
347392 if use_deprecated_html5lib :
348393 yield from _parse_links_html5lib (page )
349394 return
@@ -365,12 +410,13 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin
365410 yield link
366411
367412
368- class HTMLPage :
369- """Represents one page, along with its URL"""
413+ class IndexContent :
414+ """Represents one response (or page) , along with its URL"""
370415
371416 def __init__ (
372417 self ,
373418 content : bytes ,
419+ content_type : str ,
374420 encoding : Optional [str ],
375421 url : str ,
376422 cache_link_parsing : bool = True ,
@@ -383,6 +429,7 @@ def __init__(
383429 have this set to False, for example.
384430 """
385431 self .content = content
432+ self .content_type = content_type
386433 self .encoding = encoding
387434 self .url = url
388435 self .cache_link_parsing = cache_link_parsing
@@ -419,7 +466,7 @@ def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]:
419466 return None
420467
421468
422- def _handle_get_page_fail (
469+ def _handle_get_simple_fail (
423470 link : Link ,
424471 reason : Union [str , Exception ],
425472 meth : Optional [Callable [..., None ]] = None ,
@@ -429,19 +476,22 @@ def _handle_get_page_fail(
429476 meth ("Could not fetch URL %s: %s - skipping" , link , reason )
430477
431478
432- def _make_html_page (response : Response , cache_link_parsing : bool = True ) -> HTMLPage :
def _make_index_content(
    response: Response, cache_link_parsing: bool = True
) -> IndexContent:
    """Wrap a successful Simple API response in an IndexContent.

    :param response: the HTTP response whose body/headers/url are wrapped.
    :param cache_link_parsing: forwarded to IndexContent's
        cache_link_parsing flag.
    """
    encoding = _get_encoding_from_headers(response.headers)
    # Use .get() with a default rather than indexing: a response (e.g. from
    # a local index) may lack a Content-Type header, and that should not
    # raise a KeyError here.
    return IndexContent(
        response.content,
        response.headers.get("Content-Type", ""),
        encoding=encoding,
        url=response.url,
        cache_link_parsing=cache_link_parsing,
    )
440490
441491
442- def _get_html_page (
492+ def _get_index_content (
443493 link : Link , session : Optional [PipSession ] = None
444- ) -> Optional ["HTMLPage " ]:
494+ ) -> Optional ["IndexContent " ]:
445495 if session is None :
446496 raise TypeError (
447497 "_get_html_page() missing 1 required keyword argument: 'session'"
@@ -468,37 +518,39 @@ def _get_html_page(
468518 url += "/"
469519 url = urllib .parse .urljoin (url , "index.html" )
470520 logger .debug (" file: URL is directory, getting %s" , url )
521+ # TODO: index.json?
471522
472523 try :
473- resp = _get_html_response (url , session = session )
524+ resp = _get_simple_response (url , session = session )
474525 except _NotHTTP :
475526 logger .warning (
476527 "Skipping page %s because it looks like an archive, and cannot "
477528 "be checked by a HTTP HEAD request." ,
478529 link ,
479530 )
480- except _NotHTML as exc :
531+ except _NotAPIContent as exc :
481532 logger .warning (
482- "Skipping page %s because the %s request got Content-Type: %s."
483- "The only supported Content-Type is text/html" ,
533+ "Skipping page %s because the %s request got Content-Type: %s. "
534+ "The only supported Content-Types are application/vnd.pypi.simple.v1+json, "
535+ "application/vnd.pypi.simple.v1+html, and text/html" ,
484536 link ,
485537 exc .request_desc ,
486538 exc .content_type ,
487539 )
488540 except NetworkConnectionError as exc :
489- _handle_get_page_fail (link , exc )
541+ _handle_get_simple_fail (link , exc )
490542 except RetryError as exc :
491- _handle_get_page_fail (link , exc )
543+ _handle_get_simple_fail (link , exc )
492544 except SSLError as exc :
493545 reason = "There was a problem confirming the ssl certificate: "
494546 reason += str (exc )
495- _handle_get_page_fail (link , reason , meth = logger .info )
547+ _handle_get_simple_fail (link , reason , meth = logger .info )
496548 except requests .ConnectionError as exc :
497- _handle_get_page_fail (link , f"connection error: { exc } " )
549+ _handle_get_simple_fail (link , f"connection error: { exc } " )
498550 except requests .Timeout :
499- _handle_get_page_fail (link , "timed out" )
551+ _handle_get_simple_fail (link , "timed out" )
500552 else :
501- return _make_html_page (resp , cache_link_parsing = link .cache_link_parsing )
553+ return _make_index_content (resp , cache_link_parsing = link .cache_link_parsing )
502554 return None
503555
504556
@@ -561,11 +613,11 @@ def create(
561613 def find_links (self ) -> List [str ]:
562614 return self .search_scope .find_links
563615
    def fetch_response(self, location: Link) -> Optional[IndexContent]:
        """
        Fetch a Simple API response (HTML or JSON) containing package links.

        Returns None when the location could not be fetched or did not have
        a supported content type (those failures are logged, not raised).
        """
        return _get_index_content(location, session=self.session)
569621
570622 def collect_sources (
571623 self ,
0 commit comments