22
33from __future__ import annotations
44
5- __all__ = ["HTTPRangeRequestUnsupported" , "dist_from_wheel_url" ]
5+ __all__ = ["HTTPRangeRequestUnsupported" , "dist_from_wheel_url" , "LazyHTTPFile" ]
66
77import io
88import logging
2222from pip ._internal .metadata import BaseDistribution , MemoryWheel , get_wheel_distribution
2323from pip ._internal .network .session import PipSession as Session
2424from pip ._internal .network .utils import HEADERS
25+ from pip ._internal .utils .logging import indent_log
2526
2627logger = logging .getLogger (__name__ )
2728
@@ -40,6 +41,11 @@ def dist_from_wheel_url(name: str, url: str, session: Session) -> BaseDistributi
4041 """
4142 try :
4243 with LazyHTTPFile (url , session ) as lazy_file :
44+ with indent_log ():
45+ logger .debug ("begin prefetching for %s" , name )
46+ lazy_file .prefetch_contiguous_dist_info (name )
47+ logger .debug ("done prefetching for %s" , name )
48+
4349 # For read-only ZIP files, ZipFile only needs methods read,
4450 # seek, seekable and tell, not the whole IO protocol.
4551 wheel = MemoryWheel (lazy_file .name , lazy_file )
@@ -145,6 +151,11 @@ def __next__(self) -> bytes:
145151 raise NotImplementedError
146152
147153
154+ # The central directory for tensorflow_gpu-2.5.3-cp38-cp38-manylinux2010_x86_64.whl is
155+ # 944931 bytes, for a 459424488 byte file (about 486x as large).
156+ _DEFAULT_INITIAL_FETCH = 1_000_000
157+
158+
148159class LazyHTTPFile (ReadOnlyIOWrapper ):
149160 """File-like object mapped to a ZIP file over HTTP.
150161
@@ -159,7 +170,10 @@ class LazyHTTPFile(ReadOnlyIOWrapper):
159170 _domains_without_negative_range : ClassVar [set [str ]] = set ()
160171
161172 def __init__ (
162- self , url : str , session : Session , initial_chunk_size : int = CONTENT_CHUNK_SIZE
173+ self ,
174+ url : str ,
175+ session : Session ,
176+ initial_chunk_size : int = _DEFAULT_INITIAL_FETCH ,
163177 ) -> None :
164178 # Add delete=False and print the file's `.name` to debug invalid virtual zips.
165179 super ().__init__ (cast (BinaryIO , NamedTemporaryFile ()))
@@ -172,21 +186,20 @@ def __init__(
172186
173187 self ._length , initial_chunk = self ._extract_content_length (initial_chunk_size )
174188 self .truncate (self ._length )
175- # The central directory for
176- # tensorflow_gpu-2.5.3-cp38-cp38-manylinux2010_x86_64.whl is 944931 bytes, for
177- # a 459424488 byte file (about 486x as large).
178- self ._minimum_fetch_granularity = max (initial_chunk_size , self ._length // 400 )
179189 if initial_chunk is None :
180190 # If we could not download any file contents yet (e.g. if negative byte
181191 # ranges were not supported), then download all of this at once, hopefully
182192 # pulling in the entire central directory.
183- initial_start = max (0 , self ._length - self . _minimum_fetch_granularity )
193+ initial_start = max (0 , self ._length - initial_chunk_size )
184194 self ._download (initial_start , self ._length )
185195 else :
186- self .seek (- len (initial_chunk ), io .SEEK_END )
187- self ._file .write (initial_chunk )
188- self ._left .append (self ._length - len (initial_chunk ))
189- self ._right .append (self ._length - 1 )
196+ # If we could download file contents, then write them to the end of the
197+ # file and set up our bisect boundaries by hand.
198+ with self ._stay ():
199+ self .seek (- len (initial_chunk ), io .SEEK_END )
200+ self ._file .write (initial_chunk )
201+ self ._left .append (self ._length - len (initial_chunk ))
202+ self ._right .append (self ._length - 1 )
190203
def read(self, size: int = -1) -> bytes:
    """Return up to ``size`` bytes from the current position.

    The requested byte range is downloaded (if not already present) before
    being served from the backing temp file.  A negative ``size`` (the
    default) reads everything through EOF; fewer bytes may be returned if
    EOF is reached first.
    """
    position = self.tell()
    logging.getLogger(__name__).debug("read size %d at %d", size, position)
    if size == 0:
        return b""
    if size < 0:
        assert position <= self._length
        wanted = self._length - position
    else:
        wanted = size
    # Clamp the fetch to the known file length before downloading.
    self._download(position, min(position + wanted, self._length))
    return self._file.read(size)
210223
211224 def __enter__ (self ) -> LazyHTTPFile :
def _content_length_from_head(self) -> int:
    """Learn the total remote file size in bytes via a HEAD request."""
    resp = self._session.head(self._url, headers=HEADERS)
    resp.raise_for_status()
    # A plain (non-range) HEAD must come back 200, never 206.
    assert resp.status_code == codes.ok
    return int(resp.headers["Content-Length"])
225238
226239 @staticmethod
227240 def _parse_full_length_from_content_range (arg : str ) -> Optional [int ]:
@@ -233,6 +246,7 @@ def _try_initial_chunk_request(self, initial_chunk_size: int) -> tuple[int, byte
233246 headers = HEADERS .copy ()
234247 # Perform a negative range index, which is not supported by some servers.
235248 headers ["Range" ] = f"bytes=-{ initial_chunk_size } "
249+ logger .debug ("initial bytes request: %s" , headers ["Range" ])
236250 # TODO: Get range requests to be correctly cached
237251 headers ["Cache-Control" ] = "no-cache"
238252 # TODO: If-Match (etag) to detect file changed during fetch would be a
@@ -242,7 +256,7 @@ def _try_initial_chunk_request(self, initial_chunk_size: int) -> tuple[int, byte
242256 tail = self ._session .get (self ._url , headers = headers )
243257 tail .raise_for_status ()
244258
245- response_length = int (tail .headers ["content-length " ])
259+ response_length = int (tail .headers ["Content-Length " ])
246260 assert response_length == len (tail .content )
247261
248262 code = tail .status_code
@@ -255,12 +269,14 @@ def _try_initial_chunk_request(self, initial_chunk_size: int) -> tuple[int, byte
255269 elif code != codes .partial_content :
256270 raise HTTPRangeRequestUnsupported ("did not receive partial content or ok" )
257271
258- range_arg = tail .headers ["content-range " ]
272+ range_arg = tail .headers ["Content-Range " ]
259273 if file_length := self ._parse_full_length_from_content_range (range_arg ):
260274 return (file_length , tail .content )
261275 raise HTTPRangeRequestUnsupported (f"could not parse content-range: { range_arg } " )
262276
263- def _extract_content_length (self , initial_chunk_size : int ) -> tuple [int , Optional [bytes ]]:
277+ def _extract_content_length (
278+ self , initial_chunk_size : int
279+ ) -> tuple [int , Optional [bytes ]]:
264280 domain = urlparse (self ._url ).netloc
265281 if domain in self ._domains_without_negative_range :
266282 return (self ._content_length_from_head (), None )
@@ -287,7 +303,7 @@ def _extract_content_length(self, initial_chunk_size: int) -> tuple[int, Optiona
287303 if code == codes .requested_range_not_satisfiable :
288304 # In this case, we don't have any file content yet, but we do know the
289305 # size the file will be, so we can return that and exit here.
290- range_arg = resp .headers ["content-range " ]
306+ range_arg = resp .headers ["Content-Range " ]
291307 if length := self ._parse_full_length_from_content_range (range_arg ):
292308 return (length , None )
293309 raise HTTPRangeRequestUnsupported (
@@ -330,7 +346,7 @@ def _stream_response(self, start: int, end: int) -> Response:
330346 # https://www.rfc-editor.org/rfc/rfc9110#field.content-range
331347 headers = HEADERS .copy ()
332348 headers ["Range" ] = f"bytes={ start } -{ end } "
333- logger .debug ("%s" , headers ["Range" ])
349+ logger .debug ("streamed bytes request: %s" , headers ["Range" ])
334350 # TODO: Get range requests to be correctly cached
335351 headers ["Cache-Control" ] = "no-cache"
336352 # TODO: If-Match (etag) to detect file changed during fetch would be a
@@ -364,6 +380,8 @@ def _merge(
364380
365381 def _download (self , start : int , end : int ) -> None :
366382 """Download bytes from start to end inclusively."""
383+ # Reducing by 1 to get an inclusive end range.
384+ end -= 1
367385 with self ._stay ():
368386 left = bisect_left (self ._right , start )
369387 right = bisect_right (self ._left , end )
@@ -372,3 +390,35 @@ def _download(self, start: int, end: int) -> None:
372390 self .seek (start )
373391 for chunk in response .iter_content (CONTENT_CHUNK_SIZE ):
374392 self ._file .write (chunk )
393+
def prefetch_contiguous_dist_info(self, name: str) -> None:
    """Eagerly download the contiguous ``.dist-info/`` region of the wheel.

    pip reads every entry under ``.dist-info/`` when generating a dist from
    a wheel, so pulling that whole span in one range request avoids a burst
    of small per-entry fetches.
    """
    dist_info_prefix = re.compile(r"^[^/]*\.dist-info/")
    start: Optional[int] = None
    end: Optional[int] = None

    zf = ZipFile(self)
    for entry in zf.infolist():
        is_dist_info = dist_info_prefix.search(entry.filename) is not None
        if start is None:
            # Still scanning for the first .dist-info/ entry.
            if is_dist_info:
                start = entry.header_offset
        elif not is_dist_info:
            # First entry past the contiguous .dist-info/ run bounds it.
            end = entry.header_offset
            break

    if start is None:
        raise UnsupportedWheel(
            f"no {dist_info_prefix} directory found for {name} in {self.name}"
        )
    if end is None:
        # The .dist-info/ entries run to the end of the archive (the usual
        # layout), so read everything up to the central directory.
        end = zf.start_dir
    self._download(start, end)
0 commit comments