Skip to content

Commit fb4bbae

Browse files
Squashed commit. Constituent changes, in order:
- add change to do parallel zipping only, no crawling
- modify cli arg format for medusa-zip tool
- update cli arg format
- fix non-exhaustive CopyMode usage
- [BROKEN] add first run of complete medusa zip with cli arg! the resulting zip cannot be zipimported yet...
- medusa zipping works great now, let's revert .zip() changes
- bump medusa options
- bump more medusa options
- use the merged medusa command lines now
- manage a cache of parallel intermediate zip generation jobs!
- small fix
- much closer to mergeable now
- working much more complex control flow between the medusa-zip cli
- move medusa zip to medusa.py
- medusa works for packed apps now too
- works for everything, but kind of crazy
- close stdin after writing to the child process
- factor out a ridiculous amount of boilerplate
- add back the non-medusa impl for packed layouts
- implement a "normal" version which uses atomic directories
- revert unintentional whitespace changes
- separate the serial and parallel pex creations
- remove the attempts at parallelism
- add --medusa-path
- remove unused code
- make the medusa hook work when not provided
- add back a tracer
- revert some changes that make things harder to read
- revert some changes i shouldn't need
- make medusa work with the medusa-zip package and not subprocesses!
- update after adding defaults in medusa-zip python package
- remove -f arg for resolving medusa-zip
- [BROKEN] possibly obsolete! fix cli arg
- add stitched layout
- create stitch copymode
- no initial stitch impl
1 parent 7d6ecdd commit fb4bbae

File tree

3 files changed

+184
-25
lines changed

3 files changed

+184
-25
lines changed

pex/common.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
if TYPE_CHECKING:
2626
from typing import (
2727
Any,
28+
BinaryIO,
2829
Callable,
2930
DefaultDict,
3031
Iterable,
@@ -34,6 +35,7 @@
3435
Set,
3536
Sized,
3637
Tuple,
38+
Union,
3739
)
3840

3941
# We use the start of MS-DOS time, which is what zipfiles use (see section 4.4.6 of
@@ -220,6 +222,7 @@ def _chmod(self, info, path):
220222

221223
@contextlib.contextmanager
222224
def open_zip(path, *args, **kwargs):
225+
# type: (Union[str, BinaryIO], Any, Any) -> Iterator[PermPreservingZipFile]
223226
"""A contextmanager for zip files.
224227
225228
Passes through positional and kwargs to zipfile.ZipFile.
@@ -585,7 +588,7 @@ def delete(self):
585588

586589
def zip(
587590
self,
588-
filename, # type: str
591+
output_file, # type: Union[str, BinaryIO]
589592
mode="w", # type: str
590593
deterministic_timestamp=False, # type: bool
591594
exclude_file=lambda _: False, # type: Callable[[str], bool]
@@ -603,7 +606,7 @@ def zip(
603606
selected_files = self.files()
604607

605608
compression = zipfile.ZIP_DEFLATED if compress else zipfile.ZIP_STORED
606-
with open_zip(filename, mode, compression) as zf:
609+
with open_zip(output_file, mode, compression) as zf:
607610

608611
def write_entry(
609612
filename, # type: str

pex/pex_builder.py

Lines changed: 142 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
import logging
88
import os
99
import shutil
10+
import subprocess
11+
from contextlib import contextmanager
1012

1113
from pex import pex_warnings
1214
from pex.atomic_directory import atomic_directory
@@ -20,6 +22,7 @@
2022
safe_mkdir,
2123
safe_mkdtemp,
2224
safe_open,
25+
temporary_dir,
2326
)
2427
from pex.compatibility import commonpath, to_bytes
2528
from pex.compiler import Compiler
@@ -28,6 +31,7 @@
2831
from pex.environment import PEXEnvironment
2932
from pex.finders import get_entry_point_from_console_script, get_script_from_distributions
3033
from pex.interpreter import PythonInterpreter
34+
from pex.jobs import Job
3135
from pex.layout import Layout
3236
from pex.orderedset import OrderedSet
3337
from pex.pex import PEX
@@ -37,9 +41,10 @@
3741
from pex.tracer import TRACER
3842
from pex.typing import TYPE_CHECKING
3943
from pex.util import CacheHelper
44+
from pex.ziputils import ZipCommand
4045

4146
if TYPE_CHECKING:
42-
from typing import Dict, Optional
47+
from typing import BinaryIO, Callable, Dict, Iterator, Optional
4348

4449

4550
class CopyMode(Enum["CopyMode.Value"]):
@@ -88,14 +93,14 @@ def __maybe_run_venv__(pex, pex_root, pex_path):
8893
8994
venv_dir = venv_dir(
9095
pex_file=pex,
91-
pex_root=pex_root,
96+
pex_root=pex_root,
9297
pex_hash={pex_hash!r},
9398
has_interpreter_constraints={has_interpreter_constraints!r},
9499
pex_path=pex_path,
95100
)
96101
venv_pex = os.path.join(venv_dir, 'pex')
97102
if not __execute__ or not is_exe(venv_pex):
98-
# Code in bootstrap_pex will (re)create the venv after selecting the correct interpreter.
103+
# Code in bootstrap_pex will (re)create the venv after selecting the correct interpreter.
99104
return venv_dir
100105
101106
TRACER.log('Executing venv PEX for {{}} at {{}}'.format(pex, venv_pex))
@@ -434,6 +439,7 @@ def set_header(self, header):
434439
self._header = header
435440

436441
def _add_dist_dir(self, path, dist_name, fingerprint=None):
442+
# type: (str, str, Optional[str]) -> str
437443
target_dir = os.path.join(self._pex_info.internal_cache, dist_name)
438444
if self._copy_mode == CopyMode.SYMLINK:
439445
self._copy_or_link(path, target_dir, label=dist_name)
@@ -550,6 +556,7 @@ def _copy_or_link(self, src, dst, label=None):
550556
elif self._copy_mode == CopyMode.SYMLINK:
551557
self._chroot.symlink(src, dst, label)
552558
else:
559+
assert self._copy_mode == CopyMode.LINK
553560
self._chroot.link(src, dst, label)
554561

555562
def _prepare_bootstrap(self):
@@ -769,31 +776,144 @@ def zip_cache_dir(path):
769776
os.path.join(internal_cache, location),
770777
)
771778

772-
def _build_zipapp(
773-
self,
774-
filename, # type: str
775-
deterministic_timestamp=False, # type: bool
776-
compress=True, # type: bool
777-
):
778-
# type: (...) -> None
779+
def _cache_dists_for_stitching(self, compress):
    # type: (bool) -> Dict[str, str]
    """Zip each installed distribution into a per-fingerprint cache.

    Returns a mapping from distribution label to the path of its cached zip, keyed on the
    dist fingerprint plus whether compression was used, so compressed and uncompressed
    caches never collide.
    """
    cached_zips_by_label = {}  # type: Dict[str, str]
    # The compression flag participates in the cache key; hoist it out of the loop.
    compression_suffix = "compressed" if compress else "uncompressed"
    with TRACER.timed("caching dists for stitched output", V=3):
        for dist_label, fingerprint in self._pex_info.distributions.items():
            cached_zip = os.path.join(
                self._pex_info.pex_root,
                "stitched_dists",
                "{}-{}".format(fingerprint, compression_suffix),
                dist_label,
            )
            # atomic_directory guards against concurrent producers of the same cache
            # entry: only the winning process zips; everyone else sees it finalized.
            with atomic_directory(os.path.dirname(cached_zip)) as atomic_zip_dir:
                if not atomic_zip_dir.is_finalized():
                    with TRACER.timed("caching single dist {}".format(dist_label), V=3):
                        self._chroot.zip(
                            os.path.join(
                                atomic_zip_dir.work_dir, os.path.basename(cached_zip)
                            ),
                            labels=(dist_label,),
                            deterministic_timestamp=True,
                            compress=compress,
                            exclude_file=is_pyc_temporary_file,
                        )
            # By now either we or a concurrent process must have produced the entry.
            assert os.path.isfile(cached_zip)
            cached_zips_by_label[dist_label] = cached_zip
    return cached_zips_by_label
810+
811+
@contextmanager
def _concatenate_cached_entries(self, zip_cmd, deterministic_timestamp, compress):
    # type: (ZipCommand, bool, bool) -> Iterator[BinaryIO]
    """Yield a readable handle on a zipapp synthesized from cached dist zips.

    Labels not covered by the dist cache are zipped directly first, the cached per-dist
    zips are byte-concatenated after them, and `zip -FF` rewrites the concatenation into
    a single valid archive whose handle is yielded for the duration of the temp dir.
    """
    merge_deps = self._cache_dists_for_stitching(compress=compress)
    # Everything that is not a cached distribution (sources, bootstrap, etc.) still needs
    # to be zipped directly; sort for a deterministic archive order.
    uncached_labels = sorted(frozenset(self._chroot.labels()) - frozenset(merge_deps))

    with TRACER.timed("synthesize zipapp", V=6), temporary_dir() as td:
        concatenated_nonzip = os.path.join(td, "concatenated.broken-zip")
        with open(concatenated_nonzip, "w+b") as concat_f:
            with TRACER.timed("zipping up uncached sources", V=3):
                self._chroot.zip(
                    concat_f,
                    deterministic_timestamp=deterministic_timestamp,
                    compress=compress,
                    labels=uncached_labels,
                )

            with TRACER.timed("concatenating cached dist zips", V=3):
                # Sort the cached zips by the prefixes of the filenames they'll be
                # inserting into the merged result, to get a deterministic output. Labels
                # are unique dict keys, so sorting the items needs no explicit key.
                for _, path in sorted(merge_deps.items()):
                    with open(path, "rb") as f:
                        shutil.copyfileobj(f, concat_f)  # type: ignore[misc]

        fixed_zip = os.path.join(td, "fixed.zip")
        # The raw concatenation is not a valid zip; `zip -FF` rebuilds the central
        # directory so the result is readable (and zipimportable).
        zip_cmd.fix_concatenated_zips(concatenated_nonzip, fixed_zip)

        with open(fixed_zip, "rb") as read_handle:
            yield read_handle
840+
841+
@contextmanager
def _prepare_executable_zipapp(self, filename):
    # type: (str) -> Iterator[BinaryIO]
    """Open `filename`, write the shebang line and optional header, and yield the handle.

    The caller appends the zip payload to the yielded handle; on exit the finished file
    is marked executable.
    """
    with safe_open(filename, "wb") as out_fp:
        # safe_open in "wb" mode truncates, so the file must start empty.
        assert os.path.getsize(out_fp.name) == 0
        out_fp.write(to_bytes("{}\n".format(self._shebang)))
        if self._header:
            out_fp.write(to_bytes(self._header))

        yield out_fp

    chmod_plus_x(out_fp.name)
853+
854+
def _uncached_zipapp(
    self,
    filename,  # type: str
    deterministic_timestamp,  # type: bool
    compress,  # type: bool
):
    # type: (...) -> None
    """Zip the entire chroot into `filename` in a single pass, without dist caching."""

    # When configured with a `copy_mode` of `CopyMode.SYMLINK`, distributions are
    # symlinked pointers into the shared ~/.pex/installed_wheels/... cache. Those
    # installed wheels can be in use by other processes whose bytecode compilation
    # produces ephemeral temporary pyc files; copying them would be useless and
    # inherently racy, so they are excluded from the archive.
    with TRACER.timed("Zipping PEX file."), self._prepare_executable_zipapp(
        filename
    ) as pexfile:
        self._chroot.zip(
            pexfile,
            deterministic_timestamp=deterministic_timestamp,
            compress=compress,
            exclude_file=is_pyc_temporary_file,
        )
chmod_plus_x(filename)
879+
880+
def _build_zipapp(
    self,
    filename,  # type: str
    deterministic_timestamp=False,  # type: bool
    compress=True,  # type: bool
):
    # type: (...) -> None
    """Write the executable zipapp to `filename`.

    Naively creating a compressed zipapp with many downloaded distributions performs a
    lot of I/O and spends a lot of CPU on compression, while `--no-compress` output may
    be over twice as large. When compressing and distributions are present, a stitched
    per-dist zip cache is used instead — provided the external `zip` command is found.
    """
    # Stitching only pays off when there are dists to cache and compression is on.
    stitching_applicable = compress and bool(self._pex_info.distributions)
    zip_cmd = ZipCommand.find() if stitching_applicable else None

    if zip_cmd is None:
        if stitching_applicable:
            # Caching was desirable, but the tool needed to fix concatenated zips is
            # missing; fall back to the single-pass zip.
            TRACER.log(
                "`zip` command was not found, so compressed dist caches could not be used",
                V=1,
            )
        self._uncached_zipapp(
            filename, deterministic_timestamp=deterministic_timestamp, compress=compress
        )
        return

    with TRACER.timed(
        "cache dists and synthesize zipapp", V=9
    ), self._concatenate_cached_entries(
        zip_cmd,
        deterministic_timestamp=deterministic_timestamp,
        compress=compress,
    ) as concatenated_zip_f:
        with TRACER.timed(
            "copying synthesized concatenated zip to output file", V=9
        ), self._prepare_executable_zipapp(filename) as pexfile:
            shutil.copyfileobj(concatenated_zip_f, pexfile)  # type: ignore[misc]

pex/ziputils.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,14 @@
77
import os
88
import shutil
99
import struct
10+
import subprocess
1011

12+
from pex.jobs import Job
13+
from pex.tracer import TRACER
1114
from pex.typing import TYPE_CHECKING
1215

1316
if TYPE_CHECKING:
14-
from typing import BinaryIO, Optional
17+
from typing import Any, BinaryIO, ClassVar, Optional
1518

1619
import attr # vendor:skip
1720
else:
@@ -242,3 +245,36 @@ def isolate_zip(self, out_fp):
242245
if self.has_header:
243246
in_fp.seek(self.header_size, os.SEEK_SET)
244247
shutil.copyfileobj(in_fp, out_fp)
248+
249+
250+
@attr.s(frozen=True)
class ZipCommand(object):
    """Locates and invokes the system `zip` executable as a subprocess."""

    exe_path = attr.ib()  # type: str

    # Process-wide memo of the located executable; stays None until the first successful
    # lookup (a failed lookup is retried on the next call).
    _cached = None  # type: ClassVar[Optional[ZipCommand]]

    @classmethod
    def find(cls):
        # type: () -> Optional[ZipCommand]
        """Return a ZipCommand for the `zip` executable on the PATH, or None if absent.

        The first successful lookup is cached for the life of the process.
        """
        if cls._cached is not None:
            return cls._cached

        # `distutils.spawn.find_executable` is deprecated and distutils is removed in
        # Python 3.12+; `shutil.which` is the supported equivalent and `shutil` is
        # already imported at the top of this module. This class already requires
        # Python 3 (subprocess.DEVNULL and the Popen `encoding` kwarg below).
        zip_path = shutil.which("zip")
        if zip_path:
            cls._cached = cls(exe_path=zip_path)
        return cls._cached

    def __call__(self, *args, **kwargs):
        # type: (str, Any) -> Job
        """Spawn `zip` with the given CLI args, returning a Job wrapping the process.

        stdout is discarded; stderr is captured (decoded as utf-8) so Job can surface it
        on failure.
        """
        command = [self.exe_path] + list(args)
        zip_proc = subprocess.Popen(
            command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, encoding="utf-8", **kwargs
        )
        return Job(command, zip_proc)

    def fix_concatenated_zips(self, concatenated_zips_path, output_path):
        # type: (str, str) -> None
        """Rewrite a file formed by concatenating zip archives into one valid zip.

        `zip -FF` rescans the input for zip entries and emits a fixed-up archive at
        `output_path` with a rebuilt central directory covering every entry.
        """
        with TRACER.timed("fixing up concatenated zips with `zip -FF`"):
            self("-FF", concatenated_zips_path, "--out", output_path).wait()

0 commit comments

Comments
 (0)