neuralmagic
diff --git a/‎.buildkite/test-pipeline.yaml‎
Lines changed: 3 additions & 0 deletions b/‎.buildkite/test-pipeline.yaml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎docs/source/conf.py‎
Lines changed: 1 addition & 0 deletions b/‎docs/source/conf.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/source/models/engine_args.rst‎
Lines changed: 2 additions & 1 deletion b/‎docs/source/models/engine_args.rst‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎examples/tensorize_vllm_model.py‎
Lines changed: 254 additions & 0 deletions b/‎examples/tensorize_vllm_model.py‎
Lines changed: 254 additions & 0 deletions
diff --git a/‎requirements-cpu.txt‎
Lines changed: 1 addition & 1 deletion b/‎requirements-cpu.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎requirements-dev.txt‎
Lines changed: 1 addition & 0 deletions b/‎requirements-dev.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎setup.py‎
Lines changed: 3 additions & 0 deletions b/‎setup.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎tests/tensorizer/__init__.py‎ b/‎tests/tensorizer/__init__.py‎
@@ -91,6 +91,9 @@ steps:
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 4
 
+- label: Tensorizer Test
+  command: apt-get install curl libsodium23 && pytest -v -s tensorizer
+
 - label: Metrics Test
   command: pytest -v -s metrics
 
 
@@ -83,6 +83,7 @@
     "vllm._C",
     "numpy",
     "tqdm",
+    "tensorizer",
 ]
 
 for mock_target in autodoc_mock_imports:
 
@@ -36,7 +36,7 @@ Below, you can find an explanation of every engine argument for vLLM:
 
     Directory to download and load the weights, default to the default cache dir of huggingface.
 
-.. option:: --load-format {auto,pt,safetensors,npcache,dummy}
+.. option:: --load-format {auto,pt,safetensors,npcache,dummy,tensorizer}
 
     The format of the model weights to load.
 
@@ -45,6 +45,7 @@ Below, you can find an explanation of every engine argument for vLLM:
     * "safetensors" will load the weights in the safetensors format.
     * "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading.
     * "dummy" will initialize the weights with random values, mainly for profiling.
+    * "tensorizer" will load serialized weights using `CoreWeave's Tensorizer model deserializer. <https://github.com/coreweave/tensorizer>`_. See `tensorized_vllm_model.py` in the examples folder to serialize a vLLM model, and for more information. Tensorizer support for vLLM can be installed with `pip install vllm[tensorizer]`.
 
 .. option:: --dtype {auto,half,float16,bfloat16,float,float32}
 
 
@@ -0,0 +1,254 @@
+import argparse
+import dataclasses
+import os
+import time
+import uuid
+from functools import partial
+from typing import Type
+
+import torch
+import torch.nn as nn
+from tensorizer import (DecryptionParams, EncryptionParams, TensorDeserializer,
+                        TensorSerializer, stream_io)
+from tensorizer.utils import convert_bytes, get_mem_usage, no_init_or_tensor
+from transformers import AutoConfig, PretrainedConfig
+
+from vllm.distributed import initialize_model_parallel
+from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.llm_engine import LLMEngine
+from vllm.model_executor.models import ModelRegistry
+from vllm.model_executor.tensorizer_loader import TensorizerArgs
+
+# yapf conflicts with isort for this docstring
+# yapf: disable
+"""
+tensorize_vllm_model.py is a script that can be used to serialize and 
+deserialize vLLM models. These models can be loaded using tensorizer directly 
+to the GPU extremely quickly. Tensor encryption and decryption is also 
+supported, although libsodium must be installed to use it. Install
+vllm with tensorizer support using `pip install vllm[tensorizer]`.
+
+To serialize a model, you can run something like this:
+
+python tensorize_vllm_model.py \
+   --model EleutherAI/gpt-j-6B \
+   --dtype float16 \
+   serialize \
+   --serialized-directory s3://my-bucket/ \
+   --suffix vllm
+   
+Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
+and saves it to your S3 bucket. A local directory can also be used.
+
+You can also encrypt the model weights with a randomly-generated key by 
+providing a `--keyfile` argument.
+
+To deserialize a model, you can run something like this:
+
+python tensorize_vllm_model.py \
+   --model EleutherAI/gpt-j-6B \
+   --dtype float16 \
+   deserialize \
+   --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors
+
+Which downloads the model tensors from your S3 bucket and deserializes them.
+To provide S3 credentials, you can provide `--s3-access-key-id` and 
+`--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this script,
+the OpenAI entrypoint, as arguments for LLM(), or as environment variables
+in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and `S3_ENDPOINT`.
+
+
+You can also provide a `--keyfile` argument to decrypt the model weights if 
+they were serialized with encryption.
+
+For more information on the available arguments, run 
+`python tensorize_vllm_model.py --help`.
+"""
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="An example script that can be used to serialize and "
+        "deserialize vLLM models. These models "
+        "can be loaded using tensorizer directly to the GPU "
+        "extremely quickly. Tensor encryption and decryption is "
+        "also supported, although libsodium must be installed to "
+        "use it.")
+    parser = EngineArgs.add_cli_args(parser)
+    subparsers = parser.add_subparsers(dest='command')
+
+    serialize_parser = subparsers.add_parser(
+        'serialize', help="Serialize a model to `--serialized-directory`")
+
+    serialize_parser.add_argument(
+        "--suffix",
+        type=str,
+        required=False,
+        help=(
+            "The suffix to append to the serialized model directory, which is "
+            "used to construct the location of the serialized model tensors, "
+            "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
+            "`--suffix` is `v1`, the serialized model tensors will be "
+            "saved to "
+            "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
+            "If none is provided, a random UUID will be used."))
+    serialize_parser.add_argument(
+        "--serialized-directory",
+        type=str,
+        required=True,
+        help="The directory to serialize the model to. "
+        "This can be a local directory or S3 URI. The path to where the "
+        "tensors are saved is a combination of the supplied `dir` and model "
+        "reference ID. For instance, if `dir` is the serialized directory, "
+        "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
+        "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
+        "where `suffix` is given by `--suffix` or a random UUID if not "
+        "provided.")
+
+    serialize_parser.add_argument(
+        "--keyfile",
+        type=str,
+        required=False,
+        help=("Encrypt the model weights with a randomly-generated binary key,"
+              " and save the key at this path"))
+
+    deserialize_parser = subparsers.add_parser(
+        'deserialize',
+        help=("Deserialize a model from `--path-to-tensors`"
+              " to verify it can be loaded and used."))
+
+    deserialize_parser.add_argument(
+        "--path-to-tensors",
+        type=str,
+        required=True,
+        help="The local path or S3 URI to the model tensors to deserialize. ")
+
+    deserialize_parser.add_argument(
+        "--keyfile",
+        type=str,
+        required=False,
+        help=("Path to a binary key to use to decrypt the model weights,"
+              " if the model was serialized with encryption"))
+
+    return parser.parse_args()
+
+
+def make_model_contiguous(model):
+    # Ensure tensors are saved in memory contiguously
+    for param in model.parameters():
+        param.data = param.data.contiguous()
+
+
+def _get_vllm_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
+    architectures = getattr(config, "architectures", [])
+    for arch in architectures:
+        model_cls = ModelRegistry.load_model_cls(arch)
+        if model_cls is not None:
+            return model_cls
+    raise ValueError(
+        f"Model architectures {architectures} are not supported for now. "
+        f"Supported architectures: {ModelRegistry.get_supported_archs()}")
+
+
+def serialize():
+
+    eng_args_dict = {f.name: getattr(args, f.name) for f in
+                     dataclasses.fields(EngineArgs)}
+    engine_args = EngineArgs.from_cli_args(argparse.Namespace(**eng_args_dict))
+    engine = LLMEngine.from_engine_args(engine_args)
+
+    model = (engine.model_executor.driver_worker.
+             model_runner.model)
+
+    encryption_params = EncryptionParams.random() if keyfile else None
+    if keyfile:
+        with _write_stream(keyfile) as stream:
+            stream.write(encryption_params.key)
+
+    with _write_stream(model_path) as stream:
+        serializer = TensorSerializer(stream, encryption=encryption_params)
+        serializer.write_module(model)
+        serializer.close()
+
+    print("Serialization complete. Model tensors saved to", model_path)
+    if keyfile:
+        print("Key saved to", keyfile)
+
+
+def deserialize():
+    config = AutoConfig.from_pretrained(model_ref)
+
+    with no_init_or_tensor():
+        model_class = _get_vllm_model_architecture(config)
+        model = model_class(config)
+
+    before_mem = get_mem_usage()
+    start = time.time()
+
+    if keyfile:
+        with _read_stream(keyfile) as stream:
+            key = stream.read()
+            decryption_params = DecryptionParams.from_key(key)
+            tensorizer_args.deserializer_params['encryption'] = \
+                decryption_params
+
+    with (_read_stream(model_path)) as stream, TensorDeserializer(
+            stream, **tensorizer_args.deserializer_params) as deserializer:
+        deserializer.load_into_module(model)
+        end = time.time()
+
+    # Brag about how fast we are.
+    total_bytes_str = convert_bytes(deserializer.total_tensor_bytes)
+    duration = end - start
+    per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
+    after_mem = get_mem_usage()
+    print(
+        f"Deserialized {total_bytes_str} in {end - start:0.2f}s, {per_second}/s"
+    )
+    print(f"Memory usage before: {before_mem}")
+    print(f"Memory usage after: {after_mem}")
+
+    return model
+
+
+args = parse_args()
+
+s3_access_key_id = (args.s3_access_key_id or os.environ.get("S3_ACCESS_KEY_ID")
+                    or None)
+s3_secret_access_key = (args.s3_secret_access_key
+                        or os.environ.get("S3_SECRET_ACCESS_KEY") or None)
+
+s3_endpoint = (args.s3_endpoint or os.environ.get("S3_ENDPOINT_URL") or None)
+
+_read_stream, _write_stream = (partial(
+    stream_io.open_stream,
+    mode=mode,
+    s3_access_key_id=s3_access_key_id,
+    s3_secret_access_key=s3_secret_access_key,
+    s3_endpoint=s3_endpoint,
+) for mode in ("rb", "wb+"))
+
+model_ref = args.model
+
+model_name = model_ref.split("/")[1]
+
+os.environ["MASTER_ADDR"] = "127.0.0.1"
+os.environ["MASTER_PORT"] = "8080"
+
+torch.distributed.init_process_group(world_size=1, rank=0)
+initialize_model_parallel()
+
+keyfile = args.keyfile if args.keyfile else None
+
+if args.command == "serialize":
+    input_dir = args.serialized_directory.rstrip('/')
+    suffix = args.suffix if args.suffix else uuid.uuid4().hex
+    base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
+    model_path = f"{base_path}/model.tensors"
+    serialize()
+elif args.command == "deserialize":
+    tensorizer_args = TensorizerArgs.from_cli_args(args)
+    model_path = args.path_to_tensors
+    deserialize()
+else:
+    raise ValueError("Either serialize or deserialize must be specified.")
@@ -3,4 +3,4 @@
 
 # Dependencies for x86_64 CPUs
 torch == 2.2.1+cpu
-triton >= 2.1.0  # FIXME(woosuk): This is a hack to avoid import error.
+triton >= 2.1.0  # FIXME(woosuk): This is a hack to avoid import error.
@@ -14,6 +14,7 @@ types-setuptools
 
 # testing
 pytest
+tensorizer==2.9.0a0
 pytest-forked
 pytest-asyncio
 pytest-rerunfailures
 
@@ -428,6 +428,9 @@ def get_extra_requirements() -> dict:
     install_requires=get_requirements(),
     extras_require=get_extra_requirements(),
     ext_modules=ext_modules,
+    extras_require={
+        "optional": ["tensorizer==2.9.0a1"],
+    },
     cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
     package_data=package_data,
 )
Original file line number	Diff line number	Diff line change
`@@ -83,6 +83,7 @@`
`83`	`83`	`"vllm._C",`
`84`	`84`	`"numpy",`
`85`	`85`	`"tqdm",`
	`86`	`+ "tensorizer",`
`86`	`87`	`]`
`87`	`88`
`88`	`89`	`for mock_target in autodoc_mock_imports:`