From 54b1ba260f012f9db04efff95b43002f62aeb2a8 Mon Sep 17 00:00:00 2001
From: Sanger Steel
Date: Mon, 15 Apr 2024 09:59:49 -0400
Subject: [PATCH 1/7] docs: Add better clarity for tensorizer usage

---
 docs/source/models/engine_args.rst       |  2 +-
 examples/tensorize_vllm_model.py         | 22 ++++++++++++++++++----
 vllm/model_executor/tensorizer_loader.py |  2 +-
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/docs/source/models/engine_args.rst b/docs/source/models/engine_args.rst
index 886a806934c0..235cb4e128c9 100644
--- a/docs/source/models/engine_args.rst
+++ b/docs/source/models/engine_args.rst
@@ -45,7 +45,7 @@ Below, you can find an explanation of every engine argument for vLLM:
   * "safetensors" will load the weights in the safetensors format.
   * "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading.
   * "dummy" will initialize the weights with random values, mainly for profiling.
-  * "tensorizer" will load serialized weights using `CoreWeave's Tensorizer model deserializer <https://github.com/coreweave/tensorizer>`_. See `tensorized_vllm_model.py` in the examples folder to serialize a vLLM model, and for more information. Tensorizer support for vLLM can be installed with `pip install vllm[tensorizer]`.
+  * "tensorizer" will load serialized weights using `CoreWeave's Tensorizer model deserializer <https://github.com/coreweave/tensorizer>`_. See `examples/tensorize_vllm_model.py <https://github.com/vllm-project/vllm/blob/main/examples/tensorize_vllm_model.py>`_ for more information on serializing a vLLM model.

 .. option:: --dtype {auto,half,float16,bfloat16,float,float32}

diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py
index 3c20a38c7f72..7c638b5c4569 100644
--- a/examples/tensorize_vllm_model.py
+++ b/examples/tensorize_vllm_model.py
@@ -23,10 +23,11 @@
 # yapf: disable
 """
 tensorize_vllm_model.py is a script that can be used to serialize and
-deserialize vLLM models. These models can be loaded using tensorizer directly
-to the GPU extremely quickly. Tensor encryption and decryption is also
-supported, although libsodium must be installed to use it. Install
-vllm with tensorizer support using `pip install vllm[tensorizer]`.
+deserialize vLLM models. These models can be loaded using tensorizer
+onto the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
+or locally. Tensor encryption and decryption are also supported, although
+libsodium must be installed to use them. Install vllm with tensorizer support
+using `pip install vllm[tensorizer]`.

 To serialize a model, you can run something like this:

@@ -63,6 +64,19 @@

 For more information on the available arguments, run
 `python tensorize_vllm_model.py --help`.
+
+Once a model is serialized, it can be loaded by the OpenAI-compatible
+inference server at `vllm/entrypoints/openai/api_server.py` by providing
+the `--tensorizer-uri` CLI argument, which is functionally the same as the
+`--path-to-tensors` argument in this script, along with `--vllm-tensorized`
+to signify that the model to be deserialized is a vLLM model rather than a
+HuggingFace `PreTrainedModel`. A `PreTrainedModel` can also be deserialized
+by the same inference server, albeit without the speed optimizations. To
+deserialize an encrypted file, use the `--encryption-keyfile` argument to
+provide the path to the keyfile used to encrypt the model weights. For
+information on all the arguments that configure tensorizer's
+deserialization, check out the tensorizer options argument group in the
+`vllm/entrypoints/openai/api_server.py` script with `--help`.
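+
+As a minimal sketch of that workflow (reusing the example S3 path from the
+deserialize command above; all flags used here are described in this
+section), serving a vLLM-serialized model might look like:
+
+python -m vllm.entrypoints.openai.api_server \
+    --model EleutherAI/gpt-j-6B \
+    --load-format tensorizer \
+    --tensorizer-uri s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors \
+    --vllm-tensorized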
""" diff --git a/vllm/model_executor/tensorizer_loader.py b/vllm/model_executor/tensorizer_loader.py index ed3ad9e2ffa1..0e1a78337e52 100644 --- a/vllm/model_executor/tensorizer_loader.py +++ b/vllm/model_executor/tensorizer_loader.py @@ -145,7 +145,7 @@ def add_cli_args( parser: argparse.ArgumentParser) -> argparse.ArgumentParser: """Tensorizer CLI arguments""" - # Create the argument group + # Tensorizer options arg group group = parser.add_argument_group( 'tensorizer options', description=('Options for configuring the behavior of the' From 86f67edbb9579b23716d217c75b40b5927c14912 Mon Sep 17 00:00:00 2001 From: Sanger Steel Date: Mon, 15 Apr 2024 10:37:54 -0400 Subject: [PATCH 2/7] docs: Adjust `tensorize_vllm_model.py` docstring further --- examples/tensorize_vllm_model.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py index 7c638b5c4569..72fbbd421f1e 100644 --- a/examples/tensorize_vllm_model.py +++ b/examples/tensorize_vllm_model.py @@ -29,9 +29,9 @@ libsodium must be installed to use it. Install vllm with tensorizer support using `pip install vllm[tensorizer]`. -To serialize a model, you can run something like this: +To serialize a model, you can run something like this from the root directory: -python tensorize_vllm_model.py \ +python -m examples.tensorize_vllm_model \ --model EleutherAI/gpt-j-6B \ --dtype float16 \ serialize \ @@ -39,25 +39,24 @@ --suffix vllm Which downloads the model from HuggingFace, loads it into vLLM, serializes it, -and saves it to your S3 bucket. A local directory can also be used. +and saves it to your S3 bucket. A local directory can also be used. This +assumes your S3 credentials are specified as environment variables. To provide +S3 credentials directly, you can provide `--s3-access-key-id` and +`--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this +script. You can also encrypt the model weights with a randomly-generated key by providing a `--keyfile` argument. To deserialize a model, you can run something like this: -python tensorize_vllm_model.py \ +python -m examples.tensorize_vllm_model \ --model EleutherAI/gpt-j-6B \ --dtype float16 \ deserialize \ --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors Which downloads the model tensors from your S3 bucket and deserializes them. -To provide S3 credentials, you can provide `--s3-access-key-id` and -`--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this script, -the OpenAI entrypoint, as arguments for LLM(), or as environment variables -in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and `S3_ENDPOINT`. - You can also provide a `--keyfile` argument to decrypt the model weights if they were serialized with encryption. From 42a7a678fa9cb4b85fb8cf171538d12167807780 Mon Sep 17 00:00:00 2001 From: Sanger Steel Date: Mon, 15 Apr 2024 10:42:29 -0400 Subject: [PATCH 3/7] docs: Add comment on getting help for CLI args for example script --- examples/tensorize_vllm_model.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py index 72fbbd421f1e..9ceca7109291 100644 --- a/examples/tensorize_vllm_model.py +++ b/examples/tensorize_vllm_model.py @@ -61,8 +61,12 @@ You can also provide a `--keyfile` argument to decrypt the model weights if they were serialized with encryption. 
-For more information on the available arguments, run
-`python tensorize_vllm_model.py --help`.
+For more information on the available arguments for serializing, run
+`python -m examples.tensorize_vllm_model serialize --help`.
+
+Or for deserializing:
+
+`python -m examples.tensorize_vllm_model deserialize --help`.

 Once a model is serialized, it can be loaded by the OpenAI-compatible
 inference server at `vllm/entrypoints/openai/api_server.py` by providing

From a18102f8a60f2718207f62d145bb3495c60785fe Mon Sep 17 00:00:00 2001
From: Sanger Steel
Date: Mon, 15 Apr 2024 10:56:04 -0400
Subject: [PATCH 4/7] docs: Remove more unnecessary comments

---
 vllm/model_executor/tensorizer_loader.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/vllm/model_executor/tensorizer_loader.py b/vllm/model_executor/tensorizer_loader.py
index 0e1a78337e52..8550cc97aefe 100644
--- a/vllm/model_executor/tensorizer_loader.py
+++ b/vllm/model_executor/tensorizer_loader.py
@@ -126,7 +126,6 @@ def __post_init__(self):
             "s3_endpoint": self.s3_endpoint,
         }

-        # Omitting self.dtype and self.device as this behaves weirdly
         self.deserializer_params = {
             "verify_hash": self.verify_hash,
             "encryption": self.encryption_keyfile,
@@ -205,9 +204,7 @@ def add_cli_args(

     @classmethod
     def from_cli_args(cls, args: argparse.Namespace) -> "TensorizerArgs":
-        # Get the list of attributes of this dataclass.
         attrs = [attr.name for attr in dataclasses.fields(cls)]
-        # Set the attributes from the parsed arguments.
         tensorizer_args = cls(**{
             attr: getattr(args, attr)
             for attr in attrs if hasattr(args, attr)
@@ -291,7 +288,6 @@ def deserialize(self):
             nn.Module: The deserialized model.
         """
         before_mem = get_mem_usage()
-        # Lazy load the tensors from S3 into the model.
         start = time.perf_counter()
         with open_stream(
             self.tensorizer_args.tensorizer_uri,

From 9128f77c0d8662cfd609a0b82198efbdf215f89a Mon Sep 17 00:00:00 2001
From: Sanger Steel
Date: Mon, 15 Apr 2024 13:23:44 -0400
Subject: [PATCH 5/7] Apply suggestions from code review

Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
---
 examples/tensorize_vllm_model.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py
index 9ceca7109291..784199d1589f 100644
--- a/examples/tensorize_vllm_model.py
+++ b/examples/tensorize_vllm_model.py
@@ -29,7 +29,8 @@ libsodium must be installed to use them. Install vllm with tensorizer support
 using `pip install vllm[tensorizer]`.

-To serialize a model, you can run something like this from the root directory:
+To serialize a model, you can run something like this from the root level of
+this repository:

 python -m examples.tensorize_vllm_model \
    --model EleutherAI/gpt-j-6B \
    --dtype float16 \
    serialize \
@@ -48,7 +49,8 @@
 You can also encrypt the model weights with a randomly-generated key by
 providing a `--keyfile` argument.
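 
 As a sketch with encryption enabled (the `--keyfile` path below is a
 hypothetical placeholder; see `serialize --help` for exact argument
 placement), the serialize command above becomes:
 
 python -m examples.tensorize_vllm_model \
    --model EleutherAI/gpt-j-6B \
    --dtype float16 \
    serialize \
    --serialized-directory s3://my-bucket/ \
    --suffix vllm \
    --keyfile s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.key
 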
-To deserialize a model, you can run something like this:
+To deserialize a model, you can run something like this from the root
+level of this repository:

 python -m examples.tensorize_vllm_model \
    --model EleutherAI/gpt-j-6B \
    --dtype float16 \
    deserialize \
    --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors

From 0ec3bfd2e7578159c232f8903eba1b86212b026b Mon Sep 17 00:00:00 2001
From: Sanger Steel
Date: Mon, 15 Apr 2024 13:35:18 -0400
Subject: [PATCH 6/7] fix: Resolve comments from review

---
 examples/tensorize_vllm_model.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py
index 784199d1589f..dcef17de3cfd 100644
--- a/examples/tensorize_vllm_model.py
+++ b/examples/tensorize_vllm_model.py
@@ -29,8 +29,8 @@ libsodium must be installed to use them. Install vllm with tensorizer support
 using `pip install vllm[tensorizer]`.

-To serialize a model, you can run something like this from the root level of
-this repository:
+To serialize a model, install vLLM from source, then run something
+like this from the root level of this repository:

 python -m examples.tensorize_vllm_model \
    --model EleutherAI/gpt-j-6B \
@@ -82,6 +82,14 @@
 information on all the arguments that configure tensorizer's
 deserialization, check out the tensorizer options argument group in the
 `vllm/entrypoints/openai/api_server.py` script with `--help`.
+
+Tensorizer can also be invoked with the `LLM` class directly to load models:
+
+    llm = LLM(model="facebook/opt-125m",
+              load_format="tensorizer",
+              tensorizer_uri=path_to_opt_tensors,
+              num_readers=3,
+              vllm_tensorized=True)
 """

From ecf4402cafe4d676754fccc2231d390fedc2b476 Mon Sep 17 00:00:00 2001
From: Sanger Steel
Date: Mon, 15 Apr 2024 13:41:42 -0400
Subject: [PATCH 7/7] docs: Reinclude environ variables

---
 examples/tensorize_vllm_model.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py
index dcef17de3cfd..8cf8be09d0b9 100644
--- a/examples/tensorize_vllm_model.py
+++ b/examples/tensorize_vllm_model.py
@@ -41,8 +41,9 @@ Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
 and saves it to your S3 bucket. A local directory can also be used. This
-assumes your S3 credentials are specified as environment variables. To provide
-S3 credentials directly, you can provide `--s3-access-key-id` and
+assumes your S3 credentials are specified as environment variables
+in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and `S3_ENDPOINT`.
+To provide S3 credentials directly, you can provide `--s3-access-key-id` and
 `--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this
 script.
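 
 For example, a minimal sketch of the environment-variable route, with
 placeholder values:
 
 # Placeholder credentials; substitute real values before running.
 export S3_ACCESS_KEY_ID=<access-key-id>
 export S3_SECRET_ACCESS_KEY=<secret-access-key>
 export S3_ENDPOINT=<endpoint-url>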