From 54b1ba260f012f9db04efff95b43002f62aeb2a8 Mon Sep 17 00:00:00 2001
From: Sanger Steel
Date: Mon, 15 Apr 2024 09:59:49 -0400
Subject: [PATCH 1/7] docs: Add better clarity for tensorizer usage

---
 docs/source/models/engine_args.rst       |  2 +-
 examples/tensorize_vllm_model.py         | 22 ++++++++++++++++++----
 vllm/model_executor/tensorizer_loader.py |  2 +-
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/docs/source/models/engine_args.rst b/docs/source/models/engine_args.rst
index 886a806934c0..235cb4e128c9 100644
--- a/docs/source/models/engine_args.rst
+++ b/docs/source/models/engine_args.rst
@@ -45,7 +45,7 @@ Below, you can find an explanation of every engine argument for vLLM:
   * "safetensors" will load the weights in the safetensors format.
   * "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading.
   * "dummy" will initialize the weights with random values, mainly for profiling.
-  * "tensorizer" will load serialized weights using `CoreWeave's Tensorizer model deserializer <https://github.com/coreweave/tensorizer>`_. See `tensorized_vllm_model.py` in the examples folder to serialize a vLLM model, and for more information. Tensorizer support for vLLM can be installed with `pip install vllm[tensorizer]`.
+  * "tensorizer" will load serialized weights using `CoreWeave's Tensorizer model deserializer <https://github.com/coreweave/tensorizer>`_. See `examples/tensorize_vllm_model.py <https://github.com/vllm-project/vllm/blob/main/examples/tensorize_vllm_model.py>`_ for more information on serializing a vLLM model.

 .. option:: --dtype {auto,half,float16,bfloat16,float,float32}

diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py
index 3c20a38c7f72..7c638b5c4569 100644
--- a/examples/tensorize_vllm_model.py
+++ b/examples/tensorize_vllm_model.py
@@ -23,10 +23,11 @@
 # yapf: disable
 """
 tensorize_vllm_model.py is a script that can be used to serialize and
-deserialize vLLM models. These models can be loaded using tensorizer directly
-to the GPU extremely quickly. Tensor encryption and decryption is also
-supported, although libsodium must be installed to use it. Install
-vllm with tensorizer support using `pip install vllm[tensorizer]`.
+deserialize vLLM models. These models can be loaded using tensorizer
+onto the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
+or locally. Tensor encryption and decryption are also supported, although
+libsodium must be installed to use them. Install vllm with tensorizer support
+using `pip install vllm[tensorizer]`.

 To serialize a model, you can run something like this:

@@ -63,6 +64,19 @@

 For more information on the available arguments, run
 `python tensorize_vllm_model.py --help`.
+
+Once a model is serialized, it can be loaded by the OpenAI-compatible
+inference server at `vllm/entrypoints/openai/api_server.py` by providing
+the `--tensorizer-uri` CLI argument, which is functionally the same as the
+`--path-to-tensors` argument in this script, along with `--vllm-tensorized`
+to signify that the model to be deserialized is a vLLM model rather than a
+HuggingFace `PreTrainedModel`. A `PreTrainedModel` can also be deserialized
+by the same inference server, albeit without the speed optimizations. To
+deserialize an encrypted file, use the `--encryption-keyfile` argument to
+provide the path to the keyfile used to encrypt the model weights. For
+information on all the arguments that configure tensorizer's
+deserialization, check out the tensorizer options argument group in the
+`vllm/entrypoints/openai/api_server.py` script with `--help`.
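+
+As a minimal sketch of that workflow (reusing the example S3 path from the
+deserialize command above; all flags used here are described in this
+section), serving a vLLM-serialized model might look like:
+
+python -m vllm.entrypoints.openai.api_server \
+    --model EleutherAI/gpt-j-6B \
+    --load-format tensorizer \
+    --tensorizer-uri s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors \
+    --vllm-tensorized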
""" diff --git a/vllm/model_executor/tensorizer_loader.py b/vllm/model_executor/tensorizer_loader.py index ed3ad9e2ffa1..0e1a78337e52 100644 --- a/vllm/model_executor/tensorizer_loader.py +++ b/vllm/model_executor/tensorizer_loader.py @@ -145,7 +145,7 @@ def add_cli_args( parser: argparse.ArgumentParser) -> argparse.ArgumentParser: """Tensorizer CLI arguments""" - # Create the argument group + # Tensorizer options arg group group = parser.add_argument_group( 'tensorizer options', description=('Options for configuring the behavior of the' From 86f67edbb9579b23716d217c75b40b5927c14912 Mon Sep 17 00:00:00 2001 From: Sanger Steel Date: Mon, 15 Apr 2024 10:37:54 -0400 Subject: [PATCH 2/7] docs: Adjust `tensorize_vllm_model.py` docstring further --- examples/tensorize_vllm_model.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py index 7c638b5c4569..72fbbd421f1e 100644 --- a/examples/tensorize_vllm_model.py +++ b/examples/tensorize_vllm_model.py @@ -29,9 +29,9 @@ libsodium must be installed to use it. Install vllm with tensorizer support using `pip install vllm[tensorizer]`. -To serialize a model, you can run something like this: +To serialize a model, you can run something like this from the root directory: -python tensorize_vllm_model.py \ +python -m examples.tensorize_vllm_model \ --model EleutherAI/gpt-j-6B \ --dtype float16 \ serialize \ @@ -39,25 +39,24 @@ --suffix vllm Which downloads the model from HuggingFace, loads it into vLLM, serializes it, -and saves it to your S3 bucket. A local directory can also be used. +and saves it to your S3 bucket. A local directory can also be used. This +assumes your S3 credentials are specified as environment variables. To provide +S3 credentials directly, you can provide `--s3-access-key-id` and +`--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this +script. You can also encrypt the model weights with a randomly-generated key by providing a `--keyfile` argument. To deserialize a model, you can run something like this: -python tensorize_vllm_model.py \ +python -m examples.tensorize_vllm_model \ --model EleutherAI/gpt-j-6B \ --dtype float16 \ deserialize \ --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors Which downloads the model tensors from your S3 bucket and deserializes them. -To provide S3 credentials, you can provide `--s3-access-key-id` and -`--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this script, -the OpenAI entrypoint, as arguments for LLM(), or as environment variables -in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and `S3_ENDPOINT`. - You can also provide a `--keyfile` argument to decrypt the model weights if they were serialized with encryption. From 42a7a678fa9cb4b85fb8cf171538d12167807780 Mon Sep 17 00:00:00 2001 From: Sanger Steel Date: Mon, 15 Apr 2024 10:42:29 -0400 Subject: [PATCH 3/7] docs: Add comment on getting help for CLI args for example script --- examples/tensorize_vllm_model.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py index 72fbbd421f1e..9ceca7109291 100644 --- a/examples/tensorize_vllm_model.py +++ b/examples/tensorize_vllm_model.py @@ -61,8 +61,12 @@ You can also provide a `--keyfile` argument to decrypt the model weights if they were serialized with encryption. 
-For more information on the available arguments, run
-`python tensorize_vllm_model.py --help`.
+For more information on the available arguments for serializing, run
+`python -m examples.tensorize_vllm_model serialize --help`.
+
+Or for deserializing:
+
+`python -m examples.tensorize_vllm_model deserialize --help`.

 Once a model is serialized, it can be loaded by the OpenAI-compatible
 inference server at `vllm/entrypoints/openai/api_server.py` by providing

From a18102f8a60f2718207f62d145bb3495c60785fe Mon Sep 17 00:00:00 2001
From: Sanger Steel
Date: Mon, 15 Apr 2024 10:56:04 -0400
Subject: [PATCH 4/7] docs: Remove more unnecessary comments

---
 vllm/model_executor/tensorizer_loader.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/vllm/model_executor/tensorizer_loader.py b/vllm/model_executor/tensorizer_loader.py
index 0e1a78337e52..8550cc97aefe 100644
--- a/vllm/model_executor/tensorizer_loader.py
+++ b/vllm/model_executor/tensorizer_loader.py
@@ -126,7 +126,6 @@ def __post_init__(self):
             "s3_endpoint": self.s3_endpoint,
         }

-        # Omitting self.dtype and self.device as this behaves weirdly
         self.deserializer_params = {
             "verify_hash": self.verify_hash,
             "encryption": self.encryption_keyfile,
@@ -205,9 +204,7 @@ def add_cli_args(

     @classmethod
     def from_cli_args(cls, args: argparse.Namespace) -> "TensorizerArgs":
-        # Get the list of attributes of this dataclass.
         attrs = [attr.name for attr in dataclasses.fields(cls)]
-        # Set the attributes from the parsed arguments.
         tensorizer_args = cls(**{
             attr: getattr(args, attr)
             for attr in attrs if hasattr(args, attr)
@@ -291,7 +288,6 @@ def deserialize(self):
             nn.Module: The deserialized model.
         """
         before_mem = get_mem_usage()
-        # Lazy load the tensors from S3 into the model.
         start = time.perf_counter()
         with open_stream(
             self.tensorizer_args.tensorizer_uri,

From 9128f77c0d8662cfd609a0b82198efbdf215f89a Mon Sep 17 00:00:00 2001
From: Sanger Steel
Date: Mon, 15 Apr 2024 13:23:44 -0400
Subject: [PATCH 5/7] Apply suggestions from code review

Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
---
 examples/tensorize_vllm_model.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py
index 9ceca7109291..784199d1589f 100644
--- a/examples/tensorize_vllm_model.py
+++ b/examples/tensorize_vllm_model.py
@@ -29,7 +29,8 @@ libsodium must be installed to use them. Install vllm with tensorizer support
 using `pip install vllm[tensorizer]`.

-To serialize a model, you can run something like this from the root directory:
+To serialize a model, you can run something like this from the root level of
+this repository:

 python -m examples.tensorize_vllm_model \
    --model EleutherAI/gpt-j-6B \
    --dtype float16 \
    serialize \
@@ -48,7 +49,8 @@
 You can also encrypt the model weights with a randomly-generated key by
 providing a `--keyfile` argument.
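 
 As a sketch with encryption enabled (the `--keyfile` path below is a
 hypothetical placeholder; see `serialize --help` for exact argument
 placement), the serialize command above becomes:
 
 python -m examples.tensorize_vllm_model \
    --model EleutherAI/gpt-j-6B \
    --dtype float16 \
    serialize \
    --serialized-directory s3://my-bucket/ \
    --suffix vllm \
    --keyfile s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.key
 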
-To deserialize a model, you can run something like this:
+To deserialize a model, you can run something like this from the root
+level of this repository:

 python -m examples.tensorize_vllm_model \
    --model EleutherAI/gpt-j-6B \
    --dtype float16 \
    deserialize \
    --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors

From 0ec3bfd2e7578159c232f8903eba1b86212b026b Mon Sep 17 00:00:00 2001
From: Sanger Steel
Date: Mon, 15 Apr 2024 13:35:18 -0400
Subject: [PATCH 6/7] fix: Resolve comments from review

---
 examples/tensorize_vllm_model.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py
index 784199d1589f..dcef17de3cfd 100644
--- a/examples/tensorize_vllm_model.py
+++ b/examples/tensorize_vllm_model.py
@@ -29,8 +29,8 @@ libsodium must be installed to use them. Install vllm with tensorizer support
 using `pip install vllm[tensorizer]`.

-To serialize a model, you can run something like this from the root level of
-this repository:
+To serialize a model, install vLLM from source, then run something
+like this from the root level of this repository:

 python -m examples.tensorize_vllm_model \
    --model EleutherAI/gpt-j-6B \
@@ -82,6 +82,14 @@
 information on all the arguments that configure tensorizer's
 deserialization, check out the tensorizer options argument group in the
 `vllm/entrypoints/openai/api_server.py` script with `--help`.
+
+Tensorizer can also be invoked with the `LLM` class directly to load models:
+
+    llm = LLM(model="facebook/opt-125m",
+              load_format="tensorizer",
+              tensorizer_uri=path_to_opt_tensors,
+              num_readers=3,
+              vllm_tensorized=True)
 """

From ecf4402cafe4d676754fccc2231d390fedc2b476 Mon Sep 17 00:00:00 2001
From: Sanger Steel
Date: Mon, 15 Apr 2024 13:41:42 -0400
Subject: [PATCH 7/7] docs: Reinclude environ variables

---
 examples/tensorize_vllm_model.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py
index dcef17de3cfd..8cf8be09d0b9 100644
--- a/examples/tensorize_vllm_model.py
+++ b/examples/tensorize_vllm_model.py
@@ -41,8 +41,9 @@ Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
 and saves it to your S3 bucket. A local directory can also be used. This
-assumes your S3 credentials are specified as environment variables. To provide
-S3 credentials directly, you can provide `--s3-access-key-id` and
+assumes your S3 credentials are specified as environment variables
+in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and `S3_ENDPOINT`.
+To provide S3 credentials directly, you can provide `--s3-access-key-id` and
 `--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this
 script.
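 
 For example, a minimal sketch of the environment-variable route, with
 placeholder values:
 
 # Placeholder credentials; substitute real values before running.
 export S3_ACCESS_KEY_ID=<access-key-id>
 export S3_SECRET_ACCESS_KEY=<secret-access-key>
 export S3_ENDPOINT=<endpoint-url>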