+import gc
 import json
 import os
 import pathlib
     serialize_vllm_model,
     tensorize_vllm_model)
 
-from ..conftest import VllmRunner, cleanup
+from ..conftest import VllmRunner
 from ..utils import RemoteOpenAIServer
+from .conftest import retry_until_skip
 
 # yapf conflicts with isort for this docstring
 
 
-
 prompts = [
     "Hello, my name is",
     "The president of the United States is",
 tensorize_model_for_testing_script = os.path.join(
     os.path.dirname(__file__), "tensorize_vllm_model_for_testing.py")
 
+
 def is_curl_installed():
     try:
         subprocess.check_call(['curl', '--version'])
         return True
     except (subprocess.CalledProcessError, FileNotFoundError):
         return False
 
+
 def get_torch_model(vllm_runner: VllmRunner):
     return vllm_runner \
-        .model \
-        .llm_engine \
-        .model_executor \
-        .driver_worker \
-        .model_runner \
-        .model
+            .model \
+            .llm_engine \
+            .model_executor \
+            .driver_worker \
+            .model_runner \
+            .model
+
 
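get_torch_model reaches through vLLM's private attribute chain (.model -> .llm_engine -> .model_executor -> .driver_worker -> .model_runner -> .model) to pull out the underlying torch.nn.Module, which is what serialize_vllm_model actually writes; the helper exists so every test goes through one place when that internal chain shifts between vLLM versions. A minimal sketch of how the tests below use it, with a throwaway output path purely for illustration:

    with vllm_runner("facebook/opt-125m") as vllm_model:
        module = get_torch_model(vllm_model)  # the raw torch.nn.Module
        serialize_vllm_model(module,
                             TensorizerConfig(tensorizer_uri="/tmp/opt.tensors"))
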
 def write_keyfile(keyfile_path: str):
     encryption_params = EncryptionParams.random()
@@ -63,7 +67,6 @@ def write_keyfile(keyfile_path: str):
         f.write(encryption_params.key)
 
 
-
 @patch('vllm.model_executor.model_loader.tensorizer.TensorizerAgent')
 def test_load_with_tensorizer(mock_agent, tensorizer_config):
     mock_linear_method = MagicMock()
@@ -85,22 +88,22 @@ def test_can_deserialize_s3(vllm_runner):
     tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
 
     with vllm_runner(model_ref,
-                 load_format="tensorizer",
-                 model_loader_extra_config=TensorizerConfig(
-                     tensorizer_uri=tensorized_path,
-                     num_readers=1,
-                     s3_endpoint="object.ord1.coreweave.com",
-                 )) as loaded_hf_model:
-
-        deserialized_outputs = loaded_hf_model.generate(prompts, sampling_params)  # noqa: E501
+                     load_format="tensorizer",
+                     model_loader_extra_config=TensorizerConfig(
+                         tensorizer_uri=tensorized_path,
+                         num_readers=1,
+                         s3_endpoint="object.ord1.coreweave.com",
+                     )) as loaded_hf_model:
+        deserialized_outputs = loaded_hf_model.generate(prompts,
+                                                        sampling_params)
+        # noqa: E501
 
     assert deserialized_outputs
 
 
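test_can_deserialize_s3 streams the serialized tensors straight from CoreWeave's public tensorized bucket, so no local serialization step is needed: num_readers=1 keeps the download single-streamed, and s3_endpoint points the loader at CoreWeave Object Storage instead of AWS. The same configuration should work outside the test harness; a sketch, assuming the vllm.LLM entrypoint forwards these engine arguments in this version of vLLM:

    from vllm import LLM

    llm = LLM(model=model_ref,  # same reference model the test uses
              load_format="tensorizer",
              model_loader_extra_config=TensorizerConfig(
                  tensorizer_uri=f"s3://tensorized/{model_ref}/fp16/model.tensors",
                  num_readers=1,
                  s3_endpoint="object.ord1.coreweave.com"))
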
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_deserialized_encrypted_vllm_model_has_same_outputs(
         vllm_runner, tmp_path):
-    cleanup()
     with vllm_runner(model_ref) as vllm_model:
         model_path = tmp_path / (model_ref + ".tensors")
         key_path = tmp_path / (model_ref + ".key")
@@ -113,18 +116,19 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
             encryption_keyfile=key_path
         )
         serialize_vllm_model(get_torch_model(vllm_model),
-                         config_for_serializing)
-
+                             config_for_serializing)
 
     config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
                                                 encryption_keyfile=key_path)
 
     with vllm_runner(
-        model_ref,
-        load_format="tensorizer",
-        model_loader_extra_config=config_for_deserializing) as loaded_vllm_model:  # noqa: E501
+            model_ref,
+            load_format="tensorizer",
+            model_loader_extra_config=config_for_deserializing) as loaded_vllm_model:  # noqa: E501
 
-        deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)  # noqa: E501
+        deserialized_outputs = loaded_vllm_model.generate(prompts,
+                                                          sampling_params)
+        # noqa: E501
 
     assert outputs == deserialized_outputs
 
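The encrypted roundtrip needs no key management beyond write_keyfile, which persists the raw bytes of a random tensorizer EncryptionParams key; pointing encryption_keyfile at the same path on both the serializing and the deserializing TensorizerConfig is sufficient. A condensed sketch of the flow, with hypothetical paths purely for illustration:

    key_path = "/tmp/model.key"        # hypothetical locations
    model_uri = "/tmp/model.tensors"
    write_keyfile(key_path)            # writes EncryptionParams.random().key to disk
    config = TensorizerConfig(tensorizer_uri=model_uri,
                              encryption_keyfile=key_path)
    # pass a config pointing at the same keyfile when serializing and deserializing
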
@@ -140,12 +144,11 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
         serializer.write_module(hf_model.model)
 
     with vllm_runner(model_ref,
-                 load_format="tensorizer",
-                 model_loader_extra_config=TensorizerConfig(
-                     tensorizer_uri=model_path,
-                     num_readers=1,
-                 )) as loaded_hf_model:
-
+                     load_format="tensorizer",
+                     model_loader_extra_config=TensorizerConfig(
+                         tensorizer_uri=model_path,
+                         num_readers=1,
+                     )) as loaded_hf_model:
         deserialized_outputs = loaded_hf_model.generate_greedy(
             prompts, max_tokens=max_tokens)
 
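Unlike the vLLM-native roundtrips, test_deserialized_hf_model_has_same_outputs serializes a plain Hugging Face module with tensorizer's own TensorSerializer (constructed earlier in this test) and write_module, then checks that a tensorizer-loaded vLLM engine produces the same greedy outputs. A sketch of the serialization half, assuming tensorizer's documented file-object API:

    from tensorizer import TensorSerializer

    with open(model_path, "wb") as f:
        serializer = TensorSerializer(f)
        serializer.write_module(hf_model.model)  # dumps every tensor in the module
        serializer.close()
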
@@ -167,32 +170,36 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
         model_path = tmp_path / (model_ref + ".tensors")
 
         serialize_vllm_model(get_torch_model(vllm_model),
-                         TensorizerConfig(tensorizer_uri=model_path))
+                             TensorizerConfig(tensorizer_uri=model_path))
 
     with vllm_runner(
-        model_ref,
-        load_format="tensorizer",
-        model_loader_extra_config=TensorizerConfig(
-            tensorizer_uri=model_path,
-            num_readers=1,
-        ),
-        enable_lora=True,
-        max_loras=1,
-        max_lora_rank=8,
-        max_cpu_loras=2,
-        max_num_seqs=50,
-        max_model_len=1000,
+            model_ref,
+            load_format="tensorizer",
+            model_loader_extra_config=TensorizerConfig(
+                tensorizer_uri=model_path,
+                num_readers=1,
+            ),
+            enable_lora=True,
+            max_loras=1,
+            max_lora_rank=8,
+            max_cpu_loras=2,
+            max_num_seqs=50,
+            max_model_len=1000,
     ) as loaded_vllm_model:
         process_requests(loaded_vllm_model.model.llm_engine, test_prompts)
 
         assert loaded_vllm_model
 
 
 def test_load_without_tensorizer_load_format(vllm_runner):
+    model = None
     with pytest.raises(ValueError):
-        vllm_runner(
+        model = vllm_runner(
             model_ref,
             model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
+    del model
+    gc.collect()
+    torch.cuda.empty_cache()
 
 
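The del/gc/empty_cache tail that replaces the old cleanup() helper matters even though vllm_runner raises: a partially constructed engine can outlive the ValueError and keep GPU memory pinned, making a later test fail with an out-of-memory error instead of a real regression. The pattern, annotated:

    model = None
    with pytest.raises(ValueError):
        model = vllm_runner(
            model_ref,
            model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
    del model                  # drop the last Python reference, if any was bound
    gc.collect()               # collect it now rather than at some later allocation
    torch.cuda.empty_cache()   # hand cached blocks back to the CUDA driver
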
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
@@ -202,7 +209,7 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
         model_path = tmp_path / (model_ref + ".tensors")
 
         serialize_vllm_model(get_torch_model(vllm_model),
-                         TensorizerConfig(tensorizer_uri=model_path))
+                             TensorizerConfig(tensorizer_uri=model_path))
 
     model_loader_extra_config = {
         "tensorizer_uri": str(model_path),
@@ -220,9 +227,9 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
 
     client = server.get_client()
     completion = client.completions.create(model=model_ref,
-                                       prompt="Hello, my name is",
-                                       max_tokens=5,
-                                       temperature=0.0)
+                                           prompt="Hello, my name is",
+                                           max_tokens=5,
+                                           temperature=0.0)
 
     assert completion.id is not None
     assert len(completion.choices) == 1
@@ -233,11 +240,15 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
 
 
 def test_raise_value_error_on_invalid_load_format(vllm_runner):
+    model = None
     with pytest.raises(ValueError):
-        vllm_runner(
+        model = vllm_runner(
             model_ref,
             load_format="safetensors",
             model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
+    del model
+    gc.collect()
+    torch.cuda.empty_cache()
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
@@ -259,22 +270,20 @@ def test_tensorizer_with_tp_path_without_template(vllm_runner):
         disable_custom_all_reduce=True,
     )
 
+
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason="Requires 2 GPUs")
 def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
                                                                     tmp_path):
     model_ref = "EleutherAI/pythia-1.4b"
     # record outputs from un-sharded un-tensorized model
-    base_model = vllm_runner(
-        model_ref,
-        disable_custom_all_reduce=True,
-        enforce_eager=True,
-    )
-    outputs = base_model.generate(prompts, sampling_params)
-
-    base_model.model.llm_engine.model_executor.shutdown()
-    del base_model
-    cleanup()
+    with vllm_runner(
+            model_ref,
+            disable_custom_all_reduce=True,
+            enforce_eager=True,
+    ) as base_model:
+        outputs = base_model.generate(prompts, sampling_params)
+        base_model.model.llm_engine.model_executor.shutdown()
 
     # load model with two shards and serialize with encryption
     model_path = str(tmp_path / (model_ref + "-%02d.tensors"))
@@ -287,32 +296,34 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
 
     tensorize_vllm_model(
         engine_args=EngineArgs(
-                model=model_ref,
-                tensor_parallel_size=2,
-                disable_custom_all_reduce=True,
-                enforce_eager=True,
-            ),
+            model=model_ref,
+            tensor_parallel_size=2,
+            disable_custom_all_reduce=True,
+            enforce_eager=True,
+        ),
         tensorizer_config=tensorizer_config,
     )
     assert os.path.isfile(model_path % 0), "Serialization subprocess failed"
     assert os.path.isfile(model_path % 1), "Serialization subprocess failed"
-    cleanup()
-
-    loaded_vllm_model = vllm_runner(
-        model_ref,
-        tensor_parallel_size=2,
-        load_format="tensorizer",
-        disable_custom_all_reduce=True,
-        enforce_eager=True,
-        model_loader_extra_config=tensorizer_config)
 
-    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
+    with vllm_runner(
+            model_ref,
+            tensor_parallel_size=2,
+            load_format="tensorizer",
+            disable_custom_all_reduce=True,
+            enforce_eager=True,
+            model_loader_extra_config=tensorizer_config) as loaded_vllm_model:
+        deserialized_outputs = loaded_vllm_model.generate(prompts,
+                                                          sampling_params)
 
     assert outputs == deserialized_outputs
 
 
+@retry_until_skip(3)
 def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
-    cleanup()
+    gc.collect()
+    torch.cuda.empty_cache()
     model_ref = "facebook/opt-125m"
     model_path = tmp_path / (model_ref + ".tensors")
     config = TensorizerConfig(tensorizer_uri=str(model_path))
@@ -324,8 +335,10 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
     assert is_vllm_tensorized(config)
 
     with vllm_runner(model_ref,
-                 load_format="tensorizer",
-                 model_loader_extra_config=config) as loaded_vllm_model:
-        deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)  # noqa: E501
+                     load_format="tensorizer",
+                     model_loader_extra_config=config) as loaded_vllm_model:
+        deserialized_outputs = loaded_vllm_model.generate(prompts,
+                                                          sampling_params)
+        # noqa: E501
 
     assert outputs == deserialized_outputs
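retry_until_skip, imported from the local conftest at the top of this diff, guards this last test against intermittent failures (for example, leftover GPU memory from earlier tests) by retrying and then skipping rather than failing the run. Its implementation is not shown here; a plausible shape, purely as a sketch (the real decorator in the tensorizer tests' conftest may differ):

    import functools

    def retry_until_skip(retries: int):
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                for attempt in range(retries):
                    try:
                        return func(*args, **kwargs)
                    except AssertionError:
                        gc.collect()                 # free anything the failed attempt left
                        torch.cuda.empty_cache()
                        if attempt == retries - 1:
                            pytest.skip(f"Skipping after {retries} attempts")
            return wrapper
        return decorator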