@@ -24,7 +24,7 @@
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig
+from vllm.config import TaskOption, TokenizerPoolConfig
 from vllm.connections import global_http_connection
 from vllm.distributed import (cleanup_dist_env_and_memory,
                               init_distributed_environment,
@@ -47,70 +47,6 @@
 
 _M = TypeVar("_M")
 
-MODELS_ON_S3 = [
-    "distilbert/distilgpt2",
-    "meta-llama/Llama-2-7b-hf",
-    "meta-llama/Meta-Llama-3-8B",
-    "meta-llama/Llama-3.2-1B",
-    "meta-llama/Llama-3.2-1B-Instruct",
-    "openai-community/gpt2",
-    "ArthurZ/Ilama-3.2-1B",
-    "llava-hf/llava-1.5-7b-hf",
-    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    "ai21labs/Jamba-tiny-random",
-    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
-    "nm-testing/Phi-3-mini-128k-instruct-FP8",
-    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
-    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
-    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
-    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
-    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
-    "AMead10/Llama-3.2-1B-Instruct-AWQ",
-    "shuyuej/Llama-3.2-1B-Instruct-GPTQ",
-    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
-    "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
-    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
-    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
-    "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
-    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
-    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
-    "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
-    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
-    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
-    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
-    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
-    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
-    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
-    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
-    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
-    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
-    "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
-    "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
-    "nm-testing/tinyllama-oneshot-w8a16-per-channel",
-    "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
-    "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
-    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
-    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
-]
-
-MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
-
 _PromptMultiModalInput = Union[List[_M], List[List[_M]]]
 
 PromptImageInput = _PromptMultiModalInput[Image.Image]
@@ -742,14 +678,8 @@ def __init__(
         enable_chunked_prefill: bool = False,
         swap_space: int = 4,
         enforce_eager: Optional[bool] = False,
-        load_format: Optional[LoadFormat] = None,
         **kwargs,
     ) -> None:
-        if model_name in MODELS_ON_S3 and not load_format:
-            model_name = (f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name}")
-            load_format = LoadFormat.RUNAI_STREAMER
-        if not load_format:
-            load_format = LoadFormat.AUTO
         self.model = LLM(
             model=model_name,
             task=task,
@@ -764,7 +694,6 @@ def __init__(
             max_model_len=max_model_len,
             block_size=block_size,
             enable_chunked_prefill=enable_chunked_prefill,
-            load_format=load_format,
             **kwargs,
         )
 
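With `load_format` dropped from the wrapper's signature, a caller that still wants S3 weight streaming has to spell it out. Below is a minimal sketch of that, assuming the `vllm-ci-model-weights` bucket from the removed constant is still reachable; `distilbert/distilgpt2` is picked arbitrarily from the removed `MODELS_ON_S3` list, and the extra argument simply rides through the wrapper's `**kwargs` into `vllm.LLM`:

```python
from vllm import LLM

# Sketch only: what the wrapper used to do implicitly for models in
# MODELS_ON_S3, now done explicitly by the caller. The bucket path comes
# from the removed MODEL_WEIGHTS_S3_BUCKET constant; "runai_streamer" is
# the string form of the (also removed) LoadFormat.RUNAI_STREAMER value.
llm = LLM(
    model="s3://vllm-ci-model-weights/distilbert/distilgpt2",
    load_format="runai_streamer",
)
```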