@@ -15,7 +15,7 @@
 from .source_model.base import INPUT_MODELS
 from .target_model.base import OUTPUT_MODELS, TurbomindModelConfig
 
-SUPPORTED_FORMATS = ['meta_llama', 'hf', 'awq', None]
+SUPPORTED_FORMATS = ['meta_llama', 'hf', 'awq', 'qqq', None]
 logger = get_logger('lmdeploy')
 
 
@@ -26,12 +26,14 @@ def get_input_model_registered_name(model_path: str, model_format: str):
     Args:
         model_path (str): the path of the input model
         model_format (str): the format of the model, which can be one of
-            ['meta_llama', 'hf', 'awq']
+            ['meta_llama', 'hf', 'awq', 'qqq']
     """
     arch = get_model_arch(model_path)[0]
     register_name = SUPPORTED_ARCHS[arch]
     if model_format == 'awq':
         register_name = register_name + '-awq'
+    elif model_format == 'qqq':
+        register_name = register_name + '-qqq'
     return register_name
 
 
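For reference, a minimal sketch (not part of the patch) of what the extended name resolution is expected to yield for a QQQ checkpoint, assuming a llama-architecture model so that SUPPORTED_ARCHS resolves to 'llama'; the path below is a placeholder:

# Hypothetical call: for a llama-architecture model converted with
# model_format='qqq', the registered input-model name gains a '-qqq' suffix,
# so INPUT_MODELS must expose a matching 'llama-qqq' reader.
name = get_input_model_registered_name('./llama-2-7b-qqq', model_format='qqq')
assert name == 'llama-qqq'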
@@ -92,8 +94,9 @@ def get_output_model_registered_name_and_config(model_path: str,
     Args:
         model_path (str): the path of the input model
         model_format (str): the format of the model, which can be one of
-            ['meta_llama', 'hf', 'awq']
-        group_size (int): the size of group used by awq model
+            ['meta_llama', 'hf', 'awq', 'qqq']
+        group_size (int): the size of group used by quantization methods,
+            including `awq` and `qqq`
     """
     register_name = 'fp16'
     turbomind_model_arch = 'llama'
@@ -113,6 +116,15 @@ def get_output_model_registered_name_and_config(model_path: str,
             register_name = 'plora-w4' \
                 if turbomind_model_arch == 'xcomposer2' else 'w4'
             group_size = 128 if group_size == 0 else group_size
+            config.quantization = 'awq'
+        elif model_format == 'qqq':
+            weight_type = 'int4'
+            register_name = 'qqq-w4'
+            from transformers import AutoConfig
+            quant_config = AutoConfig.from_pretrained(
+                model_path).quantization_config
+            group_size = quant_config['group_size']
+            config.quantization = 'qqq'
         else:
             torch_dtype = getattr(model_config, 'torch_dtype', 'float16')
             TORCH_DTYPE_MAP = {torch.bfloat16: 'bf16', torch.float16: 'fp16'}
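Unlike the AWQ branch, which falls back to a group size of 128, the QQQ branch reads the group size straight from the checkpoint's quantization_config. A minimal sketch of that lookup, with a placeholder path and an illustrative value:

from transformers import AutoConfig

# Hypothetical QQQ-quantized checkpoint whose config.json carries the
# `quantization_config` entry written by https://github.com/HandH1998/QQQ.
quant_config = AutoConfig.from_pretrained('./llama-2-7b-qqq').quantization_config
group_size = quant_config['group_size']  # e.g. 128; whatever value the quantization run used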
@@ -212,17 +224,19 @@ def main(model_name: str,
         model_name (str): unused any longer
         model_path (str): the directory path of the model
         model_format (str): the format of the model, should choose from
-            ['meta_llama', 'hf', 'awq', None]. 'meta_llama' stands for META's
-            llama format, 'hf' means huggingface llama format, and 'awq' means
-            llama(hf) model quantized by lmdeploy/lite/quantization/awq.py.
-            The default value is None
-        chat_template (str): the name of the built-in chat template.
+            ['meta_llama', 'hf', 'awq', 'qqq', None]. 'meta_llama' stands for
+            META's llama format, 'hf' means huggingface llama format,
+            'awq' means llama(hf) model quantized by
+            lmdeploy/lite/quantization/awq.py,
+            and 'qqq' means llama(hf) model quantized by the repo
+            https://github.com/HandH1998/QQQ.
+            The default value is None
         tokenizer_path (str): the path of tokenizer model
         dst_path (str): the destination path that saves outputs
         tp (int): the number of GPUs used for tensor parallelism, should be 2^n
         quant_path (str): Path of the quantized model, which can be None.
-        group_size (int): a parameter used in AWQ to quantize fp16 weights
-            to 4 bits
+        group_size (int): a parameter used in AWQ or QQQ to quantize fp16
+            weights to 4 bits
         revision (str): The specific model version to use. It can be a branch
             name, a tag name, or a commit id. If unspecified, will use
             the default version.
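To tie the pieces together, a hedged usage sketch of the extended entry point; the model name and paths are placeholders, and the keyword arguments are the parameters documented above. On the command line this corresponds to `lmdeploy convert ... --model-format qqq`, assuming the flag spelling of the existing convert subcommand.

# Hypothetical invocation: convert a QQQ-quantized llama checkpoint into a
# turbomind workspace. group_size is omitted because the 'qqq' path reads it
# from the checkpoint's quantization_config.
main(model_name='llama2',
     model_path='./llama-2-7b-qqq',
     model_format='qqq',
     dst_path='./workspace',
     tp=1)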