
Commit d6569b7

WIP

1 parent: e9d517f

3 files changed: +26 -8 lines


vllm/model_executor/models/llama.py

Lines changed: 15 additions & 4 deletions
@@ -483,6 +483,9 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     mistral_mapping = {
         "layers": "model.layers",
         "attention": "self_attn",
+        "qscale_act": "input_scale",
+        "qscale_weight": "weight_scale",
+        "kv_fake_quantizer.qscale_act": "kv_scale",
         "wq": "q_proj",
         "wk": "k_proj",
         "wv": "v_proj",
@@ -603,15 +606,23 @@ def permute(w: torch.Tensor, n_heads: int):
         modules = name.split(".")

         # rotary embeds should be sliced
-        if "wk" in modules:
+        if "wk" in modules and modules[-1] == "weight":
             loaded_weight = permute(loaded_weight,
                                     self.config.num_key_value_heads)
-        elif "wq" in modules:
+        elif "wq" in modules and modules[-1] == "weight":
             loaded_weight = permute(loaded_weight,
                                     self.config.num_attention_heads)

-        for item in modules:
-            if item in mapping and mapping[item] not in name:
+        num_modules = len(modules)
+        for i in range(num_modules):
+            item = modules[i]
+            next_item = modules[i + 1] if i < num_modules - 1 else None
+
+            combined_item = f"{item}.{next_item}" if next_item is not None else None
+
+            if combined_item in mapping:
+                name = name.replace(combined_item, mapping[combined_item])
+            elif item in mapping and mapping[item] not in name:
                 name = name.replace(item, mapping[item])

         return name, loaded_weight
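
Why the lookahead: the suffix "qscale_act" is ambiguous on its own. Under a linear layer it should become "input_scale", but paired with "kv_fake_quantizer" it must map to "kv_scale", so combined "item.next_item" keys are checked before single tokens. The new `modules[-1] == "weight"` guards also restrict the rotary permutation to actual weight tensors, so the newly mapped scale tensors are never permuted. A minimal standalone sketch of the renaming loop (toy mapping subset, illustrative key names):

# Minimal sketch of the two-token lookahead rename above; the mapping
# is a toy subset of mistral_mapping.
mapping = {
    "layers": "model.layers",
    "attention": "self_attn",
    "qscale_act": "input_scale",
    "qscale_weight": "weight_scale",
    "kv_fake_quantizer.qscale_act": "kv_scale",
    "wk": "k_proj",
}


def rename(name: str) -> str:
    modules = name.split(".")
    num_modules = len(modules)
    for i in range(num_modules):
        item = modules[i]
        next_item = modules[i + 1] if i < num_modules - 1 else None
        combined_item = (f"{item}.{next_item}"
                         if next_item is not None else None)
        # Combined "parent.child" keys win over single-token keys, so
        # "kv_fake_quantizer.qscale_act" does not fall through to the
        # generic "qscale_act" -> "input_scale" rule.
        if combined_item in mapping:
            name = name.replace(combined_item, mapping[combined_item])
        elif item in mapping and mapping[item] not in name:
            name = name.replace(item, mapping[item])
    return name


print(rename("layers.0.attention.wk.qscale_weight"))
# model.layers.0.self_attn.k_proj.weight_scale
print(rename("layers.0.attention.kv_fake_quantizer.qscale_act"))
# model.layers.0.self_attn.kv_scale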

vllm/transformers_utils/config.py

Lines changed: 10 additions & 3 deletions
@@ -218,12 +218,12 @@ def load_params_config(model, revision) -> PretrainedConfig:
         "hidden_dim": "intermediate_size",
     }

-    def recurse_elems(elem: Any):
-        if isinstance(elem, dict):
+    def recurse_elems(elem: Any, wrap_to_hf_config: bool = True):
+        if isinstance(elem, dict) and wrap_to_hf_config:
             config_dict = {}
             for key, value in elem.items():
                 key = config_mapping.get(key, key)
-                config_dict[key] = recurse_elems(value)
+                config_dict[key] = recurse_elems(value, wrap_to_hf_config=False)
             return PretrainedConfig(**config_dict)
         else:
             return elem
@@ -236,6 +236,12 @@ def recurse_elems(elem: Any):
     config_dict["max_position_embeddings"] = config_dict.get(
         "max_position_embeddings", 128_000)

+    if config_dict.get("quantization") is not None:
+        config_dict["quantization_config"] = {
+            "quant_method": "fp8",
+            "activation_scheme": "static"
+        }
+
     if config_dict.get("moe") is not None:
         config_dict["architectures"] = ["MixtralForCausalLM"]
     else:
@@ -252,6 +258,7 @@ def recurse_elems(elem: Any):
         config_dict["model_type"] = "pixtral"

     config = recurse_elems(config_dict)
+
     return config
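
Two effects combine here: a params.json that declares a "quantization" section is advertised to vLLM as fp8 with static activation scales via an HF-style "quantization_config", and recurse_elems now wraps only the top-level dict in a PretrainedConfig, leaving nested dicts (such as that new "quantization_config") as plain dicts. A standalone sketch under those assumptions (toy values; the contents of the "quantization" block are invented, only its presence matters):

from typing import Any

from transformers import PretrainedConfig

config_mapping = {"dim": "hidden_size", "hidden_dim": "intermediate_size"}


def recurse_elems(elem: Any, wrap_to_hf_config: bool = True):
    # Only the outermost dict becomes a PretrainedConfig; nested dicts
    # are returned unchanged because the recursive call passes False.
    if isinstance(elem, dict) and wrap_to_hf_config:
        config_dict = {}
        for key, value in elem.items():
            key = config_mapping.get(key, key)
            config_dict[key] = recurse_elems(value, wrap_to_hf_config=False)
        return PretrainedConfig(**config_dict)
    return elem


# Toy params.json contents; the "quantization" payload is made up,
# only its presence triggers the new branch.
config_dict = {"dim": 4096, "quantization": {"some_field": "fp8"}}

if config_dict.get("quantization") is not None:
    config_dict["quantization_config"] = {
        "quant_method": "fp8",
        "activation_scheme": "static",
    }

config = recurse_elems(config_dict)
print(config.hidden_size)  # 4096
print(isinstance(config.quantization_config, dict))  # True: stays a dict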

vllm/transformers_utils/tokenizers/mistral.py

Lines changed: 1 addition & 1 deletion
@@ -220,7 +220,7 @@ def convert_ids_to_tokens(

         tokens = [self.tokenizer.id_to_piece(id) for id in ids]

-        if any(t.strip() == "�" for t in tokens):
+        if any(t.strip() == "�" for t in tokens) and isinstance(self.tokenizer, Tekkenizer):
             # if any stripped decoded token is undefined
             # because it's invalid unicode then pass bytes
             # See: https://github.com/vllm-project/vllm/pull/8640
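
The added isinstance check limits the byte-level fallback to Tekkenizer, whose pieces can decode to the replacement character "�" when a token is not valid UTF-8 on its own; other Mistral tokenizers keep their pieces untouched. A rough sketch of the guarded path (the byte fallback via id_to_byte_piece is assumed from the surrounding method, not shown in this diff):

# Rough sketch of the guarded byte fallback; Tekkenizer and
# id_to_byte_piece come from mistral_common (assumed here, not
# shown in this commit).
from mistral_common.tokens.tokenizers.tekken import Tekkenizer


def convert_ids_to_tokens_sketch(tokenizer, ids):
    tokens = [tokenizer.id_to_piece(i) for i in ids]
    # Only Tekkenizer pieces can be invalid unicode on their own, so
    # other Mistral tokenizers never take the byte path now.
    if any(t.strip() == "�" for t in tokens) and isinstance(
            tokenizer, Tekkenizer):
        tokens = [tokenizer.id_to_byte_piece(i) for i in ids]
    return tokens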
