                               get_tensor_model_parallel_world_size,
                               tensor_model_parallel_all_reduce)
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
+from vllm.model_executor.layers.linear import (FusedMoELinear,
+                                               MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                ReplicatedLinear,
                                                RowParallelLinear)
@@ -93,25 +93,22 @@ def __init__(
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
-        self.config = config
-        self.rank = get_tensor_model_parallel_rank()
-        self.tp_size = get_tensor_model_parallel_world_size()
-        self.n_routed_experts = config.num_experts
-        self.top_k = config.num_experts_per_tok
-        if self.tp_size > self.n_routed_experts:
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        if self.tp_size > config.num_experts:
             raise ValueError(
                 f"Tensor parallel size {self.tp_size} is greater than "
-                f"the number of experts {self.n_routed_experts}.")
-
-        self.experts = nn.ModuleList([
-            Qwen2MoeMLP(hidden_size=config.hidden_size,
-                        intermediate_size=config.moe_intermediate_size,
-                        hidden_act=config.hidden_act,
-                        quant_config=quant_config,
-                        reduce_results=False)
-            for idx in range(self.n_routed_experts)
-        ])
-        self.pack_params()
+                f"the number of experts {config.num_experts}.")
+
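+        # The fused layer replaces the per-expert Qwen2MoeMLP modules and the
+        # pack_params() flattening removed above: each expert's gate_proj and
+        # up_proj are stacked into w13_weight and its down_proj into w2_weight.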
+        self.experts = FusedMoELinear(
+            num_experts=config.num_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+        )
 
         self.gate = ReplicatedLinear(config.hidden_size,
-                                     self.n_routed_experts,
+                                     config.num_experts,
@@ -131,25 +128,6 @@ def __init__(
                                                    1,
                                                    bias=False)
 
-    def pack_params(self):
-        w1 = []
-        w2 = []
-        for expert in self.experts:
-            w1.append(expert.gate_up_proj.weight)
-            w2.append(expert.down_proj.weight)
-        self.w1 = torch._utils._flatten_dense_tensors(w1)
-        w1s = torch._utils._unflatten_dense_tensors(self.w1, w1)
-        for data, param in zip(w1s, w1):
-            param.data = data
-        self.w1 = self.w1.view(len(w1), *w1s[0].shape)
-
-        self.w2 = torch._utils._flatten_dense_tensors(w2)
-        w2s = torch._utils._unflatten_dense_tensors(self.w2, w2)
-        for data, param in zip(w2s, w2):
-            param.data = data
-
-        self.w2 = self.w2.view(len(w2), *w2s[0].shape)
-
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
@@ -162,18 +140,13 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = fused_moe(hidden_states,
-                                        self.w1,
-                                        self.w2,
-                                        router_logits,
-                                        self.top_k,
-                                        renormalize=self.config.norm_topk_prob,
-                                        inplace=True)
-
+        final_hidden_states = self.experts(hidden_states=hidden_states,
+                                           router_logits=router_logits)
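+        # The experts are built with reduce_results=False, so the shared
+        # expert's output is added first and a single all-reduce (only when
+        # tp_size > 1) then combines the partial sums across ranks.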
         if shared_output is not None:
             final_hidden_states = final_hidden_states + shared_output
-        final_hidden_states = tensor_model_parallel_all_reduce(
-            final_hidden_states)
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(
+                final_hidden_states)
 
         return final_hidden_states.view(num_tokens, hidden_dim)
 
@@ -284,6 +257,7 @@ def __init__(
             cache_config=cache_config,
             quant_config=quant_config,
         )
+
         if (layer_idx not in config.mlp_only_layers) and (
                 config.num_experts > 0 and
                 (layer_idx + 1) % config.decoder_sparse_step == 0):
@@ -426,21 +400,35 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
426400 ("gate_up_proj" , "up_proj" , 1 ),
427401 ]
428402
+        expert_params_mapping = [
+            # These are the weights for the experts
+            # (param_name, weight_name, expert_id, shard_id)
+            ("experts.w13_weight" if weight_name in ["gate_proj", "up_proj"]
+             else "experts.w2_weight",
+             f"experts.{expert_id}.{weight_name}.weight", expert_id, shard_id)
+            for expert_id in range(self.config.num_experts)
+            for shard_id, weight_name in enumerate(
+                ["gate_proj", "down_proj", "up_proj"])
+        ]
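+        # For expert 0, for example, this expands to:
+        #   ("experts.w13_weight", "experts.0.gate_proj.weight", 0, 0)
+        #   ("experts.w2_weight", "experts.0.down_proj.weight", 0, 1)
+        #   ("experts.w13_weight", "experts.0.up_proj.weight", 0, 2)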
+
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
                 if weight_name not in name:
                     continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
                 name = name.replace(weight_name, param_name)
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
-                # Skip experts that are not assigned to this worker.
-                if (("mlp.experts." in name or "mlp.shared_expert." in name)
-                        and name not in params_dict):
-                    continue
                 if name not in params_dict:
                     continue
 
@@ -449,17 +437,26 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Skip experts that are not assigned to this worker.
-                if (("mlp.experts." in name or "mlp.shared_expert." in name)
-                        and name not in params_dict):
-                    continue
-                if name not in params_dict:
-                    continue
-
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param, loaded_weight)
+                for (param_name, weight_name, expert_id,
+                     shard_id) in expert_params_mapping:
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param,
+                                  loaded_weight,
+                                  weight_name,
+                                  shard_id=shard_id,
+                                  expert_id=expert_id)
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name not in params_dict:
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)