vllm/model_executor/layers/quantization (1 file changed, +1 −3)

@@ -1489,7 +1489,6 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             )
         else:
             w13_input_scale = layer.w13_input_scale.max(dim=1).values.to(torch.float32)
-
         layer.g1_alphas = Parameter(
             (w13_input_scale * w13_weight_scale_2).to(torch.float32),
             requires_grad=False,
@@ -1499,6 +1498,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         layer.w13_input_scale_quant = Parameter(
             (1 / w13_input_scale).to(torch.float32), requires_grad=False
         )
+
         # GEMM 2 processing
         if use_global_sf:
             # For backends provide by Flashinfer, the input global scales are
@@ -1508,7 +1508,6 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             )
         else:
             w2_input_scale = layer.w2_input_scale
-
         layer.g2_alphas = Parameter(
             (w2_input_scale * layer.w2_weight_scale_2).to(torch.float32),
             requires_grad=False,
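
For context on what these hunks touch: `g1_alphas`/`g2_alphas` fold the per-expert input and weight global scales into a single dequant alpha, while `w13_input_scale_quant` stores the reciprocal input scale used when quantizing activations. A minimal sketch of that scale bookkeeping follows; the tensor names and shapes here are assumptions for illustration, not the vLLM implementation:

```python
import torch
from torch.nn import Parameter

num_experts = 4
# Hypothetical per-expert global scales, standing in for the
# checkpoint-loaded ones (one column per gate/up projection).
w13_input_scale = torch.rand(num_experts, 2)
w13_weight_scale_2 = torch.rand(num_experts)

# Collapse gate/up into a single per-expert input scale, as in the hunk above.
s_in = w13_input_scale.max(dim=1).values.to(torch.float32)

# GEMM1 dequant alpha: product of input and weight global scales.
g1_alphas = Parameter((s_in * w13_weight_scale_2).to(torch.float32),
                      requires_grad=False)

# Reciprocal scale applied to activations before FP4 quantization.
w13_input_scale_quant = Parameter((1 / s_in).to(torch.float32),
                                  requires_grad=False)
```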
@@ -1642,7 +1641,6 @@ def apply(
         from vllm.model_executor.models.llama4 import Llama4MoE
 
         a1_gscale = layer.w13_input_scale_quant
-
         (hidden_states_fp4, hidden_states_scale_linear_fp4) = (
             flashinfer.fp4_quantize(
                 x,
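
The `apply` hunk sits just before the activation-quantization call. A conceptual, pure-PyTorch sketch of global-scale quantization is shown below; it only emulates the idea and is not flashinfer's `fp4_quantize` kernel, which also emits per-block scale factors and packs values into FP4 codes:

```python
import torch

FP4_MAX = 6.0  # largest magnitude representable in e2m1 FP4

def quantize_with_global_scale(x: torch.Tensor, a1_gscale: torch.Tensor) -> torch.Tensor:
    # a1_gscale is the precomputed reciprocal input scale (1 / input_scale),
    # mirroring w13_input_scale_quant above.
    x_scaled = x.to(torch.float32) * a1_gscale
    # Clamp to the FP4 dynamic range; real kernels round to e2m1 codes.
    return x_scaled.clamp(-FP4_MAX, FP4_MAX)

x = torch.randn(8, 16)
print(quantize_with_global_scale(x, torch.tensor(0.5)).shape)  # torch.Size([8, 16])
```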