250 changes: 0 additions & 250 deletions tests/kernels/moe/test_modular_oai_triton_moe.py

This file was deleted.

35 changes: 9 additions & 26 deletions vllm/lora/layers/fused_moe.py
@@ -20,24 +20,15 @@
     _get_config_dtype_str,
 )
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
-    MarlinExperts,
+    modular_marlin_fused_moe,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe import (
-    TritonExperts,
+    modular_triton_fused_moe,
     try_get_optimal_moe_config,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import (
     FusedMoEModularMethod,
 )
-from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
-    UnfusedOAITritonExperts,
-)
-from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEModularKernel,
-)
-from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    MoEPrepareAndFinalizeNoEP,
-)
 
 from .utils import _get_lora_device
 
@@ -123,23 +114,15 @@ def _inject_lora_into_fused_moe(self):
         self.base_layer.ensure_moe_quant_config_init()
         quant_config = self.base_layer.quant_method.moe_quant_config
 
-        prepare_finalize = MoEPrepareAndFinalizeNoEP()
-        m_fused_moe_fn = FusedMoEModularKernel(
-            prepare_finalize,
-            self.base_layer.quant_method.select_gemm_impl(
-                prepare_finalize, self.base_layer
-            ),
-            self.base_layer.shared_experts,
-            getattr(self.base_layer, "shared_experts_stream", None),
-        )
-        if quant_config.use_mxfp4_w4a16:
-            assert isinstance(
-                m_fused_moe_fn.fused_experts, (MarlinExperts, UnfusedOAITritonExperts)
+        m_fused_moe_fn = (
+            modular_triton_fused_moe(
+                quant_config, shared_experts=self.base_layer.shared_experts
             )
-        else:
-            assert isinstance(
-                m_fused_moe_fn.fused_experts, (MarlinExperts, TritonExperts)
+            if not quant_config.use_mxfp4_w4a16
+            else modular_marlin_fused_moe(
+                quant_config, shared_experts=self.base_layer.shared_experts
             )
+        )
 
         def fwd_decorator(layer, func):
             def wrapper(*args, **kwargs):
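For readers skimming the diff, the new body of _inject_lora_into_fused_moe is just a conditional expression: it builds the Marlin-based modular kernel when quant_config.use_mxfp4_w4a16 is set and the Triton-based one otherwise, instead of hand-assembling a FusedMoEModularKernel around MoEPrepareAndFinalizeNoEP and asserting on the resulting experts type. Below is a minimal, self-contained sketch of that selection pattern. The StubQuantConfig class and the build_* / select_fused_moe_fn functions are illustrative stand-ins introduced for this sketch; they are not vLLM's FusedMoEQuantConfig or its modular_triton_fused_moe / modular_marlin_fused_moe helpers.

# Minimal, self-contained sketch of the selection pattern used in the new code.
# All names below are illustrative stand-ins, not the real vLLM implementations.
from dataclasses import dataclass


@dataclass
class StubQuantConfig:
    # Mirrors the only field the selection logic reads.
    use_mxfp4_w4a16: bool = False


def build_triton_experts(quant_config, shared_experts=None):
    # Stand-in for modular_triton_fused_moe(quant_config, shared_experts=...).
    return ("triton", shared_experts)


def build_marlin_experts(quant_config, shared_experts=None):
    # Stand-in for modular_marlin_fused_moe(quant_config, shared_experts=...).
    return ("marlin", shared_experts)


def select_fused_moe_fn(quant_config, shared_experts=None):
    # Same conditional expression as the new code: Triton experts by default,
    # Marlin experts when the MoE weights are MXFP4 W4A16 quantized.
    return (
        build_triton_experts(quant_config, shared_experts=shared_experts)
        if not quant_config.use_mxfp4_w4a16
        else build_marlin_experts(quant_config, shared_experts=shared_experts)
    )


if __name__ == "__main__":
    assert select_fused_moe_fn(StubQuantConfig())[0] == "triton"
    assert select_fused_moe_fn(StubQuantConfig(use_mxfp4_w4a16=True))[0] == "marlin"

Compared with the removed path, the choice of GEMM implementation now lives inside the two builder helpers, so the manual prepare/finalize wiring and the isinstance assertions in the LoRA layer are no longer needed.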