17 changes: 17 additions & 0 deletions torchtitan/config/job_config.py
@@ -416,6 +416,23 @@ class Parallelism:
Note that this is still an experimental feature.
"""

expert_parallel_comm_backend: Literal["standard", "deepep"] = "standard"
"""
Expert-parallel communication backend. No effect for non-MoE models or when ep = 1.

- "standard": Uses PyTorch all-to-all collectives (default)
- "deepep": Uses DeepEP custom kernels for more efficient communication

Using "deepep" requires installing DeepEP separately:
https://github.com/deepseek-ai/DeepEP.
"""

deepep_use_alignment_padding: bool = False
Contributor

Why is this optional? IIUC this is a must in order to use torch._grouped_mm.

Author

Set to optional because torch._grouped_mm uses CUTLASS grouped GEMM underneath, and CUTLASS's 16-byte alignment requirement for grouped GEMM applies to the contiguous dimension. For A[M, K] stored row-major (K contiguous), that is the K dimension:

https://github.com/NVIDIA/cutlass/blob/d3a5492381a457e59a1fd27d97bb87c7ca95ee6e/include/cutlass/gemm/device/gemm_array.h#L344-L363

Technically, in our case moe_intermediate_size=2048 and hidden_size=7168 are already multiples of 8 elements. I have run without padding and it worked. I'm still leaving it as an option here in case another underlying library requires padding on the M dim.
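A quick back-of-the-envelope check of this point (editor's illustration; the dtype is assumed to be bf16, i.e. 2 bytes per element, so 16 bytes corresponds to 8 elements on the contiguous K dimension):

```python
# Editor's sketch: CUTLASS grouped GEMM wants 16-byte alignment on the
# contiguous (K) dimension. Assuming bf16 tensors (2 bytes per element),
# that means K must be a multiple of 8 elements.
ALIGN_BYTES = 16
BYTES_PER_ELEM = 2  # bf16 -- an assumption, not stated in the thread
align_elems = ALIGN_BYTES // BYTES_PER_ELEM  # 8

for name, k in [("moe_intermediate_size", 2048), ("hidden_size", 7168)]:
    assert k % align_elems == 0
    print(f"{name}={k}: multiple of {align_elems} elements, no K-dim padding needed")
```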

Contributor

IIUC:

  • padding is along the token dimension, not the moe_intermediate_size or hidden_size dimension.
  • it won't error out unless some experts accidentally get 0 tokens, which may not have happened in your test.

I could be wrong. @ngimel could you advise?

"""
Whether to use alignment padding for DeepEP token dispatch.
Only applies when expert_parallel_comm_backend="deepep".
"""


@dataclass
class Checkpoint:
7 changes: 6 additions & 1 deletion torchtitan/distributed/__init__.py
@@ -13,9 +13,14 @@
from torch.distributed.tensor.placement_types import Placement

from torchtitan.distributed.parallel_dims import ParallelDims
from torchtitan.distributed.expert_parallel import DeepEPExpertParallel


__all__ = ["ParallelDims", "NoParallel"]
__all__ = [
"ParallelDims",
"NoParallel",
"DeepEPExpertParallel",
Contributor

let's not expose this here for now

Author

DeepEPExpertParallel needs to be used in the parallelize.py files; are you suggesting we don't integrate it into the models for now?

]
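For reference on the exchange above: the alternative to re-exporting from the package root is for call sites (e.g. the parallelize.py files) to import from the defining module, which this diff already does. A one-line sketch:

```python
# Editor's sketch: import directly from the defining module instead of from
# the torchtitan.distributed package root.
from torchtitan.distributed.expert_parallel import DeepEPExpertParallel
```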


# NOTE: This is to achieve replicate computation on the gate module in the MoE router.
19 changes: 19 additions & 0 deletions torchtitan/distributed/deepep/__init__.py
@@ -0,0 +1,19 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""DeepEP distributed communication primitives for MoE."""

from .deepep import (
dispatch_tokens,
combine_tokens,
DispatchState,
)

__all__ = [
"dispatch_tokens",
"combine_tokens",
"DispatchState",
]
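The names exported here are defined in this PR, but their signatures are not shown in this diff, so rather than guess them, here is an editor's toy, single-process illustration of the dispatch/combine pattern these primitives implement across EP ranks:

```python
# Editor's toy illustration (no DeepEP, no distributed state): "dispatch"
# groups tokens by the expert they are routed to, remembering the permutation;
# "combine" inverts it to restore the original token order.
tokens = ["t0", "t1", "t2", "t3"]
expert_of = [1, 0, 1, 0]  # top-1 routing decision per token

# Dispatch: sort token indices by destination expert.
order = sorted(range(len(tokens)), key=lambda i: expert_of[i])  # [1, 3, 0, 2]
dispatched = [tokens[i] for i in order]  # tokens grouped by expert

# ... each expert processes its contiguous slice of `dispatched` ...
processed = [t + "'" for t in dispatched]

# Combine: invert the permutation to put outputs back in original order.
combined = [None] * len(tokens)
for pos, i in enumerate(order):
    combined[i] = processed[pos]
print(combined)  # ["t0'", "t1'", "t2'", "t3'"]
```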