
Commit eb92af3

Add test for internode latency.
1 parent 89e33c0 commit eb92af3

5 files changed

Lines changed: 331 additions & 202 deletions


tests/alltoall.py

Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
import torch
import torch.distributed as dist
from typing import List, Tuple, Optional, Union

from deep_ep import Buffer, EventOverlap

# Communication buffer (allocated at runtime)
_buffer: Optional[Buffer] = None

# Set the number of SMs to use
# NOTES: this is a static variable
# Buffer.set_num_sms(24)


# You may call this function at framework initialization
def get_buffer(group: dist.ProcessGroup, hidden_bytes: int) -> Buffer:
    global _buffer

    # NOTES: you may also replace `get_*_config` with auto-tuned results from the tests
    num_nvl_bytes, num_rdma_bytes = 0, 0
    for config in (Buffer.get_dispatch_config(group.size()), Buffer.get_combine_config(group.size())):
        num_nvl_bytes = max(config.get_nvl_buffer_size_hint(hidden_bytes, group.size()), num_nvl_bytes)
        num_rdma_bytes = max(config.get_rdma_buffer_size_hint(hidden_bytes, group.size()), num_rdma_bytes)

    # Allocate a buffer if none exists yet or the existing one is too small
    if _buffer is None or _buffer.group != group or _buffer.num_nvl_bytes < num_nvl_bytes or _buffer.num_rdma_bytes < num_rdma_bytes:
        _buffer = Buffer(group, num_nvl_bytes, num_rdma_bytes)
    return _buffer


def get_hidden_bytes(x: torch.Tensor) -> int:
    t = x[0] if isinstance(x, tuple) else x
    return t.size(1) * max(t.element_size(), 2)


def dispatch_forward(
        x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
        topk_idx: torch.Tensor,
        topk_weights: torch.Tensor,
        num_experts: int,
        previous_event: Optional[EventOverlap] = None,
        async_finish: bool = False,
        allocate_on_comm_stream: bool = False
) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], torch.Tensor, torch.Tensor, List, Tuple, EventOverlap]:
    # NOTES: the optional `previous_event` is a captured CUDA event that the dispatch kernel
    # should wait on; it is useful for communication-computation overlap. For more information,
    # please refer to the docs of `Buffer.dispatch`
    global _buffer

    # Calculate layout before actual dispatch
    num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank, previous_event = _buffer.get_dispatch_layout(
        topk_idx,
        num_experts,
        previous_event=previous_event,
        async_finish=async_finish,
        allocate_on_comm_stream=allocate_on_comm_stream
    )

    # Do MoE dispatch
    # NOTES: the CPU will wait for the GPU's signal to arrive, so this is not compatible with CUDA graphs
    # For more advanced usage, please refer to the docs of the `dispatch` function
    recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, event = _buffer.dispatch(
        x,
        topk_idx=topk_idx,
        topk_weights=topk_weights,
        num_tokens_per_rank=num_tokens_per_rank,
        num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
        is_token_in_rank=is_token_in_rank,
        num_tokens_per_expert=num_tokens_per_expert,
        previous_event=previous_event,
        async_finish=async_finish,
        allocate_on_comm_stream=allocate_on_comm_stream
    )

    # For event management, please refer to the docs of the `EventOverlap` class
    return recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, event


def dispatch_backward(
        grad_recv_x: torch.Tensor,
        grad_recv_topk_weights: torch.Tensor,
        handle: Tuple,
        previous_event: Optional[EventOverlap] = None,
        async_finish: bool = False,
        allocate_on_comm_stream: bool = False
) -> Tuple[torch.Tensor, torch.Tensor, EventOverlap]:
    global _buffer

    # The backward pass of MoE dispatch is actually a combine
    # For more advanced usage, please refer to the docs of the `combine` function
    combined_grad_x, combined_grad_recv_topk_weights, event = _buffer.combine(
        grad_recv_x,
        handle,
        topk_weights=grad_recv_topk_weights,
        previous_event=previous_event,
        async_finish=async_finish,
        allocate_on_comm_stream=allocate_on_comm_stream
    )

    # For event management, please refer to the docs of the `EventOverlap` class
    return combined_grad_x, combined_grad_recv_topk_weights, event


def combine_forward(
        x: torch.Tensor,
        handle: Tuple,
        previous_event: Optional[EventOverlap] = None,
        async_finish: bool = False,
        allocate_on_comm_stream: bool = False
) -> Tuple[torch.Tensor, EventOverlap]:
    global _buffer

    # Do MoE combine
    # For more advanced usage, please refer to the docs of the `combine` function
    combined_x, _, event = _buffer.combine(
        x,
        handle,
        async_finish=async_finish,
        previous_event=previous_event,
        allocate_on_comm_stream=allocate_on_comm_stream)

    # For event management, please refer to the docs of the `EventOverlap` class
    return combined_x, event


def combine_backward(
        grad_combined_x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
        handle: Tuple,
        previous_event: Optional[EventOverlap] = None,
        async_finish: bool = False,
        allocate_on_comm_stream: bool = False
) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], EventOverlap]:
    global _buffer

    # The backward pass of MoE combine is actually a dispatch
    # For more advanced usage, please refer to the docs of the `dispatch` function
    grad_x, _, _, _, _, event = _buffer.dispatch(
        grad_combined_x,
        handle=handle,
        async_finish=async_finish,
        previous_event=previous_event,
        allocate_on_comm_stream=allocate_on_comm_stream
    )

    # For event management, please refer to the docs of the `EventOverlap` class
    return grad_x, event
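
Note on usage: the helpers above follow DeepEP's intended flow of dispatch -> expert compute -> combine. The sketch below is a hypothetical caller-side example, not part of this commit; `group`, `x`, `topk_idx`, `topk_weights`, `num_experts`, and `run_experts` are assumed to be provided by the surrounding MoE layer.

import alltoall

def moe_forward(group, x, topk_idx, topk_weights, num_experts, run_experts):
    # Lazily (re)allocate the communication buffer for the current hidden size
    alltoall.get_buffer(group, alltoall.get_hidden_bytes(x))

    # Scatter tokens to the ranks that own their selected experts
    recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, _ = \
        alltoall.dispatch_forward(x, topk_idx, topk_weights, num_experts)

    # Run the local experts on the received tokens (user-provided callable)
    expert_out = run_experts(recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list)

    # Return the expert outputs to the original ranks and token order
    combined_x, _ = alltoall.combine_forward(expert_out, handle)
    return combined_x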

tests/run_test_internode.sh

Lines changed: 26 additions & 19 deletions
@@ -4,29 +4,37 @@ WORK_ROOT=/root/paddlejob/workspace/env_run/liuyiqun
 export PYTHONPATH=${WORK_ROOT}/env/virtualenvs_cuda12.8/torch_py310_yiqun
 export PATH=${PYTHONPATH}/bin:${PATH}
 
-python -c "import torch; print(torch.__version__)"
-
 export PYTHONPATH=${WORK_ROOT}/PaPerf:$PYTHONPATH
 
 #export NVSHMEM_DIR=$ROOT_DIR/third-party/nvshmem
 #export LD_LIBRARY_PATH="${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH"
 
-export MASTER_ADDR=10.54.95.204
-export MASTER_PORT=8367
-export WORLD_SIZE=8
-
-START_NODE=0
-END_NODE=$((${START_NODE} + ${WORLD_SIZE}))
-export RANK=$(($PADDLE_TRAINER_ID - ${START_NODE}))
-
-if [ ${PADDLE_TRAINER_ID} -lt ${START_NODE} ]; then
-  echo "$PADDLE_TRAINER_ID exit"
-  exit
-elif [ ${PADDLE_TRAINER_ID} -ge ${END_NODE} ]; then
-  echo "$PADDLE_TRAINER_ID exit"
-  exit
+START_RANK=46
+END_RANK=54
+
+if [[ ${PADDLE_TRAINER_ID} -lt $START_RANK ]]; then
+  exit 0
+fi
+
+if [[ ${PADDLE_TRAINER_ID} -ge $END_RANK ]]; then
+  exit 0
 fi
 
+rank=$(($PADDLE_TRAINER_ID - $START_RANK))
+nnodes=$(($END_RANK - $START_RANK))
+echo "rank: ${rank}, nnodes: ${nnodes}"
+
+python -c "import torch; print(torch.__version__)"
+
+#master=`cat /root/paddlejob/workspace/hostfile | head -n 1 | awk '{print $1}'`
+export MASTER_ADDR="10.95.238.87" # 46
+#master="10.95.238.99" # 48
+#master="10.95.237.154" # 32
+#master="10.95.244.212" # 8
+export MASTER_PORT=8367
+export WORLD_SIZE=$nnodes
+export RANK=$rank
+
 export NCCL_DEBUG=WARN
 #export NVSHMEM_DEBUG=DEBUG
 #export NVSHMEM_DEBUG=TRACE
@@ -46,7 +54,7 @@ export NVSHMEM_IB_TRAFFIC_CLASS=162
 #export NVSHMEM_IB_ENABLE_IBGDA=true
 #export NVSHMEM_DISABLE_P2P=1
 export NVSHMEM_BOOTSTRAP=UID
-export NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME==xgbe0
+export NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME=xgbe0
 #export NVSHMEM_BOOTSTRAP_UID_SOCK_FAMILY=AF_INET
 
 #export NVSHMEM_DEBUG=INFO
@@ -57,5 +65,4 @@ export PATH=/opt/nvidia/nsight-systems/2025.1.1/bin:$PATH
 
 rm -rf core.*
 
-${nsys_args} python test_internode.py
-#${nsys_args} python test_simple.py
+${nsys_args} python test_internode_latency.py
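
Note on the launch setup: this script selects nodes with PADDLE_TRAINER_ID in [START_RANK, END_RANK), derives the node rank and node count from it, and exports MASTER_ADDR, MASTER_PORT, WORLD_SIZE (number of nodes), and RANK (node rank) for rendezvous. The `utils.init_dist` used by the test is not part of this commit; the sketch below only illustrates, under that assumption, how such an initializer typically consumes these variables.

# Hedged sketch only; the real utils.init_dist may differ.
import os
import torch
import torch.distributed as dist

def init_dist(local_rank: int, num_local_ranks: int):
    node_rank = int(os.getenv('RANK', 0))        # node rank exported by the script
    num_nodes = int(os.getenv('WORLD_SIZE', 1))  # number of nodes, not processes
    dist.init_process_group(
        backend='nccl',
        init_method=f"tcp://{os.getenv('MASTER_ADDR')}:{os.getenv('MASTER_PORT')}",
        rank=node_rank * num_local_ranks + local_rank,
        world_size=num_nodes * num_local_ranks,
    )
    torch.cuda.set_device(local_rank)
    group = dist.new_group(list(range(num_nodes * num_local_ranks)))
    return dist.get_rank(), dist.get_world_size(), group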

tests/test_internode_latency.py

Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@
import os
import sys
import time
import numpy as np

import torch
import torch.distributed as dist

# noinspection PyUnresolvedReferences
import alltoall
import utils
from utils import init_dist, create_grouped_scores

try:
    from paperf import profile_torch
    has_paperf = True
except ImportError:
    has_paperf = False


def print_tensor_info(t, name):
    #print(f"-- {name}: data_ptr={t.untyped_storage().data_ptr()}, shape={t.size()}, dtype={t.dtype}")
    print(f"-- {name}: shape={t.size()}, dtype={t.dtype}")


def test_main(local_rank: int, num_local_ranks: int, num_ranks: int, num_nodes: int, rank: int, group: dist.ProcessGroup, use_random_input, dump_input):
    # Settings
    num_tokens = 4096
    hidden = 7168
    num_topk_groups = min(num_nodes, 4)
    num_topk = 8
    num_experts = (256 // num_ranks) * num_ranks

    assert num_experts % num_ranks == 0 and num_local_ranks == 8
    if local_rank == 0:
        print(f'[config] num_tokens={num_tokens}, hidden={hidden}, num_topk_groups={num_topk_groups}, num_topk={num_topk}', flush=True)

    if use_random_input:
        # Random data
        x = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device='cuda') * rank
        x_pure_rand = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
        #x_e4m3 = per_token_cast_to_fp8(x)

        scores = torch.randn((num_tokens, num_experts), dtype=torch.float32, device='cuda').abs() + 1
        group_scores = scores.view(num_tokens, num_nodes, -1).amax(dim=-1)
        group_idx = torch.topk(group_scores, k=num_topk_groups, dim=-1, sorted=False).indices
        masked_scores = create_grouped_scores(scores, group_idx, num_nodes)

        topk_idx = torch.topk(masked_scores, num_topk, dim=-1, largest=True, sorted=False)[1]
        topk_weights = torch.ones((num_tokens, num_topk), dtype=torch.float32, device='cuda') * rank
        topk_weights_pure_rand = torch.randn((num_tokens, num_topk), dtype=torch.float32, device='cuda')

        if dump_input:
            utils.dump(x, 'x', local_rank)
            utils.dump(x_pure_rand, 'x_pure_rand', local_rank)
            #utils.dump(x_e4m3, 'x_e4m3', local_rank)

            utils.dump(topk_idx, 'topk_idx', local_rank)
            utils.dump(topk_weights, 'topk_weights', local_rank)
            utils.dump(topk_weights_pure_rand, 'topk_weights_pure_rand', local_rank)
    else:
        x = utils.load("x", local_rank)
        x_pure_rand = utils.load("x_pure_rand", local_rank)
        #x_e4m3 = utils.load("x_e4m3", local_rank, "tuple")

        topk_idx = utils.load("topk_idx", local_rank)
        topk_weights = utils.load("topk_weights", local_rank)
        topk_weights_pure_rand = utils.load("topk_weights_pure_rand", local_rank)

    profile = False
    profile = profile and has_paperf

    # Test bfloat16
    buffer = alltoall.get_buffer(group, alltoall.get_hidden_bytes(x))

    if profile:
        profile_torch.switch_profile(0, 0, 1)

    num_warmups = 100
    num_tests = 1000

    start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
    end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]

    for i in range(num_warmups + num_tests):
        if i == num_warmups:
            group.barrier()
            torch.cuda.synchronize()
            cpu_start = time.time()

        if i >= num_warmups:
            # Record the start of a measured iteration
            batch_start = time.time()
            start_events[i - num_warmups].record()

        #group.barrier()
        recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, dispatch_event = alltoall.dispatch_forward(
            x=x,
            topk_idx=topk_idx,
            topk_weights=topk_weights,
            num_experts=num_experts,
            previous_event=None,
            async_finish=False,
            allocate_on_comm_stream=False
        )

        combined_x, event = alltoall.combine_forward(
            x=recv_x,
            handle=handle,
            previous_event=None,
            async_finish=False,
            allocate_on_comm_stream=False
        )

        if i >= num_warmups:
            # Only measured iterations record an end event and per-iteration CPU time
            end_events[i - num_warmups].record()
            batch_time = time.time() - batch_start
            if local_rank == 0:
                print(f"-- {i - num_warmups}-th running, cpu_time: {batch_time:.5f} s")

    torch.cuda.synchronize()
    group.barrier()

    cpu_runtime = time.time() - cpu_start
    avg_cpu_time = cpu_runtime / num_tests

    gpu_times = np.array([s.elapsed_time(e) / 1e3 for s, e in zip(start_events, end_events)])[1:]
    avg_gpu_time = np.average(gpu_times)

    print(f"-- rank: {rank}, avg_cpu_time: {avg_cpu_time:.5f} s, avg_gpu_time: {avg_gpu_time:.5f} s")

    torch.cuda.synchronize()
    group.barrier()

    avg_cpu_time_all_ranks = [None, ] * num_ranks
    avg_gpu_time_all_ranks = [None, ] * num_ranks
    dist.all_gather_object(avg_cpu_time_all_ranks, avg_cpu_time, group=group)
    dist.all_gather_object(avg_gpu_time_all_ranks, avg_gpu_time, group=group)
    if rank == 0:
        avg_cpu_time = np.average(np.array(avg_cpu_time_all_ranks))
        avg_gpu_time = np.average(np.array(avg_gpu_time_all_ranks))
        print(f"-- avg_cpu_time_of_all_ranks: {avg_cpu_time:.5f} s, avg_gpu_time_of_all_ranks: {avg_gpu_time:.5f} s")


def test_loop(local_rank: int, num_local_ranks: int):
    num_nodes = int(os.getenv('WORLD_SIZE', 1))
    rank, num_ranks, group = init_dist(local_rank, num_local_ranks)

    assert num_local_ranks == 8 and num_ranks > 8
    torch.manual_seed(rank)

    use_random_input = True
    dump_input = False

    test_main(local_rank, num_local_ranks, num_ranks, num_nodes, rank, group, use_random_input, dump_input)


if __name__ == '__main__':
    num_processes = 8
    torch.multiprocessing.spawn(test_loop, args=(num_processes, ), nprocs=num_processes)
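
Note on the measurement: each iteration times one dispatch_forward + combine_forward round trip twice, once with host wall-clock time (meaningful here because async_finish=False makes the calls blocking) and once with paired CUDA events, and the per-rank averages are then gathered with all_gather_object. Averages can hide tail latency; the snippet below is an optional post-processing sketch, not part of this commit, that summarizes the same `gpu_times` array (in seconds) with percentiles.

import numpy as np

def summarize_latency(gpu_times: np.ndarray) -> dict:
    # Mean, median, tail, and worst-case latency of the measured iterations
    return {
        'mean_s': float(np.mean(gpu_times)),
        'p50_s': float(np.percentile(gpu_times, 50)),
        'p99_s': float(np.percentile(gpu_times, 99)),
        'max_s': float(np.max(gpu_times)),
    }

# Example: on each rank, print(summarize_latency(gpu_times)) after the timing loop.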
