Skip to content

Commit 0ed0c8c

Browse files
committed
update switch channel
1 parent 25435ac commit 0ed0c8c

15 files changed: +212 additions, −114 deletions

docs/guide/mscclpp-torch-integration.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ class CustomizedComm:
129129
self._algo_large = [
130130
algo for algo in algorithms
131131
if algo.collective == "allreduce"
132-
and algo.name == "default_allreduce_nvls_with_copy"
132+
and algo.name == "default_allreduce_nvls_warp_pipeline"
133133
][0]
134134

135135
def all_reduce(self, tensor: torch.Tensor, stream=None):

examples/torch-integration/customized_comm_with_default_algo.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def __init__(self, comm: mscclpp.CommGroup):
6161
self._algorithm_nvls_nonzero_copy = [
6262
algo
6363
for algo in algorithms
64-
if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_with_copy"
64+
if algo.collective == "allreduce" and algo.name == "default_allreduce_nvls_warp_pipeline"
6565
][0]
6666

6767
def all_reduce(self, tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM, stream: torch.cuda.Stream = None):

src/core/gpu_ipc_mem.cc

Lines changed: 35 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -249,8 +249,13 @@ UniqueGpuIpcMemHandle GpuIpcMemHandle::createMulticast([[maybe_unused]] size_t b
249249
}
250250

251251
if (handle->typeFlags == GpuIpcMemHandle::Type::None) {
252+
cuMemRelease(allocHandle);
252253
THROW(GPU, Error, ErrorCode::SystemError, "createMulticast failed: neither POSIX FD nor FABRIC handle was created");
253254
}
255+
256+
// Release the local allocation handle. The exported POSIX FD / Fabric handle keeps the
257+
// multicast object alive. Each importer will get its own handle via cuMemImportFromShareableHandle.
258+
MSCCLPP_CUTHROW(cuMemRelease(allocHandle));
254259
return handle;
255260
#else // !(CUDA_NVLS_API_AVAILABLE)
256261
THROW(GPU, Error, ErrorCode::InvalidUsage,
@@ -418,41 +423,45 @@ std::shared_ptr<void> GpuIpcMem::mapMulticast([[maybe_unused]] int numDevices, [
418423
// This will block until all devices call cuMulticastAddDevice()
419424
MSCCLPP_CUTHROW(cuMulticastBindAddr(allocHandle_, mcOffset, bufferAddr, bufferSize, 0));
420425

426+
// cuMemMap requires offset to be 0 for multicast handles, so we map the entire range
427+
// [0, mcOffset + bufferSize) and return a pointer at mcPtr + mcOffset. This only consumes
428+
// extra virtual address space for the mcOffset region; no additional physical memory is used.
429+
size_t mapSize = mcOffset + bufferSize;
421430
CUdeviceptr mcPtr;
422-
MSCCLPP_CUTHROW(cuMemAddressReserve(&mcPtr, bufferSize, minMcGran, 0U, 0));
423-
MSCCLPP_CUTHROW(cuMemMap(mcPtr, bufferSize, 0, allocHandle_, 0));
431+
MSCCLPP_CUTHROW(cuMemAddressReserve(&mcPtr, mapSize, minMcGran, 0U, 0));
432+
MSCCLPP_CUTHROW(cuMemMap(mcPtr, mapSize, 0, allocHandle_, 0));
424433

425434
CUmemAccessDesc accessDesc = {};
426435
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
427436
accessDesc.location.id = deviceId;
428437
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
429-
MSCCLPP_CUTHROW(cuMemSetAccess(mcPtr, bufferSize, &accessDesc, 1));
438+
MSCCLPP_CUTHROW(cuMemSetAccess(mcPtr, mapSize, &accessDesc, 1));
430439

431440
// Return shared_ptr with custom deleter that unmaps and unbinds
432441
CUmemGenericAllocationHandle allocHandle = allocHandle_;
433-
return std::shared_ptr<void>(
434-
reinterpret_cast<void*>(mcPtr), [self = shared_from_this(), mcOffset, bufferSize, allocHandle](void* ptr) {
435-
CUresult res;
436-
const char* errStr;
437-
438-
res = cuMemUnmap((CUdeviceptr)ptr, bufferSize);
439-
if (res != CUDA_SUCCESS) {
440-
(void)cuGetErrorString(res, &errStr);
441-
WARN(GPU, "Failed to unmap CUDA memory at pointer ", (void*)ptr, ": ", errStr);
442-
}
443-
444-
res = cuMemAddressFree((CUdeviceptr)ptr, bufferSize);
445-
if (res != CUDA_SUCCESS) {
446-
(void)cuGetErrorString(res, &errStr);
447-
WARN(GPU, "Failed to free CUDA memory at pointer ", (void*)ptr, ": ", errStr);
448-
}
449-
450-
int deviceId;
451-
CUdevice device;
452-
if (cudaGetDevice(&deviceId) == cudaSuccess && cuDeviceGet(&device, deviceId) == CUDA_SUCCESS) {
453-
(void)cuMulticastUnbind(allocHandle, device, mcOffset, bufferSize);
454-
}
455-
});
442+
return std::shared_ptr<void>(reinterpret_cast<void*>(mcPtr + mcOffset), [self = shared_from_this(), mcPtr, mapSize,
443+
mcOffset, bufferSize, allocHandle](void*) {
444+
CUresult res;
445+
const char* errStr;
446+
447+
res = cuMemUnmap(mcPtr, mapSize);
448+
if (res != CUDA_SUCCESS) {
449+
(void)cuGetErrorString(res, &errStr);
450+
WARN(GPU, "Failed to unmap CUDA memory at pointer ", (void*)mcPtr, ": ", errStr);
451+
}
452+
453+
res = cuMemAddressFree(mcPtr, mapSize);
454+
if (res != CUDA_SUCCESS) {
455+
(void)cuGetErrorString(res, &errStr);
456+
WARN(GPU, "Failed to free CUDA memory at pointer ", (void*)mcPtr, ": ", errStr);
457+
}
458+
459+
int deviceId;
460+
CUdevice device;
461+
if (cudaGetDevice(&deviceId) == cudaSuccess && cuDeviceGet(&device, deviceId) == CUDA_SUCCESS) {
462+
(void)cuMulticastUnbind(allocHandle, device, mcOffset, bufferSize);
463+
}
464+
});
456465
#else // !(CUDA_NVLS_API_AVAILABLE)
457466
THROW(GPU, Error, ErrorCode::InvalidUsage,
458467
"NVLS is not supported on this device (requires CUDA version >= 12.3 and Linux kernel version >= 5.6.0)");

src/ext/collectives/algorithm_collection_builder.cc

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@
88
#include "allgather/allgather_fullmesh_2.hpp"
99
#include "allreduce/allreduce_allpair_packet.hpp"
1010
#include "allreduce/allreduce_fullmesh.hpp"
11-
#include "allreduce/allreduce_nvls.hpp"
11+
#include "allreduce/allreduce_nvls_zero_copy.hpp"
1212
#include "allreduce/allreduce_nvls_packet.hpp"
13-
#include "allreduce/allreduce_nvls_with_copy.hpp"
14-
#include "allreduce/allreduce_nvls_with_copy_2.hpp"
13+
#include "allreduce/allreduce_nvls_warp_pipeline.hpp"
14+
#include "allreduce/allreduce_nvls_block_pipeline.hpp"
1515
#include "allreduce/allreduce_packet.hpp"
1616
#include "allreduce/allreduce_rsag.hpp"
1717
#include "allreduce/allreduce_rsag_pipeline.hpp"
@@ -72,12 +72,14 @@ AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultNativeAlgorithms(uin
7272
auto allreduceNvlsPacket =
7373
std::make_shared<AllreduceNvlsPacket>(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build();
7474
collection.registerAlgorithm(allreduceNvlsPacket->collective(), allreduceNvlsPacket->name(), allreduceNvlsPacket);
75-
auto allreduceNvlsWithCopy = std::make_shared<AllreduceNvlsWithCopy>(scratchBuffer, scratchBufferSize)->build();
76-
collection.registerAlgorithm(allreduceNvlsWithCopy->collective(), allreduceNvlsWithCopy->name(),
77-
allreduceNvlsWithCopy);
78-
auto allreduceNvlsWithCopy2 = std::make_shared<AllreduceNvlsWithCopy2>(scratchBuffer, scratchBufferSize)->build();
79-
collection.registerAlgorithm(allreduceNvlsWithCopy2->collective(), allreduceNvlsWithCopy2->name(),
80-
allreduceNvlsWithCopy2);
75+
auto allreduceNvlsWarpPipeline =
76+
std::make_shared<AllreduceNvlsWarpPipeline>(scratchBuffer, scratchBufferSize)->build();
77+
collection.registerAlgorithm(allreduceNvlsWarpPipeline->collective(), allreduceNvlsWarpPipeline->name(),
78+
allreduceNvlsWarpPipeline);
79+
auto allreduceNvlsBlockPipeline =
80+
std::make_shared<AllreduceNvlsBlockPipeline>(scratchBuffer, scratchBufferSize)->build();
81+
collection.registerAlgorithm(allreduceNvlsBlockPipeline->collective(), allreduceNvlsBlockPipeline->name(),
82+
allreduceNvlsBlockPipeline);
8183
auto allreducePkt =
8284
std::make_shared<AllreducePacket>(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build();
8385
collection.registerAlgorithm(allreducePkt->collective(), allreducePkt->name(), allreducePkt);

src/ext/collectives/allreduce/allreduce_nvls_with_copy_2.cu renamed to src/ext/collectives/allreduce/allreduce_nvls_block_pipeline.cu

Lines changed: 28 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
#include <mscclpp/algorithm.hpp>
55

6-
#include "allreduce/allreduce_nvls_with_copy_2.hpp"
6+
#include "allreduce/allreduce_nvls_block_pipeline.hpp"
77
#include "allreduce/common.hpp"
88
#include "collective_utils.hpp"
99
#include "debug.h"
@@ -15,11 +15,12 @@ __device__ DeviceSemaphore deviceSemaphore[NUM_SEMAPHORES];
1515

1616
template <typename T>
1717
__global__ void __launch_bounds__(1024, 1)
18-
allreduceNvlsWithCopy2([[maybe_unused]] const void* src, [[maybe_unused]] void* scratch, [[maybe_unused]] void* dst,
19-
[[maybe_unused]] DeviceHandle<BaseMemoryChannel>* memoryChannels,
20-
[[maybe_unused]] DeviceHandle<SwitchChannel>* switchChannels, [[maybe_unused]] size_t size,
21-
[[maybe_unused]] size_t scratchBufferSize, [[maybe_unused]] int rank,
22-
[[maybe_unused]] int nRanksPerNode) {
18+
allreduceNvlsBlockPipeline([[maybe_unused]] const void* src, [[maybe_unused]] void* scratch,
19+
[[maybe_unused]] void* dst,
20+
[[maybe_unused]] DeviceHandle<BaseMemoryChannel>* memoryChannels,
21+
[[maybe_unused]] DeviceHandle<SwitchChannel>* switchChannels,
22+
[[maybe_unused]] size_t size, [[maybe_unused]] size_t scratchBufferSize,
23+
[[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerNode) {
2324
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
2425
constexpr int alignment = 16;
2526
int nPeers = nRanksPerNode - 1;
@@ -146,7 +147,7 @@ __global__ void __launch_bounds__(1024, 1)
146147
}
147148

148149
template <ReduceOp OpType, typename T>
149-
struct NvlsWithCopy2Adapter {
150+
struct NvlsBlockPipelineAdapter {
150151
static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*,
151152
DeviceHandle<SwitchChannel>* nvlsChannels, DeviceHandle<SwitchChannel>*, size_t, size_t,
152153
size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize,
@@ -162,15 +163,15 @@ struct NvlsWithCopy2Adapter {
162163
#endif
163164
{
164165
using ChannelType = DeviceHandle<BaseMemoryChannel>;
165-
allreduceNvlsWithCopy2<T>
166-
<<<nBlocks, nThreadsPerBlock, 0, stream>>>(input, scratch, output, (ChannelType*)memoryChannels,
167-
nvlsChannels, inputSize, scratchBufferSize, rank, nRanksPerNode);
166+
allreduceNvlsBlockPipeline<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(
167+
input, scratch, output, (ChannelType*)memoryChannels, nvlsChannels, inputSize, scratchBufferSize, rank,
168+
nRanksPerNode);
168169
return cudaGetLastError();
169170
}
170171
}
171172
};
172173

173-
void AllreduceNvlsWithCopy2::initialize(std::shared_ptr<Communicator> comm) {
174+
void AllreduceNvlsBlockPipeline::initialize(std::shared_ptr<Communicator> comm) {
174175
nSwitchChannels_ = 8;
175176
int nBaseChannels = 64;
176177
this->conns_ = setupConnections(comm);
@@ -180,14 +181,15 @@ void AllreduceNvlsWithCopy2::initialize(std::shared_ptr<Communicator> comm) {
180181
// setup base memory channels
181182
this->baseChannels_ = setupBaseMemoryChannels(this->conns_, memorySemaphores, nBaseChannels);
182183
this->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles(this->baseChannels_);
184+
this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_);
183185
}
184186

185-
CommResult AllreduceNvlsWithCopy2::allreduceKernelFunc(const std::shared_ptr<void> ctx_void, const void* input,
186-
void* output, size_t inputSize, DataType dtype, ReduceOp op,
187-
cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
188-
const std::unordered_map<std::string, uintptr_t>&) {
187+
CommResult AllreduceNvlsBlockPipeline::allreduceKernelFunc(const std::shared_ptr<void> ctx_void, const void* input,
188+
void* output, size_t inputSize, DataType dtype, ReduceOp op,
189+
cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
190+
const std::unordered_map<std::string, uintptr_t>&) {
189191
auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
190-
AllreduceFunc allreduce = dispatch<NvlsWithCopy2Adapter>(op, dtype);
192+
AllreduceFunc allreduce = dispatch<NvlsBlockPipelineAdapter>(op, dtype);
191193
if (!allreduce) {
192194
WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast<int>(dtype));
193195
return CommResult::CommInvalidArgument;
@@ -201,35 +203,35 @@ CommResult AllreduceNvlsWithCopy2::allreduceKernelFunc(const std::shared_ptr<voi
201203
ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0,
202204
blockAndThreadNum.first, blockAndThreadNum.second);
203205
if (error != cudaSuccess) {
204-
WARN("AllreduceNvlsWithCopy failed with error: %s", cudaGetErrorString(error));
206+
WARN("AllreduceNvlsBlockPipeline failed with error: %s", cudaGetErrorString(error));
205207
return CommResult::CommUnhandledCudaError;
206208
}
207209
return CommResult::CommSuccess;
208210
}
209211

210-
AlgorithmCtxKey AllreduceNvlsWithCopy2::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) {
212+
AlgorithmCtxKey AllreduceNvlsBlockPipeline::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) {
211213
return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0};
212214
}
213215

214-
std::shared_ptr<void> AllreduceNvlsWithCopy2::initAllreduceContext(std::shared_ptr<Communicator> comm, const void*,
215-
void*, size_t, DataType) {
216+
std::shared_ptr<void> AllreduceNvlsBlockPipeline::initAllreduceContext(std::shared_ptr<Communicator> comm,
217+
const void*, void*, size_t, DataType) {
216218
auto ctx = std::make_shared<AlgorithmCtx>();
217219
ctx->rank = comm->bootstrap()->getRank();
218220
ctx->workSize = comm->bootstrap()->getNranks();
219221
ctx->nRanksPerNode = comm->bootstrap()->getNranksPerNode();
220222

221223
// setup channels
222-
ctx->nvlsConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels_);
223224
ctx->switchChannels =
224-
setupNvlsChannels(ctx->nvlsConnections, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_);
225+
setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, scratchBufferSize_, nSwitchChannels_);
225226
ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels);
226227
return ctx;
227228
}
228229

229-
std::shared_ptr<Algorithm> AllreduceNvlsWithCopy2::build() {
230-
auto self = std::make_shared<AllreduceNvlsWithCopy2>(reinterpret_cast<uintptr_t>(scratchBuffer_), scratchBufferSize_);
230+
std::shared_ptr<Algorithm> AllreduceNvlsBlockPipeline::build() {
231+
auto self =
232+
std::make_shared<AllreduceNvlsBlockPipeline>(reinterpret_cast<uintptr_t>(scratchBuffer_), scratchBufferSize_);
231233
return std::make_shared<NativeAlgorithm>(
232-
"default_allreduce_nvls_with_copy2", "allreduce",
234+
"default_allreduce_nvls_block_pipeline", "allreduce",
233235
[self](std::shared_ptr<Communicator> comm) { self->initialize(comm); },
234236
[self](const std::shared_ptr<void> ctx, const void* input, void* output, size_t inputSize,
235237
[[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks,
@@ -247,4 +249,4 @@ std::shared_ptr<Algorithm> AllreduceNvlsWithCopy2::build() {
247249
}
248250

249251
} // namespace collective
250-
} // namespace mscclpp
252+
} // namespace mscclpp

src/ext/collectives/allreduce/allreduce_nvls_packet.cu

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,10 @@ struct AllreduceNvlsPacketAdapter {
7575
}
7676
};
7777

78-
void AllreduceNvlsPacket::initialize(std::shared_ptr<Communicator>) {}
78+
void AllreduceNvlsPacket::initialize(std::shared_ptr<Communicator> comm) {
79+
int nSwitchChannels = 1;
80+
this->nvlsConnections_ = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels);
81+
}
7982

8083
AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) {
8184
return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0};
@@ -90,9 +93,8 @@ std::shared_ptr<void> AllreduceNvlsPacket::initAllreduceContext(std::shared_ptr<
9093

9194
// setup channels
9295
int nSwitchChannels = 1;
93-
ctx->nvlsConnections = setupNvlsConnections(comm, nvlsBufferSize_, nSwitchChannels);
9496
ctx->switchChannels =
95-
setupNvlsChannels(ctx->nvlsConnections, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels);
97+
setupNvlsChannels(this->nvlsConnections_, this->scratchBuffer_, this->scratchBufferSize_, nSwitchChannels);
9698
ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles(ctx->switchChannels);
9799
return ctx;
98100
}

0 commit comments

Comments (0)