33
44#include < mscclpp/algorithm.hpp>
55
6- #include " allreduce/allreduce_nvls_with_copy_2 .hpp"
6+ #include " allreduce/allreduce_nvls_block_pipeline .hpp"
77#include " allreduce/common.hpp"
88#include " collective_utils.hpp"
99#include " debug.h"
@@ -15,11 +15,12 @@ __device__ DeviceSemaphore deviceSemaphore[NUM_SEMAPHORES];
1515
1616template <typename T>
1717__global__ void __launch_bounds__ (1024 , 1 )
18- allreduceNvlsWithCopy2([[maybe_unused]] const void * src, [[maybe_unused]] void * scratch, [[maybe_unused]] void * dst,
19- [[maybe_unused]] DeviceHandle<BaseMemoryChannel>* memoryChannels,
20- [[maybe_unused]] DeviceHandle<SwitchChannel>* switchChannels, [[maybe_unused]] size_t size,
21- [[maybe_unused]] size_t scratchBufferSize, [[maybe_unused]] int rank,
22- [[maybe_unused]] int nRanksPerNode) {
18+ allreduceNvlsBlockPipeline([[maybe_unused]] const void * src, [[maybe_unused]] void * scratch,
19+ [[maybe_unused]] void * dst,
20+ [[maybe_unused]] DeviceHandle<BaseMemoryChannel>* memoryChannels,
21+ [[maybe_unused]] DeviceHandle<SwitchChannel>* switchChannels,
22+ [[maybe_unused]] size_t size, [[maybe_unused]] size_t scratchBufferSize,
23+ [[maybe_unused]] int rank, [[maybe_unused]] int nRanksPerNode) {
2324#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
2425 constexpr int alignment = 16 ;
2526 int nPeers = nRanksPerNode - 1 ;
@@ -146,7 +147,7 @@ __global__ void __launch_bounds__(1024, 1)
146147}
147148
148149template <ReduceOp OpType, typename T>
149- struct NvlsWithCopy2Adapter {
150+ struct NvlsBlockPipelineAdapter {
150151 static cudaError_t call (const void * input, void * scratch, void * output, void * memoryChannels, void *,
151152 DeviceHandle<SwitchChannel>* nvlsChannels, DeviceHandle<SwitchChannel>*, size_t , size_t ,
152153 size_t scratchBufferSize, int rank, int nRanksPerNode, int , size_t inputSize,
@@ -162,15 +163,15 @@ struct NvlsWithCopy2Adapter {
162163#endif
163164 {
164165 using ChannelType = DeviceHandle<BaseMemoryChannel>;
165- allreduceNvlsWithCopy2 <T>
166- <<<nBlocks, nThreadsPerBlock, 0 , stream>>> ( input, scratch, output, (ChannelType*)memoryChannels,
167- nvlsChannels, inputSize, scratchBufferSize, rank, nRanksPerNode);
166+ allreduceNvlsBlockPipeline <T><<<nBlocks, nThreadsPerBlock, 0 , stream>>> (
167+ input, scratch, output, (ChannelType*)memoryChannels, nvlsChannels, inputSize, scratchBufferSize, rank ,
168+ nRanksPerNode);
168169 return cudaGetLastError ();
169170 }
170171 }
171172};
172173
173- void AllreduceNvlsWithCopy2 ::initialize (std::shared_ptr<Communicator> comm) {
174+ void AllreduceNvlsBlockPipeline ::initialize (std::shared_ptr<Communicator> comm) {
174175 nSwitchChannels_ = 8 ;
175176 int nBaseChannels = 64 ;
176177 this ->conns_ = setupConnections (comm);
@@ -180,14 +181,15 @@ void AllreduceNvlsWithCopy2::initialize(std::shared_ptr<Communicator> comm) {
180181 // setup base memory channels
181182 this ->baseChannels_ = setupBaseMemoryChannels (this ->conns_ , memorySemaphores, nBaseChannels);
182183 this ->memoryChannelsDeviceHandle_ = setupBaseMemoryChannelDeviceHandles (this ->baseChannels_ );
184+ this ->nvlsConnections_ = setupNvlsConnections (comm, nvlsBufferSize_, nSwitchChannels_);
183185}
184186
185- CommResult AllreduceNvlsWithCopy2 ::allreduceKernelFunc (const std::shared_ptr<void > ctx_void, const void * input,
186- void * output, size_t inputSize, DataType dtype, ReduceOp op,
187- cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
188- const std::unordered_map<std::string, uintptr_t >&) {
187+ CommResult AllreduceNvlsBlockPipeline ::allreduceKernelFunc (const std::shared_ptr<void > ctx_void, const void * input,
188+ void * output, size_t inputSize, DataType dtype, ReduceOp op,
189+ cudaStream_t stream, int nBlocks, int nThreadsPerBlock,
190+ const std::unordered_map<std::string, uintptr_t >&) {
189191 auto ctx = std::static_pointer_cast<AlgorithmCtx>(ctx_void);
190- AllreduceFunc allreduce = dispatch<NvlsWithCopy2Adapter >(op, dtype);
192+ AllreduceFunc allreduce = dispatch<NvlsBlockPipelineAdapter >(op, dtype);
191193 if (!allreduce) {
192194 WARN (" Unsupported operation or data type for allreduce, dtype=%d" , static_cast <int >(dtype));
193195 return CommResult::CommInvalidArgument;
@@ -201,35 +203,35 @@ CommResult AllreduceNvlsWithCopy2::allreduceKernelFunc(const std::shared_ptr<voi
201203 ctx->rank , ctx->nRanksPerNode , ctx->workSize , inputSize, stream, nullptr , 0 , 0 ,
202204 blockAndThreadNum.first , blockAndThreadNum.second );
203205 if (error != cudaSuccess) {
204- WARN (" AllreduceNvlsWithCopy failed with error: %s" , cudaGetErrorString (error));
206+ WARN (" AllreduceNvlsBlockPipeline failed with error: %s" , cudaGetErrorString (error));
205207 return CommResult::CommUnhandledCudaError;
206208 }
207209 return CommResult::CommSuccess;
208210}
209211
210- AlgorithmCtxKey AllreduceNvlsWithCopy2 ::generateAllreduceContextKey (const void *, void *, size_t , DataType, bool ) {
212+ AlgorithmCtxKey AllreduceNvlsBlockPipeline ::generateAllreduceContextKey (const void *, void *, size_t , DataType, bool ) {
211213 return AlgorithmCtxKey{nullptr , nullptr , 0 , 0 , 0 };
212214}
213215
214- std::shared_ptr<void > AllreduceNvlsWithCopy2 ::initAllreduceContext (std::shared_ptr<Communicator> comm, const void * ,
215- void *, size_t , DataType) {
216+ std::shared_ptr<void > AllreduceNvlsBlockPipeline ::initAllreduceContext (std::shared_ptr<Communicator> comm,
217+ const void *, void *, size_t , DataType) {
216218 auto ctx = std::make_shared<AlgorithmCtx>();
217219 ctx->rank = comm->bootstrap ()->getRank ();
218220 ctx->workSize = comm->bootstrap ()->getNranks ();
219221 ctx->nRanksPerNode = comm->bootstrap ()->getNranksPerNode ();
220222
221223 // setup channels
222- ctx->nvlsConnections = setupNvlsConnections (comm, nvlsBufferSize_, nSwitchChannels_);
223224 ctx->switchChannels =
224- setupNvlsChannels (ctx-> nvlsConnections , this ->scratchBuffer_ , scratchBufferSize_, nSwitchChannels_);
225+ setupNvlsChannels (this -> nvlsConnections_ , this ->scratchBuffer_ , scratchBufferSize_, nSwitchChannels_);
225226 ctx->switchChannelDeviceHandles = setupNvlsChannelDeviceHandles (ctx->switchChannels );
226227 return ctx;
227228}
228229
229- std::shared_ptr<Algorithm> AllreduceNvlsWithCopy2::build () {
230- auto self = std::make_shared<AllreduceNvlsWithCopy2>(reinterpret_cast <uintptr_t >(scratchBuffer_), scratchBufferSize_);
230+ std::shared_ptr<Algorithm> AllreduceNvlsBlockPipeline::build () {
231+ auto self =
232+ std::make_shared<AllreduceNvlsBlockPipeline>(reinterpret_cast <uintptr_t >(scratchBuffer_), scratchBufferSize_);
231233 return std::make_shared<NativeAlgorithm>(
232- " default_allreduce_nvls_with_copy2 " , " allreduce" ,
234+ " default_allreduce_nvls_block_pipeline " , " allreduce" ,
233235 [self](std::shared_ptr<Communicator> comm) { self->initialize (comm); },
234236 [self](const std::shared_ptr<void > ctx, const void * input, void * output, size_t inputSize,
235237 [[maybe_unused]] size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks,
@@ -247,4 +249,4 @@ std::shared_ptr<Algorithm> AllreduceNvlsWithCopy2::build() {
247249}
248250
249251} // namespace collective
250- } // namespace mscclpp
252+ } // namespace mscclpp
0 commit comments