Skip to content

Commit fff329f

Browse files
committed
Merge branch 'develop' into tabs/fix/py
2 parents 48434d4 + f778470 commit fff329f

File tree

190 files changed

+4752
-3063
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

190 files changed

+4752
-3063
lines changed

CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,9 @@ if(WITH_DISTRIBUTE)
485485
ON
486486
CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
487487
endif()
488+
set(WITH_MPI
489+
ON
490+
CACHE STRING "Enable MPI when compiling WITH_DISTRIBUTE=ON." FORCE)
488491
if(WITH_ASCEND_CL AND NOT WITH_ARM_BRPC)
489492
# disable WITH_PSCORE for NPU before include third_party
490493
message(
@@ -509,6 +512,10 @@ if(WITH_DISTRIBUTE)
509512
endif()
510513
endif()
511514

515+
if(WITH_MPI)
516+
include(mpi)
517+
endif()
518+
512519
include(third_party
513520
)# download, build, install third_party, Contains about 20+ dependencies
514521

cmake/experiments/cuda_module_loading_lazy.cmake

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
1+
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -16,16 +16,15 @@
1616
# cuda module lazy loading is supported by CUDA 11.7+
1717
# this experiment option makes Paddle supports lazy loading before CUDA 11.7.
1818

19-
option(EXP_CUDA_MODULE_LOADING_LAZY "enable lazy cuda module loading" OFF)
20-
if(${EXP_CUDA_MODULE_LOADING_LAZY})
21-
if(NOT ${ON_INFER} OR NOT ${LINUX})
19+
if(LINUX)
20+
if(NOT ON_INFER)
2221
message(
2322
"EXP_CUDA_MODULE_LOADING_LAZY only works with ON_INFER=ON on Linux platforms"
2423
)
2524
return()
2625
endif()
27-
if(NOT ${CUDA_FOUND})
28-
message("EXP_CUDA_MODULE_LOADING_LAZY only works with CUDA")
26+
if(NOT WITH_GPU)
27+
message("EXP_CUDA_MODULE_LOADING_LAZY only works with GPU")
2928
return()
3029
endif()
3130
if(${CUDA_VERSION} VERSION_GREATER_EQUAL "11.7")
@@ -39,8 +38,13 @@ if(${EXP_CUDA_MODULE_LOADING_LAZY})
3938
set(CUDA_USE_STATIC_CUDA_RUNTIME
4039
OFF
4140
CACHE BOOL "" FORCE)
42-
set(CMAKE_CUDA_FLAGS "--cudart shared")
4341
enable_language(CUDA)
42+
execute_process(
43+
COMMAND "rm" "-rf" "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy"
44+
COMMAND "chmod" "755" "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy.sh"
45+
COMMAND "bash" "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy.sh"
46+
"${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" "${CUDA_TOOLKIT_ROOT_DIR}")
47+
execute_process(COMMAND "chmod" "755" "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy")
4448
set(CUDA_NVCC_EXECUTABLE
4549
"${CMAKE_SOURCE_DIR}/tools/nvcc_lazy"
4650
CACHE FILEPATH "" FORCE)

cmake/mpi.cmake

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
if(NOT WITH_DISTRIBUTE OR NOT WITH_MPI)
2+
return()
3+
endif()
4+
5+
find_package(MPI)
6+
7+
if(NOT MPI_CXX_FOUND)
8+
set(WITH_MPI
9+
OFF
10+
CACHE STRING "Disable MPI" FORCE)
11+
message(WARNING "Not found MPI support in current system")
12+
return()
13+
endif()
14+
15+
message(STATUS "MPI compile flags: " ${MPI_CXX_COMPILE_FLAGS})
16+
message(STATUS "MPI include path: " ${MPI_CXX_INCLUDE_PATH})
17+
message(STATUS "MPI LINK flags path: " ${MPI_CXX_LINK_FLAGS})
18+
message(STATUS "MPI libraries: " ${MPI_CXX_LIBRARIES})
19+
include_directories(SYSTEM ${MPI_CXX_INCLUDE_PATH})
20+
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_CXX_LINK_FLAGS}")
21+
add_definitions("-DPADDLE_WITH_MPI")
22+
find_program(
23+
OMPI_INFO
24+
NAMES ompi_info
25+
HINTS ${MPI_CXX_LIBRARIES}/../bin)
26+
27+
if(OMPI_INFO)
28+
execute_process(COMMAND ${OMPI_INFO} OUTPUT_VARIABLE output_)
29+
if(output_ MATCHES "smcuda")
30+
# NOTE: some MPI libraries support CUDA-aware communication.
31+
add_definitions("-DPADDLE_WITH_MPI_AWARE")
32+
endif()
33+
endif()

cmake/operators.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -510,7 +510,7 @@ function(op_library TARGET)
510510
if(WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
511511
# Append first implemented MKLDNN activation operator
512512
if(${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
513-
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(gelu, MKLDNN);\n")
513+
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(softplus, MKLDNN);\n")
514514
elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op")
515515
file(APPEND ${pybind_file}
516516
"USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n")

paddle/fluid/distributed/collective/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,13 @@ if(WITH_NCCL OR WITH_RCCL)
4343
endif()
4444
endif()
4545

46+
if(WITH_MPI)
47+
cc_library(
48+
processgroup_mpi
49+
SRCS ProcessGroupMPI.cc MPITools.cc Common.cc
50+
DEPS collective_helper device_context)
51+
endif()
52+
4653
if(WITH_ASCEND_CL)
4754
cc_library(
4855
processgroup_hccl
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "paddle/fluid/distributed/collective/MPITools.h"
16+
#include "paddle/fluid/distributed/collective/Common.h"
17+
#include "paddle/fluid/distributed/collective/Types.h"
18+
19+
namespace paddle {
20+
namespace distributed {
21+
namespace mpi {
22+
23+
MPI_Op ToMPIType(ReduceOp reduction) {
24+
static const std::map<ReduceOp, MPI_Op> red_type = {
25+
{ReduceOp::MIN, MPI_MIN},
26+
{ReduceOp::MAX, MPI_MAX},
27+
{ReduceOp::SUM, MPI_SUM},
28+
{ReduceOp::PRODUCT, MPI_PROD},
29+
};
30+
auto it = red_type.find(reduction);
31+
PADDLE_ENFORCE_EQ(it != red_type.end(),
32+
true,
33+
platform::errors::InvalidArgument(
34+
"Invalid mpi reduction. Must be MPI_MIN | MPI_MAX | "
35+
"MPI_PROD | MPI_SUM."));
36+
return it->second;
37+
}
38+
39+
// NOTE: MPI does not support CUDA-aware communication for now.
40+
bool CheckMpiCudaAware() { return false; }
41+
42+
void CheckValidInputs(const std::vector<phi::DenseTensor>& tensors) {
43+
PADDLE_ENFORCE_EQ(
44+
tensors.size() == 1,
45+
true,
46+
platform::errors::InvalidArgument("the inputs size of MPI must be 1!"));
47+
48+
PADDLE_ENFORCE_EQ(CheckTensorsInCudaPlace(tensors) && !CheckMpiCudaAware(),
49+
false,
50+
platform::errors::InvalidArgument(
51+
"Found CUDA Tensor. But CUDA-aware MPI not support!"));
52+
}
53+
54+
} // namespace mpi
55+
} // namespace distributed
56+
} // namespace paddle
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#pragma once
16+
#include <error.h>
17+
#include <iostream>
18+
#include <string>
19+
#include "paddle/fluid/framework/data_type.h"
20+
#include "paddle/fluid/framework/variable.h"
21+
#include "paddle/fluid/platform/enforce.h"
22+
23+
#include "paddle/fluid/distributed/collective/Types.h"
24+
25+
#ifdef HOST
26+
#undef HOST
27+
#endif
28+
29+
#include <mpi.h>
30+
31+
namespace paddle {
32+
namespace distributed {
33+
namespace mpi {
34+
35+
#define MPI_CHECK(cmd) \
36+
do { \
37+
int r = cmd; \
38+
if (r != MPI_SUCCESS) { \
39+
LOG(FATAL) << "Failed, MPI error in" << __FILE__ << ":" << __LINE__ \
40+
<< "with error code: " << std::to_string(r) << std::endl; \
41+
exit(EXIT_FAILURE); \
42+
} \
43+
} while (0)
44+
45+
MPI_Op ToMPIType(ReduceOp reduction);
46+
47+
bool CheckMpiCudaAware();
48+
49+
void CheckValidInputs(const std::vector<phi::DenseTensor>& tensors);
50+
51+
} // namespace mpi
52+
} // namespace distributed
53+
} // namespace paddle

paddle/fluid/distributed/collective/ProcessGroup.cc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,5 +52,13 @@ ProcessGroup::ProcessGroup(int rank,
5252
}
5353
}
5454

55+
ProcessGroup::ProcessGroup(int rank, int size, int gid)
56+
: rank_(rank), size_(size), gid_(gid) {
57+
if (gid != IGNORE_ID) {
58+
auto map = ProcessGroupMapFromGid::getInstance();
59+
map->insert(gid_, this);
60+
}
61+
}
62+
5563
} // namespace distributed
5664
} // namespace paddle

paddle/fluid/distributed/collective/ProcessGroup.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,9 @@ class ProcessGroup {
8282
int size,
8383
const platform::Place& place,
8484
int gid);
85+
86+
explicit ProcessGroup(int rank, int size, int gid);
87+
8588
virtual ~ProcessGroup() {}
8689

8790
int GetRank() const { return rank_; }

0 commit comments

Comments
 (0)