Skip to content

Commit fb39d22

Browse files
committed
Merge branch 'develop' into E711/fix
2 parents 4315d30 + 5a2ab68 commit fb39d22

365 files changed

Lines changed: 7391 additions & 4863 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.pre-commit-config.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,3 +111,14 @@ repos:
111111
hooks:
112112
- id: cmakelint
113113
args: [--config=./tools/codestyle/.cmakelintrc]
114+
115+
- repo: https://github.com/PyCQA/autoflake
116+
rev: v1.7.7
117+
hooks:
118+
- id: autoflake
119+
args:
120+
- --in-place
121+
- --remove-all-unused-imports
122+
- --ignore-pass-after-docstring
123+
- --ignore-init-module-imports
124+
- --exclude=python/paddle/fluid/[!t]**,python/paddle/fluid/tra**

cmake/external/cutlass.cmake

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
include(ExternalProject)
16+
17+
set(CUTLASS_PREFIX_DIR ${THIRD_PARTY_PATH}/cutlass)
18+
19+
set(CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git)
20+
set(CUTLASS_TAG v2.9.1)
21+
22+
include_directories("${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/")
23+
include_directories("${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/include/")
24+
include_directories(
25+
"${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/tools/util/include/")
26+
27+
add_definitions("-DPADDLE_WITH_CUTLASS")
28+
29+
ExternalProject_Add(
30+
extern_cutlass
31+
${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
32+
GIT_REPOSITORY ${CUTLASS_REPOSITORY}
33+
GIT_TAG "${CUTLASS_TAG}"
34+
PREFIX ${CUTLASS_PREFIX_DIR}
35+
UPDATE_COMMAND ""
36+
CONFIGURE_COMMAND ""
37+
BUILD_COMMAND ""
38+
INSTALL_COMMAND ""
39+
TEST_COMMAND "")
40+
41+
add_library(cutlass INTERFACE)
42+
43+
add_dependencies(cutlass extern_cutlass)

cmake/flags.cmake

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,6 @@ if(NOT WIN32)
149149
-Wno-unused-parameter
150150
-Wno-unused-function
151151
-Wno-error=literal-suffix
152-
-Wno-error=unused-local-typedefs
153152
-Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3
154153
-Wno-error=terminate # Warning in PADDLE_ENFORCE
155154
-Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2

cmake/third_party.cmake

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,4 +505,14 @@ if(WITH_CUSPARSELT)
505505
list(APPEND third_party_deps extern_cusparselt)
506506
endif()
507507

508+
if(WITH_GPU
509+
AND NOT WITH_ARM
510+
AND NOT WIN32
511+
AND NOT APPLE)
512+
if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0)
513+
include(external/cutlass) # download, build, install cutlass
514+
list(APPEND third_party_deps extern_cutlass)
515+
endif()
516+
endif()
517+
508518
add_custom_target(third_party ALL DEPENDS ${third_party_deps})

paddle/fluid/distributed/auto_parallel/dist_attr.cc

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -319,7 +319,7 @@ bool operator==(const TensorDistAttr& lhs, const TensorDistAttr& rhs) {
319319
}
320320

321321
std::vector<std::string> OperatorDistAttr::fields_{
322-
"process_mesh", "impl_type", "impl_idx"};
322+
"process_mesh", "impl_type", "impl_idx", "execution_stream"};
323323

324324
OperatorDistAttr::OperatorDistAttr(const OpDesc& op) : op_(&op) {
325325
VLOG(4) << "[OperatorDistAttr constructor] op type: " << op_->Type();
@@ -376,8 +376,9 @@ void OperatorDistAttr::initialize() {
376376
output_dist_attrs_[name] = TensorDistAttr(*output);
377377
}
378378
}
379-
impl_type_ = "default";
379+
impl_type_ = kDefault;
380380
impl_idx_ = 0;
381+
execution_stream_ = kDefault;
381382
}
382383

383384
void OperatorDistAttr::copy_from(const OperatorDistAttr& dist_attr) {
@@ -386,9 +387,8 @@ void OperatorDistAttr::copy_from(const OperatorDistAttr& dist_attr) {
386387
set_process_mesh(dist_attr.process_mesh());
387388
set_impl_type(dist_attr.impl_type());
388389
set_impl_idx(dist_attr.impl_idx());
390+
set_execution_stream(dist_attr.execution_stream());
389391
set_annotated(dist_attr.annotated());
390-
impl_type_ = dist_attr.impl_type();
391-
impl_idx_ = dist_attr.impl_idx();
392392
}
393393

394394
void OperatorDistAttr::set_input_dist_attrs(
@@ -666,6 +666,7 @@ std::string OperatorDistAttr::to_string() const {
666666
}
667667
str += "impl_type: " + impl_type_ + ", ";
668668
str += "impl_idx: " + std::to_string(impl_idx_) + ", ";
669+
str += "execution_stream: " + execution_stream_ + ", ";
669670
str += "annotated: [" + str_join(annotated_) + "], ";
670671
str += "\nprocess_mesh: " + process_mesh_.to_string() + ", ";
671672
str += "\ninput_dist_attrs: [\n";
@@ -747,6 +748,9 @@ bool operator==(const OperatorDistAttr& lhs, const OperatorDistAttr& rhs) {
747748
if (lhs.impl_idx() != rhs.impl_idx()) {
748749
return false;
749750
}
751+
if (lhs.execution_stream() != rhs.execution_stream()) {
752+
return false;
753+
}
750754
for (auto const& item : lhs.input_dist_attrs()) {
751755
if (rhs.input_dist_attrs().count(item.first) != 1) {
752756
return false;

paddle/fluid/distributed/auto_parallel/dist_attr.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ using framework::OpDesc;
4646
using framework::ProgramDesc;
4747
using framework::VarDesc;
4848

49+
constexpr const char* kDefault = "default";
50+
4951
class TensorDistAttr {
5052
public:
5153
TensorDistAttr() = default;
@@ -205,6 +207,12 @@ class OperatorDistAttr {
205207

206208
void set_impl_idx(const int64_t& impl_idx) { impl_idx_ = impl_idx; }
207209

210+
const std::string& execution_stream() const { return execution_stream_; }
211+
212+
void set_execution_stream(const std::string& execution_stream) {
213+
execution_stream_ = execution_stream;
214+
}
215+
208216
const std::map<std::string, bool>& annotated() const { return annotated_; }
209217

210218
void set_annotated(const std::map<std::string, bool>& annotated);
@@ -262,6 +270,7 @@ class OperatorDistAttr {
262270
ProcessMesh process_mesh_;
263271
std::string impl_type_;
264272
int64_t impl_idx_ = -1;
273+
std::string execution_stream_;
265274
std::map<std::string, bool> annotated_;
266275
};
267276

paddle/fluid/distributed/collective/ProcessGroupNCCL.cc

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -453,7 +453,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
453453

454454
platform::CUDADeviceGuard cuda_guard;
455455

456-
if (FLAGS_use_stream_safe_cuda_allocator) {
456+
{
457+
platform::NCCLGroupGuard nccl_guard;
457458
for (size_t i = 0; i < tensors.size(); ++i) {
458459
cuda_guard.SetDevice(places[i]);
459460
gpuStream_t nccl_stream;
@@ -465,12 +466,11 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
465466
} else {
466467
nccl_stream = places_to_ctx_[key][i]->stream();
467468
}
468-
memory::RecordStream(tensors[i].Holder(), nccl_stream);
469+
fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
469470
}
470471
}
471472

472-
{
473-
platform::NCCLGroupGuard nccl_guard;
473+
if (FLAGS_use_stream_safe_cuda_allocator) {
474474
for (size_t i = 0; i < tensors.size(); ++i) {
475475
cuda_guard.SetDevice(places[i]);
476476
gpuStream_t nccl_stream;
@@ -482,7 +482,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
482482
} else {
483483
nccl_stream = places_to_ctx_[key][i]->stream();
484484
}
485-
fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
485+
memory::RecordStream(tensors[i].Holder(), nccl_stream);
486486
}
487487
}
488488

@@ -521,20 +521,20 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
521521
// construct uninitialize guard for device
522522
platform::CUDADeviceGuard cuda_guard;
523523

524-
if (FLAGS_use_stream_safe_cuda_allocator) {
524+
{
525+
platform::NCCLGroupGuard nccl_guard;
525526
for (size_t i = 0; i < tensors.size(); ++i) {
526527
cuda_guard.SetDevice(places[i]);
527-
memory::RecordStream(tensors[i].Holder(),
528-
places_to_ctx_[key][i]->stream());
528+
const auto& nccl_stream = places_to_ctx_[key][i]->stream();
529+
fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
529530
}
530531
}
531532

532-
{
533-
platform::NCCLGroupGuard nccl_guard;
533+
if (FLAGS_use_stream_safe_cuda_allocator) {
534534
for (size_t i = 0; i < tensors.size(); ++i) {
535535
cuda_guard.SetDevice(places[i]);
536-
const auto& nccl_stream = places_to_ctx_[key][i]->stream();
537-
fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
536+
memory::RecordStream(tensors[i].Holder(),
537+
places_to_ctx_[key][i]->stream());
538538
}
539539
}
540540

paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,7 @@ paddle::experimental::Tensor conv2d_ad_func(
2424
const paddle::experimental::Tensor& filter,
2525
std::vector<int> strides,
2626
std::vector<int> paddings,
27-
std::string paddding_algorithm,
28-
int groups,
27+
std::string padding_algorithm,
2928
std::vector<int> dilations,
30-
std::string data_format,
31-
bool use_addto,
32-
int workspace_size_MB,
33-
bool exhaustive_search);
29+
int groups,
30+
std::string data_format);

paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc

Lines changed: 13 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,10 @@ paddle::experimental::Tensor conv2d_ad_func(
2929
const paddle::experimental::Tensor& filter,
3030
std::vector<int> strides,
3131
std::vector<int> paddings,
32-
std::string paddding_algorithm,
33-
int groups,
32+
std::string padding_algorithm,
3433
std::vector<int> dilations,
35-
std::string data_format,
36-
bool use_addto,
37-
int workspace_size_MB,
38-
bool exhaustive_search) {
34+
int groups,
35+
std::string data_format) {
3936
// Dygraph Record Event
4037
paddle::platform::RecordEvent dygraph_entrance_record_event(
4138
"conv2d dygraph", paddle::platform::TracerEventType::Operator, 1);
@@ -64,13 +61,10 @@ paddle::experimental::Tensor conv2d_ad_func(
6461
new_filter,
6562
strides,
6663
paddings,
67-
paddding_algorithm,
68-
groups,
64+
padding_algorithm,
6965
dilations,
70-
data_format,
71-
use_addto,
72-
workspace_size_MB,
73-
exhaustive_search);
66+
groups,
67+
data_format);
7468
}
7569
}
7670

@@ -92,13 +86,10 @@ paddle::experimental::Tensor conv2d_ad_func(
9286
filter,
9387
strides,
9488
paddings,
95-
paddding_algorithm,
96-
groups,
89+
padding_algorithm,
9790
dilations,
98-
data_format,
99-
use_addto,
100-
workspace_size_MB,
101-
exhaustive_search);
91+
groups,
92+
data_format);
10293
transformer->SetOutTensorLayout(&out);
10394
if (need_tune) {
10495
egr::Controller::Instance().EnableLayoutAutoTune();
@@ -119,13 +110,10 @@ paddle::experimental::Tensor conv2d_ad_func(
119110
filter,
120111
strides,
121112
paddings,
122-
paddding_algorithm,
123-
groups,
113+
padding_algorithm,
124114
dilations,
125-
data_format,
126-
use_addto,
127-
workspace_size_MB,
128-
exhaustive_search);
115+
groups,
116+
data_format);
129117
// Check NaN and Inf if needed
130118
if (FLAGS_check_nan_inf) {
131119
egr::CheckTensorHasNanOrInf("conv2d", api_result);
@@ -157,13 +145,10 @@ paddle::experimental::Tensor conv2d_ad_func(
157145
// SetAttributes if needed
158146
grad_node->SetAttributestrides(strides);
159147
grad_node->SetAttributepaddings(paddings);
160-
grad_node->SetAttributepaddding_algorithm(paddding_algorithm);
148+
grad_node->SetAttributepadding_algorithm(padding_algorithm);
161149
grad_node->SetAttributegroups(groups);
162150
grad_node->SetAttributedilations(dilations);
163151
grad_node->SetAttributedata_format(data_format);
164-
grad_node->SetAttributeuse_addto(use_addto);
165-
grad_node->SetAttributeworkspace_size_MB(workspace_size_MB);
166-
grad_node->SetAttributeexhaustive_search(exhaustive_search);
167152
// Set TensorWrappers for Forward Inputs if needed
168153
grad_node->SetTensorWrapperinput(input);
169154
grad_node->SetTensorWrapperfilter(filter);

paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc

Lines changed: 7 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,10 @@ Conv2dGradNodeFinal::operator()(
4646
auto& grad_out = hooked_grads[0][0];
4747
auto& strides = this->strides_;
4848
auto& paddings = this->paddings_;
49-
auto& paddding_algorithm = this->paddding_algorithm_;
49+
auto& padding_algorithm = this->padding_algorithm_;
5050
auto& groups = this->groups_;
5151
auto& dilations = this->dilations_;
5252
auto& data_format = this->data_format_;
53-
auto& use_addto = this->use_addto_;
54-
auto& workspace_size_MB = this->workspace_size_MB_;
55-
auto& exhaustive_search = this->exhaustive_search_;
5653
// Prepare Grad function call
5754

5855
const auto& out_metas = OutputMeta();
@@ -87,13 +84,10 @@ Conv2dGradNodeFinal::operator()(
8784
grad_out,
8885
strides,
8986
paddings,
90-
paddding_algorithm,
91-
groups,
87+
padding_algorithm,
9288
dilations,
89+
groups,
9390
data_format,
94-
use_addto,
95-
workspace_size_MB,
96-
exhaustive_search,
9791
api_output_0,
9892
api_output_1);
9993
// Check NaN and Inf id needed
@@ -134,13 +128,10 @@ Conv2dGradNodeFinal::operator()(
134128
// SetAttributes if needed
135129
grad_node->SetAttributestrides(strides);
136130
grad_node->SetAttributepaddings(paddings);
137-
grad_node->SetAttributepaddding_algorithm(paddding_algorithm);
131+
grad_node->SetAttributepadding_algorithm(padding_algorithm);
138132
grad_node->SetAttributegroups(groups);
139133
grad_node->SetAttributedilations(dilations);
140134
grad_node->SetAttributedata_format(data_format);
141-
grad_node->SetAttributeuse_addto(use_addto);
142-
grad_node->SetAttributeworkspace_size_MB(workspace_size_MB);
143-
grad_node->SetAttributeexhaustive_search(exhaustive_search);
144135
// Set TensorWrappers for Forward Inputs if needed
145136
grad_node->SetTensorWrapperinput(input);
146137
grad_node->SetTensorWrapperfilter(filter);
@@ -215,13 +206,10 @@ Conv2dDoubleGradNodeFinal::operator()(
215206

216207
auto& strides = this->strides_;
217208
auto& paddings = this->paddings_;
218-
auto& paddding_algorithm = this->paddding_algorithm_;
209+
auto& padding_algorithm = this->padding_algorithm_;
219210
auto& groups = this->groups_;
220211
auto& dilations = this->dilations_;
221212
auto& data_format = this->data_format_;
222-
auto& use_addto = this->use_addto_;
223-
auto& workspace_size_MB = this->workspace_size_MB_;
224-
auto& exhaustive_search = this->exhaustive_search_;
225213
// Prepare Grad function call
226214

227215
const auto& out_metas = OutputMeta();
@@ -261,13 +249,10 @@ Conv2dDoubleGradNodeFinal::operator()(
261249
grad_filter_grad_optional,
262250
strides,
263251
paddings,
264-
paddding_algorithm,
265-
groups,
252+
padding_algorithm,
266253
dilations,
254+
groups,
267255
data_format,
268-
use_addto,
269-
workspace_size_MB,
270-
exhaustive_search,
271256
api_output_0,
272257
api_output_1,
273258
api_output_2);

0 commit comments

Comments
 (0)