Skip to content

Commit fb39d22

Browse files
committed
Merge branch 'develop' into E711/fix
2 parents 4315d30 + 5a2ab68 commit fb39d22

365 files changed

Lines changed: 7391 additions & 4863 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.pre-commit-config.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,3 +111,14 @@ repos:
111111
hooks:
112112
- id: cmakelint
113113
args: [--config=./tools/codestyle/.cmakelintrc]
114+
115+
- repo: https://github.com/PyCQA/autoflake
116+
rev: v1.7.7
117+
hooks:
118+
- id: autoflake
119+
args:
120+
- --in-place
121+
- --remove-all-unused-imports
122+
- --ignore-pass-after-docstring
123+
- --ignore-init-module-imports
124+
- --exclude=python/paddle/fluid/[!t]**,python/paddle/fluid/tra**

cmake/external/cutlass.cmake

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
include(ExternalProject)
16+
17+
set(CUTLASS_PREFIX_DIR ${THIRD_PARTY_PATH}/cutlass)
18+
19+
set(CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git)
20+
set(CUTLASS_TAG v2.9.1)
21+
22+
include_directories("${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/")
23+
include_directories("${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/include/")
24+
include_directories(
25+
"${THIRD_PARTY_PATH}/cutlass/src/extern_cutlass/tools/util/include/")
26+
27+
add_definitions("-DPADDLE_WITH_CUTLASS")
28+
29+
ExternalProject_Add(
30+
extern_cutlass
31+
${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
32+
GIT_REPOSITORY ${CUTLASS_REPOSITORY}
33+
GIT_TAG "${CUTLASS_TAG}"
34+
PREFIX ${CUTLASS_PREFIX_DIR}
35+
UPDATE_COMMAND ""
36+
CONFIGURE_COMMAND ""
37+
BUILD_COMMAND ""
38+
INSTALL_COMMAND ""
39+
TEST_COMMAND "")
40+
41+
add_library(cutlass INTERFACE)
42+
43+
add_dependencies(cutlass extern_cutlass)

cmake/flags.cmake

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,6 @@ if(NOT WIN32)
149149
-Wno-unused-parameter
150150
-Wno-unused-function
151151
-Wno-error=literal-suffix
152-
-Wno-error=unused-local-typedefs
153152
-Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3
154153
-Wno-error=terminate # Warning in PADDLE_ENFORCE
155154
-Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2

cmake/third_party.cmake

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,4 +505,14 @@ if(WITH_CUSPARSELT)
505505
list(APPEND third_party_deps extern_cusparselt)
506506
endif()
507507

508+
if(WITH_GPU
509+
AND NOT WITH_ARM
510+
AND NOT WIN32
511+
AND NOT APPLE)
512+
if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0)
513+
include(external/cutlass) # download, build, install cutlass
514+
list(APPEND third_party_deps extern_cutlass)
515+
endif()
516+
endif()
517+
508518
add_custom_target(third_party ALL DEPENDS ${third_party_deps})

paddle/fluid/distributed/auto_parallel/dist_attr.cc

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -319,7 +319,7 @@ bool operator==(const TensorDistAttr& lhs, const TensorDistAttr& rhs) {
319319
}
320320

321321
std::vector<std::string> OperatorDistAttr::fields_{
322-
"process_mesh", "impl_type", "impl_idx"};
322+
"process_mesh", "impl_type", "impl_idx", "execution_stream"};
323323

324324
OperatorDistAttr::OperatorDistAttr(const OpDesc& op) : op_(&op) {
325325
VLOG(4) << "[OperatorDistAttr constructor] op type: " << op_->Type();
@@ -376,8 +376,9 @@ void OperatorDistAttr::initialize() {
376376
output_dist_attrs_[name] = TensorDistAttr(*output);
377377
}
378378
}
379-
impl_type_ = "default";
379+
impl_type_ = kDefault;
380380
impl_idx_ = 0;
381+
execution_stream_ = kDefault;
381382
}
382383

383384
void OperatorDistAttr::copy_from(const OperatorDistAttr& dist_attr) {
@@ -386,9 +387,8 @@ void OperatorDistAttr::copy_from(const OperatorDistAttr& dist_attr) {
386387
set_process_mesh(dist_attr.process_mesh());
387388
set_impl_type(dist_attr.impl_type());
388389
set_impl_idx(dist_attr.impl_idx());
390+
set_execution_stream(dist_attr.execution_stream());
389391
set_annotated(dist_attr.annotated());
390-
impl_type_ = dist_attr.impl_type();
391-
impl_idx_ = dist_attr.impl_idx();
392392
}
393393

394394
void OperatorDistAttr::set_input_dist_attrs(
@@ -666,6 +666,7 @@ std::string OperatorDistAttr::to_string() const {
666666
}
667667
str += "impl_type: " + impl_type_ + ", ";
668668
str += "impl_idx: " + std::to_string(impl_idx_) + ", ";
669+
str += "execution_stream: " + execution_stream_ + ", ";
669670
str += "annotated: [" + str_join(annotated_) + "], ";
670671
str += "\nprocess_mesh: " + process_mesh_.to_string() + ", ";
671672
str += "\ninput_dist_attrs: [\n";
@@ -747,6 +748,9 @@ bool operator==(const OperatorDistAttr& lhs, const OperatorDistAttr& rhs) {
747748
if (lhs.impl_idx() != rhs.impl_idx()) {
748749
return false;
749750
}
751+
if (lhs.execution_stream() != rhs.execution_stream()) {
752+
return false;
753+
}
750754
for (auto const& item : lhs.input_dist_attrs()) {
751755
if (rhs.input_dist_attrs().count(item.first) != 1) {
752756
return false;

paddle/fluid/distributed/auto_parallel/dist_attr.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ using framework::OpDesc;
4646
using framework::ProgramDesc;
4747
using framework::VarDesc;
4848

49+
constexpr const char* kDefault = "default";
50+
4951
class TensorDistAttr {
5052
public:
5153
TensorDistAttr() = default;
@@ -205,6 +207,12 @@ class OperatorDistAttr {
205207

206208
void set_impl_idx(const int64_t& impl_idx) { impl_idx_ = impl_idx; }
207209

210+
const std::string& execution_stream() const { return execution_stream_; }
211+
212+
void set_execution_stream(const std::string& execution_stream) {
213+
execution_stream_ = execution_stream;
214+
}
215+
208216
const std::map<std::string, bool>& annotated() const { return annotated_; }
209217

210218
void set_annotated(const std::map<std::string, bool>& annotated);
@@ -262,6 +270,7 @@ class OperatorDistAttr {
262270
ProcessMesh process_mesh_;
263271
std::string impl_type_;
264272
int64_t impl_idx_ = -1;
273+
std::string execution_stream_;
265274
std::map<std::string, bool> annotated_;
266275
};
267276

paddle/fluid/distributed/collective/ProcessGroupNCCL.cc

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -453,7 +453,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
453453

454454
platform::CUDADeviceGuard cuda_guard;
455455

456-
if (FLAGS_use_stream_safe_cuda_allocator) {
456+
{
457+
platform::NCCLGroupGuard nccl_guard;
457458
for (size_t i = 0; i < tensors.size(); ++i) {
458459
cuda_guard.SetDevice(places[i]);
459460
gpuStream_t nccl_stream;
@@ -465,12 +466,11 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
465466
} else {
466467
nccl_stream = places_to_ctx_[key][i]->stream();
467468
}
468-
memory::RecordStream(tensors[i].Holder(), nccl_stream);
469+
fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
469470
}
470471
}
471472

472-
{
473-
platform::NCCLGroupGuard nccl_guard;
473+
if (FLAGS_use_stream_safe_cuda_allocator) {
474474
for (size_t i = 0; i < tensors.size(); ++i) {
475475
cuda_guard.SetDevice(places[i]);
476476
gpuStream_t nccl_stream;
@@ -482,7 +482,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
482482
} else {
483483
nccl_stream = places_to_ctx_[key][i]->stream();
484484
}
485-
fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
485+
memory::RecordStream(tensors[i].Holder(), nccl_stream);
486486
}
487487
}
488488

@@ -521,20 +521,20 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
521521
// construct uninitialize guard for device
522522
platform::CUDADeviceGuard cuda_guard;
523523

524-
if (FLAGS_use_stream_safe_cuda_allocator) {
524+
{
525+
platform::NCCLGroupGuard nccl_guard;
525526
for (size_t i = 0; i < tensors.size(); ++i) {
526527
cuda_guard.SetDevice(places[i]);
527-
memory::RecordStream(tensors[i].Holder(),
528-
places_to_ctx_[key][i]->stream());
528+
const auto& nccl_stream = places_to_ctx_[key][i]->stream();
529+
fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
529530
}
530531
}
531532

532-
{
533-
platform::NCCLGroupGuard nccl_guard;
533+
if (FLAGS_use_stream_safe_cuda_allocator) {
534534
for (size_t i = 0; i < tensors.size(); ++i) {
535535
cuda_guard.SetDevice(places[i]);
536-
const auto& nccl_stream = places_to_ctx_[key][i]->stream();
537-
fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
536+
memory::RecordStream(tensors[i].Holder(),
537+
places_to_ctx_[key][i]->stream());
538538
}
539539
}
540540

paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,7 @@ paddle::experimental::Tensor conv2d_ad_func(
2424
const paddle::experimental::Tensor& filter,
2525
std::vector<int> strides,
2626
std::vector<int> paddings,
27-
std::string paddding_algorithm,
28-
int groups,
27+
std::string padding_algorithm,
2928
std::vector<int> dilations,
30-
std::string data_format,
31-
bool use_addto,
32-
int workspace_size_MB,
33-
bool exhaustive_search);
29+
int groups,
30+
std::string data_format);

paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc

Lines changed: 13 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,10 @@ paddle::experimental::Tensor conv2d_ad_func(
2929
const paddle::experimental::Tensor& filter,
3030
std::vector<int> strides,
3131
std::vector<int> paddings,
32-
std::string paddding_algorithm,
33-
int groups,
32+
std::string padding_algorithm,
3433
std::vector<int> dilations,
35-
std::string data_format,
36-
bool use_addto,
37-
int workspace_size_MB,
38-
bool exhaustive_search) {
34+
int groups,
35+
std::string data_format) {
3936
// Dygraph Record Event
4037
paddle::platform::RecordEvent dygraph_entrance_record_event(
4138
"conv2d dygraph", paddle::platform::TracerEventType::Operator, 1);
@@ -64,13 +61,10 @@ paddle::experimental::Tensor conv2d_ad_func(
6461
new_filter,
6562
strides,
6663
paddings,
67-
paddding_algorithm,
68-
groups,
64+
padding_algorithm,
6965
dilations,
70-
data_format,
71-
use_addto,
72-
workspace_size_MB,
73-
exhaustive_search);
66+
groups,
67+
data_format);
7468
}
7569
}
7670

@@ -92,13 +86,10 @@ paddle::experimental::Tensor conv2d_ad_func(
9286
filter,
9387
strides,
9488
paddings,
95-
paddding_algorithm,
96-
groups,
89+
padding_algorithm,
9790
dilations,
98-
data_format,
99-
use_addto,
100-
workspace_size_MB,
101-
exhaustive_search);
91+
groups,
92+
data_format);
10293
transformer->SetOutTensorLayout(&out);
10394
if (need_tune) {
10495
egr::Controller::Instance().EnableLayoutAutoTune();
@@ -119,13 +110,10 @@ paddle::experimental::Tensor conv2d_ad_func(
119110
filter,
120111
strides,
121112
paddings,
122-
paddding_algorithm,
123-
groups,
113+
padding_algorithm,
124114
dilations,
125-
data_format,
126-
use_addto,
127-
workspace_size_MB,
128-
exhaustive_search);
115+
groups,
116+
data_format);
129117
// Check NaN and Inf if needed
130118
if (FLAGS_check_nan_inf) {
131119
egr::CheckTensorHasNanOrInf("conv2d", api_result);
@@ -157,13 +145,10 @@ paddle::experimental::Tensor conv2d_ad_func(
157145
// SetAttributes if needed
158146
grad_node->SetAttributestrides(strides);
159147
grad_node->SetAttributepaddings(paddings);
160-
grad_node->SetAttributepaddding_algorithm(paddding_algorithm);
148+
grad_node->SetAttributepadding_algorithm(padding_algorithm);
161149
grad_node->SetAttributegroups(groups);
162150
grad_node->SetAttributedilations(dilations);
163151
grad_node->SetAttributedata_format(data_format);
164-
grad_node->SetAttributeuse_addto(use_addto);
165-
grad_node->SetAttributeworkspace_size_MB(workspace_size_MB);
166-
grad_node->SetAttributeexhaustive_search(exhaustive_search);
167152
// Set TensorWrappers for Forward Inputs if needed
168153
grad_node->SetTensorWrapperinput(input);
169154
grad_node->SetTensorWrapperfilter(filter);

paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc

Lines changed: 7 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,10 @@ Conv2dGradNodeFinal::operator()(
4646
auto& grad_out = hooked_grads[0][0];
4747
auto& strides = this->strides_;
4848
auto& paddings = this->paddings_;
49-
auto& paddding_algorithm = this->paddding_algorithm_;
49+
auto& padding_algorithm = this->padding_algorithm_;
5050
auto& groups = this->groups_;
5151
auto& dilations = this->dilations_;
5252
auto& data_format = this->data_format_;
53-
auto& use_addto = this->use_addto_;
54-
auto& workspace_size_MB = this->workspace_size_MB_;
55-
auto& exhaustive_search = this->exhaustive_search_;
5653
// Prepare Grad function call
5754

5855
const auto& out_metas = OutputMeta();
@@ -87,13 +84,10 @@ Conv2dGradNodeFinal::operator()(
8784
grad_out,
8885
strides,
8986
paddings,
90-
paddding_algorithm,
91-
groups,
87+
padding_algorithm,
9288
dilations,
89+
groups,
9390
data_format,
94-
use_addto,
95-
workspace_size_MB,
96-
exhaustive_search,
9791
api_output_0,
9892
api_output_1);
9993
// Check NaN and Inf id needed
@@ -134,13 +128,10 @@ Conv2dGradNodeFinal::operator()(
134128
// SetAttributes if needed
135129
grad_node->SetAttributestrides(strides);
136130
grad_node->SetAttributepaddings(paddings);
137-
grad_node->SetAttributepaddding_algorithm(paddding_algorithm);
131+
grad_node->SetAttributepadding_algorithm(padding_algorithm);
138132
grad_node->SetAttributegroups(groups);
139133
grad_node->SetAttributedilations(dilations);
140134
grad_node->SetAttributedata_format(data_format);
141-
grad_node->SetAttributeuse_addto(use_addto);
142-
grad_node->SetAttributeworkspace_size_MB(workspace_size_MB);
143-
grad_node->SetAttributeexhaustive_search(exhaustive_search);
144135
// Set TensorWrappers for Forward Inputs if needed
145136
grad_node->SetTensorWrapperinput(input);
146137
grad_node->SetTensorWrapperfilter(filter);
@@ -215,13 +206,10 @@ Conv2dDoubleGradNodeFinal::operator()(
215206

216207
auto& strides = this->strides_;
217208
auto& paddings = this->paddings_;
218-
auto& paddding_algorithm = this->paddding_algorithm_;
209+
auto& padding_algorithm = this->padding_algorithm_;
219210
auto& groups = this->groups_;
220211
auto& dilations = this->dilations_;
221212
auto& data_format = this->data_format_;
222-
auto& use_addto = this->use_addto_;
223-
auto& workspace_size_MB = this->workspace_size_MB_;
224-
auto& exhaustive_search = this->exhaustive_search_;
225213
// Prepare Grad function call
226214

227215
const auto& out_metas = OutputMeta();
@@ -261,13 +249,10 @@ Conv2dDoubleGradNodeFinal::operator()(
261249
grad_filter_grad_optional,
262250
strides,
263251
paddings,
264-
paddding_algorithm,
265-
groups,
252+
padding_algorithm,
266253
dilations,
254+
groups,
267255
data_format,
268-
use_addto,
269-
workspace_size_MB,
270-
exhaustive_search,
271256
api_output_0,
272257
api_output_1,
273258
api_output_2);

0 commit comments

Comments
 (0)