Skip to content

Commit bb63571

Browse files
committed
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into storage_allocation_pr
2 parents 62b1432 + 88c2cba commit bb63571

154 files changed

Lines changed: 6287 additions & 1061 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ option(WITH_ONEMKL "Compile PaddlePaddle with oneMKL" OFF)
4343
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
4444
option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
4545
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
46+
option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF)
4647
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
4748
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
4849
option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF)
@@ -64,6 +65,9 @@ endif()
6465
if (WITH_GPU AND WITH_ROCM)
6566
message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time")
6667
endif()
68+
if (WITH_GPU AND WITH_MLU)
69+
message(FATAL_ERROR "Error when compile GPU and MLU at the same time")
70+
endif()
6771

6872
if(WITH_GPU AND NOT APPLE)
6973
enable_language(CUDA)
@@ -302,6 +306,10 @@ if(WITH_GPU)
302306
endif()
303307
endif()
304308

309+
if(WITH_MLU)
310+
include(neuware)
311+
endif()
312+
305313
if(WITH_ROCM)
306314
include(hip)
307315
include(miopen) # set miopen libraries, must before configure

cmake/configure.cmake

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,11 @@ if(WITH_IPU)
102102
add_definitions(-DPADDLE_WITH_IPU)
103103
endif()
104104

105+
if(WITH_MLU)
106+
message(STATUS "Compile with MLU!")
107+
add_definitions(-DPADDLE_WITH_MLU)
108+
endif()
109+
105110
if(WITH_GPU)
106111
add_definitions(-DPADDLE_WITH_CUDA)
107112
add_definitions(-DEIGEN_USE_GPU)
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
include(ExternalProject)
16+
17+
set(CONCURRENTQUEUE_PROJECT "extern_concurrentqueue")
18+
set(CONCURRENTQUEUE_VER "v1.0.3")
19+
SET(CONCURRENTQUEUE_URL_MD5 118e5bb661b567634647312991e10222)
20+
set(CONCURRENTQUEUE_PREFIX_URL "https://github.com/cameron314/concurrentqueue/archive/refs/tags")
21+
set(CONCURRENTQUEUE_URL "${CONCURRENTQUEUE_PREFIX_URL}/${CONCURRENTQUEUE_VER}.tar.gz")
22+
23+
MESSAGE(STATUS "CONCURRENTQUEUE_VERSION: ${CONCURRENTQUEUE_VER}, CONCURRENTQUEUE_URL: ${CONCURRENTQUEUE_URL}")
24+
25+
set(CONCURRENTQUEUE_PREFIX_DIR ${THIRD_PARTY_PATH}/concurrentqueue)
26+
set(CONCURRENTQUEUE_SOURCE_DIR ${THIRD_PARTY_PATH}/concurrentqueue/src/)
27+
set(CONCURRENTQUEUE_INCLUDE_DIR "${CONCURRENTQUEUE_SOURCE_DIR}/extern_concurrentqueue")
28+
29+
ExternalProject_Add(
30+
${CONCURRENTQUEUE_PROJECT}
31+
${EXTERNAL_PROJECT_LOG_ARGS}
32+
URL ${CONCURRENTQUEUE_URL}
33+
URL_MD5 ${CONCURRENTQUEUE_URL_MD5}
34+
PREFIX ${CONCURRENTQUEUE_PREFIX_DIR}
35+
DOWNLOAD_NO_PROGRESS 1
36+
CONFIGURE_COMMAND ""
37+
BUILD_COMMAND ""
38+
INSTALL_COMMAND ""
39+
UPDATE_COMMAND ""
40+
)
41+
42+
include_directories(${CONCURRENTQUEUE_INCLUDE_DIR})

cmake/neuware.cmake

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
if(NOT WITH_MLU)
2+
return()
3+
endif()
4+
5+
if("$ENV{NEUWARE_HOME}" STREQUAL "") # if(ENV{VAR}) is always false in CMake, so compare the expanded value; fall back to the default path only when the variable is unset or empty
6+
set(NEUWARE_HOME "/usr/local/neuware")
7+
else()
8+
set(NEUWARE_HOME $ENV{NEUWARE_HOME})
9+
endif()
10+
message(STATUS "NEUWARE_HOME: " ${NEUWARE_HOME})
11+
12+
set(NEUWARE_INCLUDE_DIR ${NEUWARE_HOME}/include)
13+
set(NEUWARE_LIB_DIR ${NEUWARE_HOME}/lib64)
14+
15+
INCLUDE_DIRECTORIES(${NEUWARE_INCLUDE_DIR})
16+
17+
set(CNNL_LIB ${NEUWARE_LIB_DIR}/libcnnl.so)
18+
set(CNRT_LIB ${NEUWARE_LIB_DIR}/libcnrt.so)
19+
set(CNDRV_LIB ${NEUWARE_LIB_DIR}/libcndrv.so)
20+
21+
generate_dummy_static_lib(LIB_NAME "neuware_lib" GENERATOR "neuware.cmake")
22+
TARGET_LINK_LIBRARIES(neuware_lib ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB})

cmake/operators.cmake

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ function(op_library TARGET)
1212
set(hip_cc_srcs)
1313
set(xpu_cc_srcs)
1414
set(npu_cc_srcs)
15+
set(mlu_cc_srcs)
1516
set(cudnn_cu_cc_srcs)
1617
set(miopen_cu_cc_srcs)
1718
set(cudnn_cu_srcs)
@@ -24,6 +25,10 @@ function(op_library TARGET)
2425
if (WITH_ASCEND_CL)
2526
set(op_common_deps ${op_common_deps} npu_op_runner)
2627
endif()
28+
if (WITH_MLU)
29+
set(op_common_deps ${op_common_deps} mlu_baseop)
30+
endif()
31+
2732
# Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build.
2833
set(options UNITY)
2934
set(oneValueArgs "")
@@ -98,6 +103,12 @@ function(op_library TARGET)
98103
list(APPEND npu_cc_srcs ${NPU_FILE}.cc)
99104
endif()
100105
endif()
106+
if(WITH_MLU)
107+
string(REPLACE "_op" "_op_mlu" MLU_FILE "${TARGET}")
108+
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MLU_FILE}.cc)
109+
list(APPEND mlu_cc_srcs ${MLU_FILE}.cc)
110+
endif()
111+
endif()
101112
else()
102113
foreach(src ${op_library_SRCS})
103114
if(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu$")
@@ -122,6 +133,8 @@ function(op_library TARGET)
122133
list(APPEND xpu_cc_srcs ${src})
123134
elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
124135
list(APPEND npu_cc_srcs ${src})
136+
elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")
137+
list(APPEND mlu_cc_srcs ${src})
125138
elseif(${src} MATCHES ".*\\.cc$")
126139
list(APPEND cc_srcs ${src})
127140
else()
@@ -196,7 +209,7 @@ function(op_library TARGET)
196209
# Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
197210
if(WITH_UNITY_BUILD AND op_library_UNITY)
198211
# Combine the cc source files.
199-
compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs})
212+
compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} ${mlu_cc_srcs})
200213
if(TARGET ${UNITY_TARGET})
201214
# If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`.
202215
target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources})
@@ -207,7 +220,7 @@ function(op_library TARGET)
207220
# Add alias library to handle dependencies.
208221
add_library(${TARGET} ALIAS ${UNITY_TARGET})
209222
else()
210-
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} DEPS ${op_library_DEPS}
223+
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} ${mlu_cc_srcs} DEPS ${op_library_DEPS}
211224
${op_common_deps})
212225
endif()
213226
endif()
@@ -262,8 +275,10 @@ function(op_library TARGET)
262275
list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
263276
list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len)
264277
list(LENGTH npu_cc_srcs npu_cc_srcs_len)
278+
list(LENGTH mlu_cc_srcs mlu_cc_srcs_len)
265279
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
266-
${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0 AND ${npu_cc_srcs_len} EQUAL 0)
280+
${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0 AND
281+
${npu_cc_srcs_len} EQUAL 0 AND ${mlu_cc_srcs_len} EQUAL 0)
267282
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
268283
set(pybind_flag 1)
269284
endif()
@@ -322,6 +337,24 @@ function(op_library TARGET)
322337
endif()
323338
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${NPU_TARGET}, NPU);\n")
324339
endif()
340+
if (WITH_MLU AND ${mlu_cc_srcs_len} GREATER 0)
341+
file(READ ${ORIGINAL_TARGET}_mlu.cc TARGET_MLU_CONTENT)
342+
# This differs from the logic above, so be careful.
343+
string(REGEX MATCH "REGISTER_OP_MLU_KERNEL\\(.*" multi_mlu_register "${TARGET_MLU_CONTENT}")
344+
# [ \t\r\n]* is used for blank characters
345+
string(REGEX MATCH "REGISTER_OP_MLU_KERNEL\\([ \t\r\n]*[a-z0-9_]*," one_mlu_register "${multi_mlu_register}")
346+
347+
if (one_mlu_register STREQUAL "")
348+
string(REPLACE "_op" "" MLU_TARGET "${TARGET}")
349+
else ()
350+
string(REPLACE "REGISTER_OP_MLU_KERNEL(" "" MLU_TARGET "${one_mlu_register}")
351+
string(REPLACE "," "" MLU_TARGET "${MLU_TARGET}")
352+
# [ \t\r\n]+ is used for blank characters.
353+
# Here we use '+' instead of '*' since it is a REPLACE operation.
354+
string(REGEX REPLACE "[ \t\r\n]+" "" MLU_TARGET "${MLU_TARGET}")
355+
endif()
356+
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${MLU_TARGET}, MLU);\n")
357+
endif()
325358

326359
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
327360
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
@@ -369,11 +402,11 @@ function(register_operators)
369402
set(multiValueArgs EXCLUDES DEPS)
370403
cmake_parse_arguments(register_operators "${options}" "${oneValueArgs}"
371404
"${multiValueArgs}" ${ARGN})
372-
373405
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
374406
string(REPLACE "_mkldnn" "" OPS "${OPS}")
375407
string(REPLACE "_xpu" "" OPS "${OPS}")
376408
string(REPLACE "_npu" "" OPS "${OPS}")
409+
string(REPLACE "_mlu" "" OPS "${OPS}")
377410
string(REPLACE ".cc" "" OPS "${OPS}")
378411
list(REMOVE_DUPLICATES OPS)
379412
list(LENGTH register_operators_DEPS register_operators_DEPS_len)

cmake/third_party.cmake

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,11 @@ if(WITH_XPU)
278278
list(APPEND third_party_deps extern_xpu)
279279
endif(WITH_XPU)
280280

281+
if(WITH_MLU)
282+
include(external/concurrentqueue) # download, build, install concurrentqueue
283+
list(APPEND third_party_deps extern_concurrentqueue)
284+
endif(WITH_MLU)
285+
281286
if(WITH_PSLIB)
282287
include(external/pslib) # download, build, install pslib
283288
list(APPEND third_party_deps extern_pslib)

paddle/fluid/distributed/fleet_executor/carrier.cc

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -240,13 +240,12 @@ void Carrier::CreateInterceptors() {
240240
task_node->run_at_offset(), task_node->run_per_steps()));
241241

242242
std::unique_ptr<Interceptor> interceptor;
243-
if (task_node->type().empty()) {
244-
// TODO(wangxi): delete this in future
245-
interceptor.reset(new Interceptor(interceptor_id, task_node));
246-
} else {
247-
interceptor = InterceptorFactory::Create(task_node->type(),
248-
interceptor_id, task_node);
249-
}
243+
PADDLE_ENFORCE_NE(task_node->type().empty(), true,
244+
platform::errors::NotFound(
245+
"Cannot found type for task node with id %lld",
246+
task_node->task_id()));
247+
interceptor = InterceptorFactory::Create(task_node->type(), interceptor_id,
248+
task_node);
250249
interceptor->SetPlace(place_);
251250
interceptor->SetMiniBatchScope(minibatch_scope_);
252251
interceptor->SetMicroBatchScope(microbatch_scopes_);

paddle/fluid/distributed/fleet_executor/fleet_executor.cc

Lines changed: 21 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -48,32 +48,29 @@ void FleetExecutor::Init(
4848
const framework::ProgramDesc& program_desc, framework::Scope* scope,
4949
const platform::Place& place, const std::vector<TaskNode*>& task_nodes,
5050
const std::unordered_map<int64_t, int64_t>& task_id_to_rank) {
51-
if (task_nodes.size() == 0) {
52-
LOG(INFO) << "fleet executor will use c++ side scheduler construction.";
53-
runtime_graph_ = std::make_shared<RuntimeGraph>(program_desc, exe_desc_);
54-
} else {
55-
LOG(INFO) << "fleet executor has been set dependency on python side.";
56-
// TODO(fleet_exe devs): the unused_vars should be got from run time graph
57-
std::vector<std::unique_ptr<framework::OperatorBase>> ops;
58-
for (auto task_node : task_nodes) {
59-
for (auto op : task_node->ops()) {
60-
ops.emplace_back(std::unique_ptr<framework::OperatorBase>(op));
61-
}
62-
}
63-
auto unused_vars = framework::GetUnusedVars(program_desc.Block(0), ops, {});
64-
runtime_graph_ = std::make_shared<RuntimeGraph>();
65-
std::unordered_map<int64_t, TaskNode*> interceptor_id_to_task;
66-
for (auto task_node : task_nodes) {
67-
task_node->SetUnusedVars(unused_vars);
68-
int64_t interceptor_id = task_node->task_id();
69-
interceptor_id_to_task.emplace(interceptor_id, task_node);
70-
}
71-
runtime_graph_->SetInterceptorIdToRank(task_id_to_rank);
72-
runtime_graph_->SetInterceptorIdToNode(interceptor_id_to_task);
73-
for (auto& unique_op : ops) {
74-
unique_op.release();
51+
PADDLE_ENFORCE_GT(task_nodes.size(), 0,
52+
platform::errors::InvalidArgument(
53+
"Fleet executor is inited with empty task node"));
54+
// TODO(fleet_exe devs): the unused_vars should be got from run time graph
55+
std::vector<std::unique_ptr<framework::OperatorBase>> ops;
56+
for (auto task_node : task_nodes) {
57+
for (auto op : task_node->ops()) {
58+
ops.emplace_back(std::unique_ptr<framework::OperatorBase>(op));
7559
}
7660
}
61+
auto unused_vars = framework::GetUnusedVars(program_desc.Block(0), ops, {});
62+
runtime_graph_ = std::make_shared<RuntimeGraph>();
63+
std::unordered_map<int64_t, TaskNode*> interceptor_id_to_task;
64+
for (auto task_node : task_nodes) {
65+
task_node->SetUnusedVars(unused_vars);
66+
int64_t interceptor_id = task_node->task_id();
67+
interceptor_id_to_task.emplace(interceptor_id, task_node);
68+
}
69+
runtime_graph_->SetInterceptorIdToRank(task_id_to_rank);
70+
runtime_graph_->SetInterceptorIdToNode(interceptor_id_to_task);
71+
for (auto& unique_op : ops) {
72+
unique_op.release();
73+
}
7774
root_scope_ = scope;
7875
place_ = place;
7976
PADDLE_ENFORCE_NOT_NULL(root_scope_, platform::errors::InvalidArgument(

paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,5 @@ message RankInfo {
2323
message FleetExecutorDesc {
2424
optional int64 cur_rank = 1 [ default = 0 ]; // Rank id of current processor
2525
repeated RankInfo cluster_info = 2;
26-
optional int32 dp_degree = 3 [ default = 1 ];
27-
optional int32 mp_degree = 4 [ default = 1 ];
28-
optional int32 pp_degree = 5 [ default = 1 ];
29-
optional int64 num_micro_batches = 6 [ default = 1 ];
30-
optional int64 num_slots = 7 [ default = 1 ];
26+
optional int64 num_micro_batches = 3 [ default = 1 ];
3127
}

0 commit comments

Comments
 (0)