Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion paddle/fluid/framework/details/build_strategy.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ struct BuildStrategy {
bool fuse_elewise_add_act_ops_{false};
// Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients
// should not be sparse types
bool fuse_all_optimizer_ops_{false};
bool fuse_all_optimizer_ops_{true};
bool fuse_all_reduce_ops_{false};
// fuse_relu_depthwise_conv can fuse the `relu ->
// depthwise_conv`
Expand Down
20 changes: 4 additions & 16 deletions paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,14 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/device_memory_aligment.h"
#include "paddle/fluid/platform/profiler.h"

DEFINE_bool(skip_fused_all_reduce_check, false, "");
namespace paddle {
namespace framework {
namespace details {

// Note(zcd): Addresses should be aligned, otherwise, the results may have
// diff.
static size_t Alignment(size_t size, const platform::Place &place) {
// Allow to allocate the minimum chunk size is 4 KB.
size_t alignment = 1 << 12;
if (platform::is_gpu_place(place)) {
// Allow to allocate the minimum chunk size is 256 B.
alignment = 1 << 8;
}
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}

typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
GradientAndLoDTensor;

Expand Down Expand Up @@ -121,7 +109,7 @@ void FusedAllReduceOpHandle::RunImpl() {
for (size_t k = 1; k < g_tensor.size(); ++k) {
const void *cur_address = g_tensor.at(k - 1).second->data<void>();
int64_t len = g_tensor.at(k - 1).second->numel();
auto offset = Alignment(len * size_of_dtype, places_[0]);
auto offset = platform::Alignment(len * size_of_dtype, places_[0]);
void *infer_next_address = reinterpret_cast<void *>(
reinterpret_cast<uintptr_t>(cur_address) + offset);
const void *next_address = g_tensor.at(k).second->data<void>();
Expand Down Expand Up @@ -241,8 +229,8 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
// Get element number
int64_t len = grad_tensor.at(i).second->numel();
PADDLE_ENFORCE_GT(len, 0);
// Alignment(len)
*numel += Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
*numel +=
platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
}
}

Expand Down
3 changes: 3 additions & 0 deletions paddle/fluid/framework/details/multi_devices_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads;
constexpr char kParamsAndDenseGrads[] = "params_and_dense_grads";
constexpr char kParamsAndSparseGrads[] = "params_and_sparse_grads";

typedef std::vector<ProgramDesc> ProgramDescs;
constexpr char kProgramDescs[] = "program_descs";

typedef std::vector<std::vector<std::pair<std::string, std::string>>>
GroupParamsAndGrads;
constexpr char kGroupParamsAndGrads[] = "group_params_grads";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/profiler.h"

Expand Down Expand Up @@ -70,6 +72,29 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() {
InitializeVariable(pair.first, pair.second);
}
}

const ir::Graph &graph = Graph();
if (graph.Has(details::kProgramDescs)) {
auto &program_descs =
graph.Get<details::ProgramDescs>(details::kProgramDescs);
// Init vars
auto &fused_grad_vars = graph.Get<details::FusedVars>(details::kFusedVars);
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
for (auto &var_name : fused_grad_vars) {
auto var = local_exec_scopes_[i]->Var(var_name);
var->GetMutable<LoDTensor>();
}
}

for (auto &program_desc : program_descs) {
for (auto &op_desc : program_desc.Block(0).AllOps()) {
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_exec_scopes_[i], places_[i]);
}
}
}
}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Define the fused variables in the local execution scope.
Because for some model, there may be more than one program, and those programs may share some parameters, for the previous strategy, the gradients of the shared parameters of those programs are also shared, But this is somewhat problematic, so we should define those fused variables of gradients in the local execution scope.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy these line to Comments may be better
And which is the unit test?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

}

void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
if (node->Op()->Type() == fuse_op_type) {
auto grad_name = node->Op()->Input(kGrad);
PADDLE_ENFORCE_EQ(grad_name.size(), static_cast<size_t>(1));
if (IsLoDTensorType(GettypeOfVar(vars_info, grad_name[0]))) {
if (IsLoDTensorType(GetTypeOfVar(vars_info, grad_name[0]))) {
opt_nodes.emplace_back(node);
}
++opt_ops_num;
Expand All @@ -61,6 +61,9 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
}
result.Set(details::kFusedOptType, new details::FusedOptType);
result.Get<details::FusedOptType>(details::kFusedOptType) = fuse_op_type;
if (!result.Has(details::kProgramDescs)) {
result.Set(details::kProgramDescs, new details::ProgramDescs);
}

// Step 2: Insert fused_var_name to FusedVars, and the FusedVars need be
// initialized in scopes before execution.
Expand Down Expand Up @@ -153,7 +156,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
}
aux_var_names.pop_back();
InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names,
aux_var_set, fused_vars_name);
aux_var_set, fused_vars_name, &result);

// Step 5: Fuse optimizer Ops and Scale Ops
FuseOptimizerOps(aux_var_set, fused_vars_name, opt_nodes, &result);
Expand Down Expand Up @@ -203,37 +206,21 @@ void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
PADDLE_ENFORCE(iter != vars_info.end());
PADDLE_ENFORCE(!iter->second.empty());
PADDLE_ENFORCE_NOT_NULL(iter->second.front()->Var());
PADDLE_ENFORCE(
iter->second.front()->Var()->GetType() == proto::VarType::LOD_TENSOR,
"Currently the gradient type only should be LoDTensor when "
"fusing optimizer ops.");
PADDLE_ENFORCE(IsLoDTensorType(iter->second.front()->Var()->GetType()),
"Currently the gradient type only should be LoDTensor when "
"fusing optimizer ops.");
for (auto var : iter->second) {
var->Var()->SetPersistable(true);
}
}

// Init Grads
for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) {
auto &scope = *it;
VLOG(6) << "Init: " << fused_grad_name;
PADDLE_ENFORCE(scope->FindVar(fused_grad_name) == nullptr,
"%s has existed in scope.", fused_grad_name);
scope->Var(fused_grad_name)->GetMutable<LoDTensor>();
for (auto &grad_var_name : grads) {
auto iter = vars_info.find(grad_var_name);
PADDLE_ENFORCE(iter != vars_info.end());
PADDLE_ENFORCE(!iter->second.empty());
PADDLE_ENFORCE_NOT_NULL(iter->second.front()->Var());
scope->Var(grad_var_name)->GetMutable<LoDTensor>();
}
}
// Define Ops
ProgramDesc program_desc;
result->Get<details::ProgramDescs>(details::kProgramDescs).emplace_back();
ProgramDesc &program_desc =
result->Get<details::ProgramDescs>(details::kProgramDescs).back();
auto *global_block = program_desc.MutableBlock(0);
AppendAllocContinuousSpace(params, grads, fused_grad_name, global_block,
false, false);
// Run Ops
RunInitOps(places, local_scopes, *global_block);
}

std::unordered_map<std::string, std::vector<Node *>>
Expand All @@ -255,7 +242,7 @@ bool FuseOptimizerOpPass::IsLoDTensorType(
return type == proto::VarType::LOD_TENSOR;
}

proto::VarType::Type FuseOptimizerOpPass::GettypeOfVar(
proto::VarType::Type FuseOptimizerOpPass::GetTypeOfVar(
const std::unordered_map<std::string, std::vector<Node *>> &var_nodes,
const std::string &name) const {
auto grad_iter = var_nodes.find(name);
Expand All @@ -271,47 +258,18 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
const std::vector<std::string> &aux_var_names,
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name) const {
// Init Vars
for (auto &var_name : aux_var_names) {
auto &fused_var_name = fused_vars_name.at(var_name);
InitVars(local_scopes, fused_var_name);
}
const std::unordered_map<std::string, std::string> &fused_vars_name,
ir::Graph *result) const {
// Define Ops
ProgramDesc program_desc;
result->Get<details::ProgramDescs>(details::kProgramDescs).emplace_back();
ProgramDesc &program_desc =
result->Get<details::ProgramDescs>(details::kProgramDescs).back();
auto *global_block = program_desc.MutableBlock(0);
for (auto &var_name : aux_var_names) {
AppendAllocContinuousSpace(
aux_var_set.at(var_name), aux_var_set.at(var_name),
fused_vars_name.at(var_name), global_block, true);
}
// Run Ops
RunInitOps(places, local_scopes, *global_block);
}

void FuseOptimizerOpPass::RunInitOps(const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const BlockDesc &global_block) const {
for (size_t i = 0; i < local_scopes.size(); ++i) {
for (auto &op_desc : global_block.AllOps()) {
auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_scopes[i], places[i]);
}
}
}

void FuseOptimizerOpPass::InitVars(const std::vector<Scope *> &local_scopes,
const std::string &fused_var_name) const {
// Alloc parameters and auxiliary vars in the respective scope.
size_t idx = local_scopes.size();
for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend();
++iter, --idx) {
auto &scope = *iter;
VLOG(6) << "Init: " << fused_var_name;
PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
"%s has exist in scope[%d]", fused_var_name, idx);
scope->Var(fused_var_name)->GetMutable<LoDTensor>();
}
}

void FuseOptimizerOpPass::SortParametersAndAuxVars(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,20 +79,13 @@ class FuseOptimizerOpPass : public ir::Pass {
const std::vector<std::string> &aux_var_names,
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name)
const;

void RunInitOps(const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const BlockDesc &global_block) const;

void InitVars(const std::vector<Scope *> &local_scopes,
const std::string &fused_var_name) const;
const std::unordered_map<std::string, std::string> &fused_vars_name,
ir::Graph *result) const;

std::unordered_map<std::string, std::vector<Node *>> GetVarInfo(
const Graph &result) const;

proto::VarType::Type GettypeOfVar(
proto::VarType::Type GetTypeOfVar(
const std::unordered_map<std::string, std::vector<Node *>> &var_nodes,
const std::string &name) const;

Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/operators/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code
if (WITH_GPU)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
endif()
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment)

# FIXME(typhoonzero): operator deps may not needed.
# op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
Expand Down
24 changes: 7 additions & 17 deletions paddle/fluid/operators/coalesce_tensor_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_memory_aligment.h"

namespace paddle {
namespace operators {
Expand Down Expand Up @@ -86,8 +87,8 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx,
&sub_tensor);

offset +=
Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype;
offset += platform::Alignment(len * size_of_dtype, context.GetPlace()) /
size_of_dtype;
}
} else if (context.Attr<bool>("set_constant")) {
math::SetConstant<DeviceContext, T> set_constant;
Expand All @@ -106,7 +107,8 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
->ShareDataWith(fused_tensor->Slice(
static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
.Resize(dim);
len = Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype;
len = platform::Alignment(len * size_of_dtype, context.GetPlace()) /
size_of_dtype;
offset += len;
ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")"
<< " address: " << out_tensors[i]->data<void>() << ", ";
Expand All @@ -115,19 +117,6 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
}

private:
// Note(zcd): Addresses should be aligned, otherwise, the results may have
// diff.
size_t Alignment(size_t size, const platform::Place &place) const {
// Allow to allocate the minimum chunk size is 4 KB.
size_t alignment = 1 << 12;
if (platform::is_gpu_place(place)) {
// Allow to allocate the minimum chunk size is 256 B.
alignment = 1 << 8;
}
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}

void GetMemSizeAndDtype(
const std::vector<const framework::LoDTensor *> &lod_tensors,
const std::vector<std::string> var_names, size_t *numel,
Expand Down Expand Up @@ -156,7 +145,8 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
PADDLE_ENFORCE_GT(size, 0);
ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
<< "), ";
*numel += Alignment(static_cast<size_t>(size) * size_of_dtype, place) /
*numel += platform::Alignment(static_cast<size_t>(size) * size_of_dtype,
place) /
size_of_dtype;
}

Expand Down
8 changes: 4 additions & 4 deletions paddle/fluid/platform/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -102,17 +102,17 @@ cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_pri
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
if(WITH_GPU)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
else()
cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
endif()
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)

nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)

IF(WITH_GPU)
nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
ENDIF()
nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)

if(WITH_GPU)
Expand Down
34 changes: 34 additions & 0 deletions paddle/fluid/platform/device_memory_aligment.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/device_memory_aligment.h"

namespace paddle {
namespace platform {
size_t Alignment(size_t size, const platform::Place &place) {
size_t alignment = 1024;
if (platform::is_cpu_place(place)) {
alignment = CpuMinChunkSize();
} else {
#ifdef PADDLE_WITH_CUDA
alignment = GpuMinChunkSize();
#else
PADDLE_THROW("Fluid is not compiled with CUDA");
#endif
}
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
} // namespace platform
} // namespace paddle
Loading