14 changes: 14 additions & 0 deletions paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc
@@ -21,6 +21,7 @@
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/api/all.h"
#include "paddle/phi/api/lib/api_custom_impl.h"

@@ -34,6 +35,19 @@ AddNGradNodeFinal::operator()(
bool is_new_grad) {
// Fill Zero For GradIn Tensors

// This 'Local_XXXGradNode' record event is different from the
// 'Global_XXXGradNode' event.
// * 'Local_XXXGradNode' only covers the execution time of this function.
// * 'Global_XXXGradNode' covers not only the execution time of this function,
//   but also the gradient accumulation that happens when the output(s) of the
//   corresponding forward OP are shared by other OP(s), so it may carry extra
//   accumulation overhead compared with 'Local_XXXGradNode'.
paddle::platform::RecordEvent node_execution_inner(
"Local_AddNGradNodeFinal",
paddle::platform::TracerEventType::OperatorInner,
1);

// Apply Gradient Hooks
auto hooked_grads = ApplyGradientHooks(grads);

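For readers unfamiliar with how the two event scopes relate at runtime, the minimal C++ sketch below (not part of this PR; `FooGradNode` and both function names are made up for illustration) shows the intended nesting: the `Global_*` event is opened in `RunBackward()` around the node call, while the `Local_*` event is opened inside the node's `operator()`, so the local scope always nests inside the global one.

// Minimal sketch, assuming the RAII RecordEvent usage shown in this PR.
#include <string>
#include "paddle/fluid/platform/profiler/event_tracing.h"

void RunFooGradNode() {
  // Opened inside the grad node's operator(): covers only this function.
  paddle::platform::RecordEvent local_event(
      "Local_FooGradNode",
      paddle::platform::TracerEventType::OperatorInner,
      1);
  // ... backward kernel work ...
}

void RunOneBackwardStep() {
  // Opened in RunBackward() before invoking the node: also covers gradient
  // accumulation for forward outputs shared by other OPs.
  paddle::platform::RecordEvent global_event(
      "Global_" + std::string("FooGradNode"),
      paddle::platform::TracerEventType::Operator,
      1);
  RunFooGradNode();  // Local_FooGradNode nests inside Global_FooGradNode.
  // ... gradient accumulation runs here, still inside the global scope ...
}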
26 changes: 26 additions & 0 deletions paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc
@@ -38,6 +38,19 @@ Conv2dGradNodeFinal::operator()(
bool is_new_grad) {
// Fill Zero For GradIn Tensors
VLOG(3) << " Running Conv2dGradNodeFinal: " << this;
// This 'Local_XXXGradNode' record event is different from the
// 'Global_XXXGradNode' event.
// * 'Local_XXXGradNode' only covers the execution time of this function.
// * 'Global_XXXGradNode' covers not only the execution time of this function,
//   but also the gradient accumulation that happens when the output(s) of the
//   corresponding forward OP are shared by other OP(s), so it may carry extra
//   accumulation overhead compared with 'Local_XXXGradNode'.
paddle::platform::RecordEvent node_execution_inner(
"Local_Conv2dGradNodeFinal",
paddle::platform::TracerEventType::OperatorInner,
1);

// Apply Gradient Hooks
auto hooked_grads = ApplyGradientHooks(grads);

@@ -208,6 +221,19 @@ Conv2dDoubleGradNodeFinal::operator()(
egr::kSlotSmallVectorSize>& grads,
bool create_graph,
bool is_new_grad) {
// This 'Local_XXXGradNode' record event is different from the
// 'Global_XXXGradNode' event.
// * 'Local_XXXGradNode' only covers the execution time of this function.
// * 'Global_XXXGradNode' covers not only the execution time of this function,
//   but also the gradient accumulation that happens when the output(s) of the
//   corresponding forward OP are shared by other OP(s), so it may carry extra
//   accumulation overhead compared with 'Local_XXXGradNode'.
paddle::platform::RecordEvent node_execution_inner(
"Local_Conv2dDoubleGradNodeFinal",
paddle::platform::TracerEventType::OperatorInner,
1);

// Fill Zero For GradIn Tensors
const auto& input_metas = this->InputMeta();
egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&grads[0][0],
39 changes: 39 additions & 0 deletions paddle/fluid/eager/api/manual/eager_manual/nodes/multiply_node.cc
@@ -41,6 +41,19 @@ MultiplyGradNode::operator()(
bool is_new_grad) {
VLOG(3) << "Running AD API GRAD: "
<< "multiply_grad";
// This 'Local_XXXGradNode' record event is different from the
// 'Global_XXXGradNode' event.
// * 'Local_XXXGradNode' only covers the execution time of this function.
// * 'Global_XXXGradNode' covers not only the execution time of this function,
//   but also the gradient accumulation that happens when the output(s) of the
//   corresponding forward OP are shared by other OP(s), so it may carry extra
//   accumulation overhead compared with 'Local_XXXGradNode'.
paddle::platform::RecordEvent node_execution_inner(
"Local_MultiplyGradNode",
paddle::platform::TracerEventType::OperatorInner,
1);

// Fill Zero For GradIn Tensors
const auto& input_metas = this->InputMeta();
egr::EagerUtils::FillZeroForEmptyGradInput(&grads[0][0], input_metas[0][0]);
@@ -245,6 +258,19 @@ MultiplyDoubleGradNode::operator()(
bool is_new_grad) {
VLOG(3) << "Running AD API GRAD: "
<< "multiply_double_grad";
// This 'Local_XXXGradNode' record event is different from the
// 'Global_XXXGradNode' event.
// * 'Local_XXXGradNode' only covers the execution time of this function.
// * 'Global_XXXGradNode' covers not only the execution time of this function,
//   but also the gradient accumulation that happens when the output(s) of the
//   corresponding forward OP are shared by other OP(s), so it may carry extra
//   accumulation overhead compared with 'Local_XXXGradNode'.
paddle::platform::RecordEvent node_execution_inner(
"Local_MultiplyDoubleGradNode",
paddle::platform::TracerEventType::OperatorInner,
1);

// Fill Zero For GradIn Tensors
const auto& input_metas = this->InputMeta();
egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&grads[0][0],
@@ -505,6 +531,19 @@ MultiplyGradNode::operator()(
bool is_new_grad) {
VLOG(3) << "Running AD API GRAD: "
<< "multiply_grad";
// This 'Local_XXXGradNode' record event is different from the
// 'Global_XXXGradNode' event.
// * 'Local_XXXGradNode' only covers the execution time of this function.
// * 'Global_XXXGradNode' covers not only the execution time of this function,
//   but also the gradient accumulation that happens when the output(s) of the
//   corresponding forward OP are shared by other OP(s), so it may carry extra
//   accumulation overhead compared with 'Local_XXXGradNode'.
paddle::platform::RecordEvent node_execution_inner(
"Local_MultiplyGradNode",
paddle::platform::TracerEventType::OperatorInner,
1);

// Fill Zero For GradIn Tensors
const auto& input_metas = this->InputMeta();
egr::EagerUtils::FillZeroForEmptyGradInput(&grads[0][0], input_metas[0][0]);
13 changes: 13 additions & 0 deletions paddle/fluid/eager/api/manual/eager_manual/nodes/reshard_node.cc
@@ -18,6 +18,7 @@
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"

paddle::small_vector<std::vector<paddle::Tensor>,
egr::kSlotSmallVectorSize> // NOLINT
@@ -29,6 +30,18 @@ ReshardGradNode::operator()(
#ifdef PADDLE_WITH_DISTRIBUTE
VLOG(3) << "Running AD API GRAD: "
<< "reshard_grad";
// This 'Local_XXXGradNode' record event is different from the
// 'Global_XXXGradNode' event.
// * 'Local_XXXGradNode' only covers the execution time of this function.
// * 'Global_XXXGradNode' covers not only the execution time of this function,
//   but also the gradient accumulation that happens when the output(s) of the
//   corresponding forward OP are shared by other OP(s), so it may carry extra
//   accumulation overhead compared with 'Local_XXXGradNode'.
paddle::platform::RecordEvent node_execution_inner(
"Local_ReshardGradNode",
paddle::platform::TracerEventType::OperatorInner,
1);

// Apply Gradient Hooks
auto hooked_grad = ApplyGradientHooks(grads);
@@ -21,6 +21,7 @@
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/api/all.h"
#include "paddle/phi/api/backward/backward_api.h"
#include "paddle/phi/api/backward/sparse_bw_api.h"
@@ -37,6 +38,19 @@ SyncBatchNormGradNode::operator()(
bool is_new_grad) {
VLOG(3) << "Running AD API GRAD: "
<< "sync_batch_norm_grad";
// This 'Local_XXXGradNode' record event is different from the
// 'Global_XXXGradNode' event.
// * 'Local_XXXGradNode' only covers the execution time of this function.
// * 'Global_XXXGradNode' covers not only the execution time of this function,
//   but also the gradient accumulation that happens when the output(s) of the
//   corresponding forward OP are shared by other OP(s), so it may carry extra
//   accumulation overhead compared with 'Local_XXXGradNode'.
paddle::platform::RecordEvent node_execution_inner(
"Local_SyncBatchNormGradNode",
paddle::platform::TracerEventType::OperatorInner,
1);

// Fill Zero For GradIn Tensors

// Apply Gradient Hooks
@@ -256,6 +270,19 @@ SyncBatchNormGradNode::operator()(
bool is_new_grad) {
VLOG(3) << "Running AD API GRAD: "
<< "sync_batch_norm_grad";
// This 'Local_XXXGradNode' record event is different from the
// 'Global_XXXGradNode' event.
// * 'Local_XXXGradNode' only covers the execution time of this function.
// * 'Global_XXXGradNode' covers not only the execution time of this function,
//   but also the gradient accumulation that happens when the output(s) of the
//   corresponding forward OP are shared by other OP(s), so it may carry extra
//   accumulation overhead compared with 'Local_XXXGradNode'.
paddle::platform::RecordEvent node_execution_inner(
"Local_SyncBatchNormGradNode",
paddle::platform::TracerEventType::OperatorInner,
1);

// Fill Zero For GradIn Tensors

// Apply Gradient Hooks
7 changes: 7 additions & 0 deletions paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
@@ -209,6 +209,12 @@ class {} : public egr::GradNodeBase {{
paddle::small_vector<std::vector<paddle::Tensor>, egr::kSlotSmallVectorSize> {}::operator()(paddle::small_vector<std::vector<paddle::Tensor>, egr::kSlotSmallVectorSize>& grads, bool create_graph, bool is_new_grad) {{
VLOG(3) << \"Running AD API GRAD: \" << \"{}\";

// This 'Local_XXXGradNode' record event is different from the 'Global_XXXGradNode' event.
// * 'Local_XXXGradNode' only covers the execution time of this function.
// * 'Global_XXXGradNode' covers not only the execution time of this function, but also the gradient
//   accumulation that happens when the output(s) of the corresponding forward OP are shared by other OP(s),
//   so it may carry extra accumulation overhead compared with 'Local_XXXGradNode'.
paddle::platform::RecordEvent grad_node_record_event_inner(\"Local_{}\", paddle::platform::TracerEventType::OperatorInner, 1);

// Fill Zero For GradIn Tensors
{}
// Apply Gradient Hooks
@@ -2787,6 +2793,7 @@ def _gen_api_call_code_block(
self.node_definition_str = GRAD_FUNCTION_TEMPLATE.format(
grad_node_name,
self.backward_api_name,
grad_node_name,
fill_zero_str,
get_grad_in_args_str,
grad_function_prepare_str,
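For reference, a hand-expanded sketch of what the updated template head would produce is shown below; `TanhGradNode` and `tanh_grad` are hypothetical placeholder values, used only to show how the extra `grad_node_name` format argument fills the new `Local_{}` placeholder. This is not generator output, just an illustration of the substitution.

// Hand-expanded sketch of GRAD_FUNCTION_TEMPLATE for a hypothetical node.
paddle::small_vector<std::vector<paddle::Tensor>, egr::kSlotSmallVectorSize>
TanhGradNode::operator()(
    paddle::small_vector<std::vector<paddle::Tensor>, egr::kSlotSmallVectorSize>& grads,
    bool create_graph,
    bool is_new_grad) {
  VLOG(3) << "Running AD API GRAD: " << "tanh_grad";

  paddle::platform::RecordEvent grad_node_record_event_inner(
      "Local_TanhGradNode",
      paddle::platform::TracerEventType::OperatorInner,
      1);

  // Fill-zero handling, gradient hooks, and the backward kernel call follow.
  // ...
}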
19 changes: 15 additions & 4 deletions paddle/fluid/eager/backward.cc
@@ -253,10 +253,6 @@ std::vector<paddle::Tensor> RunBackward(
while (!queue.empty()) {
GradNodeBase* node = queue.front();
VLOG(3) << "Preparing GradNode:" << node->name() << " addr:" << node;
paddle::platform::RecordEvent node_record_event(
std::string((*node).name()),
paddle::platform::TracerEventType::Operator,
1);

if (queue.size() > 1 && node_in_degree_map[node] != 0) {
queue.pop_front();
@@ -280,6 +276,21 @@ std::vector<paddle::Tensor> RunBackward(
EnforceGradNodeHasInput(node);

VLOG(7) << "Run Backward Kernel with GradTensorHolder.";

// This 'Global_XXXGradNode' record event is different from the
// 'Local_XXXGradNode' event.
// * 'Global_XXXGradNode' covers not only the execution time of this
//   function, but also the gradient accumulation that happens when the
//   output(s) of the corresponding forward OP are shared by other OP(s),
//   so it may carry extra accumulation overhead compared with
//   'Local_XXXGradNode'.
// * 'Local_XXXGradNode' only covers the execution time of the GradNode
//   function.
paddle::platform::RecordEvent grad_node_record_event(
"Global_" + std::string((*node).name()),
paddle::platform::TracerEventType::Operator,
1);

// Run Pre Backward Node and get outputs
paddle::small_vector<std::vector<paddle::Tensor>, kSlotSmallVectorSize>
grad_output_tensors = (*node)(
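To make the placement change concrete, here is a simplified sketch of the loop in RunBackward (condensed, with hypothetical variable names such as grad_input_buffer rather than a verbatim copy): the Global_* event now starts only after the in-degree check, so nodes that are merely re-queued are not recorded, and the scope still covers the node call plus any gradient accumulation for shared forward outputs.

// Simplified sketch of the event placement in RunBackward (condensed).
while (!queue.empty()) {
  GradNodeBase* node = queue.front();

  if (queue.size() > 1 && node_in_degree_map[node] != 0) {
    queue.pop_front();
    continue;  // Node not ready yet; no event is recorded for this visit.
  }
  queue.pop_front();

  // Starts here, after the readiness check, and covers the rest of the loop
  // body: the grad node call and any gradient accumulation it triggers.
  paddle::platform::RecordEvent grad_node_record_event(
      "Global_" + std::string((*node).name()),
      paddle::platform::TracerEventType::Operator,
      1);

  auto grad_output_tensors =
      (*node)(grad_input_buffer, create_graph, is_new_grad);
  // ... enqueue downstream nodes and accumulate into their GradTensorHolder,
  //     still inside the Global_* scope ...
}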