diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc
index 84162355e2f88d..5d2912d4beb6ae 100644
--- a/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc
+++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc
@@ -21,6 +21,7 @@
 #include "paddle/fluid/eager/utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/imperative/tracer.h"
+#include "paddle/fluid/platform/profiler/event_tracing.h"
 #include "paddle/phi/api/all.h"
 #include "paddle/phi/api/lib/api_custom_impl.h"
 
@@ -34,6 +35,19 @@ AddNGradNodeFinal::operator()(
     bool is_new_grad) {
   // Fill Zero For GradIn Tensors
 
+  // This 'Local_XXXGradNode' record event differs from the
+  // 'Global_XXXGradNode' event.
+  // * 'Local_XXXGradNode' covers only the execution time of this function.
+  // * 'Global_XXXGradNode' covers not only the execution time of this
+  //   function, but also the gradient accumulation performed when the
+  //   output(s) of the corresponding forward OP are shared by other OP(s),
+  //   which may add extra accumulation overhead compared with
+  //   'Local_XXXGradNode'.
+  paddle::platform::RecordEvent node_execution_inner(
+      "Local_AddNGradNodeFinal",
+      paddle::platform::TracerEventType::OperatorInner,
+      1);
+
   // Apply Gradient Hooks
   auto hooked_grads = ApplyGradientHooks(grads);
 
diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc
index 437cce80c919b8..888d96b50fa3c7 100644
--- a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc
+++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc
@@ -38,6 +38,19 @@ Conv2dGradNodeFinal::operator()(
     bool is_new_grad) {
   // Fill Zero For GradIn Tensors
   VLOG(3) << " Running Conv2dGradNodeFinal: " << this;
+  // This 'Local_XXXGradNode' record event differs from the
+  // 'Global_XXXGradNode' event.
+  // * 'Local_XXXGradNode' covers only the execution time of this function.
+  // * 'Global_XXXGradNode' covers not only the execution time of this
+  //   function, but also the gradient accumulation performed when the
+  //   output(s) of the corresponding forward OP are shared by other OP(s),
+  //   which may add extra accumulation overhead compared with
+  //   'Local_XXXGradNode'.
+  paddle::platform::RecordEvent node_execution_inner(
+      "Local_Conv2dGradNodeFinal",
+      paddle::platform::TracerEventType::OperatorInner,
+      1);
+
   // Apply Gradient Hooks
   auto hooked_grads = ApplyGradientHooks(grads);
 
@@ -208,6 +221,19 @@ Conv2dDoubleGradNodeFinal::operator()(
         egr::kSlotSmallVectorSize>& grads,
     bool create_graph,
     bool is_new_grad) {
+  // This 'Local_XXXGradNode' record event differs from the
+  // 'Global_XXXGradNode' event.
+  // * 'Local_XXXGradNode' covers only the execution time of this function.
+  // * 'Global_XXXGradNode' covers not only the execution time of this
+  //   function, but also the gradient accumulation performed when the
+  //   output(s) of the corresponding forward OP are shared by other OP(s),
+  //   which may add extra accumulation overhead compared with
+  //   'Local_XXXGradNode'.
+  paddle::platform::RecordEvent node_execution_inner(
+      "Local_Conv2dDoubleGradNodeFinal",
+      paddle::platform::TracerEventType::OperatorInner,
+      1);
+
   // Fill Zero For GradIn Tensors
   const auto& input_metas = this->InputMeta();
   egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&grads[0][0],
diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/multiply_node.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/multiply_node.cc
index 1a098acf071dda..b1f25601d066b2 100644
--- a/paddle/fluid/eager/api/manual/eager_manual/nodes/multiply_node.cc
+++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/multiply_node.cc
@@ -41,6 +41,19 @@ MultiplyGradNode::operator()(
     bool is_new_grad) {
   VLOG(3) << "Running AD API GRAD: "
           << "multiply_grad";
+  // This 'Local_XXXGradNode' record event differs from the
+  // 'Global_XXXGradNode' event.
+  // * 'Local_XXXGradNode' covers only the execution time of this function.
+  // * 'Global_XXXGradNode' covers not only the execution time of this
+  //   function, but also the gradient accumulation performed when the
+  //   output(s) of the corresponding forward OP are shared by other OP(s),
+  //   which may add extra accumulation overhead compared with
+  //   'Local_XXXGradNode'.
+  paddle::platform::RecordEvent node_execution_inner(
+      "Local_MultiplyGradNode",
+      paddle::platform::TracerEventType::OperatorInner,
+      1);
+
   // Fill Zero For GradIn Tensors
   const auto& input_metas = this->InputMeta();
   egr::EagerUtils::FillZeroForEmptyGradInput(&grads[0][0], input_metas[0][0]);
@@ -245,6 +258,19 @@ MultiplyDoubleGradNode::operator()(
     bool is_new_grad) {
   VLOG(3) << "Running AD API GRAD: "
           << "multiply_double_grad";
+  // This 'Local_XXXGradNode' record event differs from the
+  // 'Global_XXXGradNode' event.
+  // * 'Local_XXXGradNode' covers only the execution time of this function.
+  // * 'Global_XXXGradNode' covers not only the execution time of this
+  //   function, but also the gradient accumulation performed when the
+  //   output(s) of the corresponding forward OP are shared by other OP(s),
+  //   which may add extra accumulation overhead compared with
+  //   'Local_XXXGradNode'.
+  paddle::platform::RecordEvent node_execution_inner(
+      "Local_MultiplyDoubleGradNode",
+      paddle::platform::TracerEventType::OperatorInner,
+      1);
+
   // Fill Zero For GradIn Tensors
   const auto& input_metas = this->InputMeta();
   egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&grads[0][0],
@@ -505,6 +531,19 @@ MultiplyGradNode::operator()(
     bool is_new_grad) {
   VLOG(3) << "Running AD API GRAD: "
           << "multiply_grad";
+  // This 'Local_XXXGradNode' record event differs from the
+  // 'Global_XXXGradNode' event.
+  // * 'Local_XXXGradNode' covers only the execution time of this function.
+  // * 'Global_XXXGradNode' covers not only the execution time of this
+  //   function, but also the gradient accumulation performed when the
+  //   output(s) of the corresponding forward OP are shared by other OP(s),
+  //   which may add extra accumulation overhead compared with
+  //   'Local_XXXGradNode'.
+  paddle::platform::RecordEvent node_execution_inner(
+      "Local_MultiplyGradNode",
+      paddle::platform::TracerEventType::OperatorInner,
+      1);
+
   // Fill Zero For GradIn Tensors
   const auto& input_metas = this->InputMeta();
   egr::EagerUtils::FillZeroForEmptyGradInput(&grads[0][0], input_metas[0][0]);
diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/reshard_node.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/reshard_node.cc
index 15fd00ed5bbaae..0049c67b4870e5 100644
--- a/paddle/fluid/eager/api/manual/eager_manual/nodes/reshard_node.cc
+++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/reshard_node.cc
@@ -18,6 +18,7 @@
 #include "paddle/fluid/eager/api/utils/global_utils.h"
 #include "paddle/fluid/eager/utils.h"
 #include "paddle/fluid/imperative/tracer.h"
+#include "paddle/fluid/platform/profiler/event_tracing.h"
 
 paddle::small_vector<std::vector<paddle::Tensor>,
                      egr::kSlotSmallVectorSize>  // NOLINT
@@ -29,6 +30,18 @@ ReshardGradNode::operator()(
 #ifdef PADDLE_WITH_DISTRIBUTE
   VLOG(3) << "Running AD API GRAD: "
           << "reshard_grad";
+  // This 'Local_XXXGradNode' record event differs from the
+  // 'Global_XXXGradNode' event.
+  // * 'Local_XXXGradNode' covers only the execution time of this function.
+  // * 'Global_XXXGradNode' covers not only the execution time of this
+  //   function, but also the gradient accumulation performed when the
+  //   output(s) of the corresponding forward OP are shared by other OP(s),
+  //   which may add extra accumulation overhead compared with
+  //   'Local_XXXGradNode'.
+  paddle::platform::RecordEvent node_execution_inner(
+      "Local_ReshardGradNode",
+      paddle::platform::TracerEventType::OperatorInner,
+      1);
 
   // Apply Gradient Hooks
   auto hooked_grad = ApplyGradientHooks(grads);
diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/sync_batch_norm_node.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/sync_batch_norm_node.cc
index 04bfac8ebd5c69..4e327d23e6da97 100644
--- a/paddle/fluid/eager/api/manual/eager_manual/nodes/sync_batch_norm_node.cc
+++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/sync_batch_norm_node.cc
@@ -21,6 +21,7 @@
 #include "paddle/fluid/eager/utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/imperative/tracer.h"
+#include "paddle/fluid/platform/profiler/event_tracing.h"
 #include "paddle/phi/api/all.h"
 #include "paddle/phi/api/backward/backward_api.h"
 #include "paddle/phi/api/backward/sparse_bw_api.h"
@@ -37,6 +38,19 @@ SyncBatchNormGradNode::operator()(
     bool is_new_grad) {
   VLOG(3) << "Running AD API GRAD: "
           << "sync_batch_norm_grad";
+  // This 'Local_XXXGradNode' record event differs from the
+  // 'Global_XXXGradNode' event.
+  // * 'Local_XXXGradNode' covers only the execution time of this function.
+  // * 'Global_XXXGradNode' covers not only the execution time of this
+  //   function, but also the gradient accumulation performed when the
+  //   output(s) of the corresponding forward OP are shared by other OP(s),
+  //   which may add extra accumulation overhead compared with
+  //   'Local_XXXGradNode'.
+  paddle::platform::RecordEvent node_execution_inner(
+      "Local_SyncBatchNormGradNode",
+      paddle::platform::TracerEventType::OperatorInner,
+      1);
+
   // Fill Zero For GradIn Tensors
 
   // Apply Gradient Hooks
@@ -256,6 +270,19 @@ SyncBatchNormGradNode::operator()(
     bool is_new_grad) {
   VLOG(3) << "Running AD API GRAD: "
           << "sync_batch_norm_grad";
+  // This 'Local_XXXGradNode' record event differs from the
+  // 'Global_XXXGradNode' event.
+  // * 'Local_XXXGradNode' covers only the execution time of this function.
+  // * 'Global_XXXGradNode' covers not only the execution time of this
+  //   function, but also the gradient accumulation performed when the
+  //   output(s) of the corresponding forward OP are shared by other OP(s),
+  //   which may add extra accumulation overhead compared with
+  //   'Local_XXXGradNode'.
+  paddle::platform::RecordEvent node_execution_inner(
+      "Local_SyncBatchNormGradNode",
+      paddle::platform::TracerEventType::OperatorInner,
+      1);
+
   // Fill Zero For GradIn Tensors
 
   // Apply Gradient Hooks
diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
index d894ef47788254..c6cebbd7812517 100644
--- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
@@ -209,6 +209,12 @@ class {} : public egr::GradNodeBase {{
 
 paddle::small_vector<std::vector<paddle::Tensor>, egr::kSlotSmallVectorSize> {}::operator()(paddle::small_vector<std::vector<paddle::Tensor>, egr::kSlotSmallVectorSize>& grads, bool create_graph, bool is_new_grad) {{
   VLOG(3) << \"Running AD API GRAD: \" << \"{}\";
+  // This 'Local_XXXGradNode' record event differs from the 'Global_XXXGradNode' event.
+  // * 'Local_XXXGradNode' covers only the execution time of this function.
+  // * 'Global_XXXGradNode' covers not only the execution time of this function, but also the gradient
+  //   accumulation performed when the output(s) of the corresponding forward OP are shared by other OP(s), which may add extra accumulation overhead compared with 'Local_XXXGradNode'.
+  paddle::platform::RecordEvent grad_node_record_event_inner(\"Local_{}\", paddle::platform::TracerEventType::OperatorInner, 1);
+
   // Fill Zero For GradIn Tensors
 {}
   // Apply Gradient Hooks
@@ -2787,6 +2793,7 @@ def _gen_api_call_code_block(
         self.node_definition_str = GRAD_FUNCTION_TEMPLATE.format(
             grad_node_name,
             self.backward_api_name,
+            grad_node_name,
             fill_zero_str,
             get_grad_in_args_str,
             grad_function_prepare_str,
diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc
index 33d945d29a4a32..434ea8a888f77a 100644
--- a/paddle/fluid/eager/backward.cc
+++ b/paddle/fluid/eager/backward.cc
@@ -253,10 +253,6 @@ std::vector<paddle::Tensor> RunBackward(
   while (!queue.empty()) {
     GradNodeBase* node = queue.front();
     VLOG(3) << "Preparing GradNode:" << node->name() << " addr:" << node;
-    paddle::platform::RecordEvent node_record_event(
-        std::string((*node).name()),
-        paddle::platform::TracerEventType::Operator,
-        1);
 
     if (queue.size() > 1 && node_in_degree_map[node] != 0) {
       queue.pop_front();
@@ -280,6 +276,21 @@ std::vector<paddle::Tensor> RunBackward(
     EnforceGradNodeHasInput(node);
 
     VLOG(7) << "Run Backward Kernel with GradTensorHolder.";
+
+    // This 'Global_XXXGradNode' record event differs from the
+    // 'Local_XXXGradNode' event.
+    // * 'Global_XXXGradNode' covers not only the execution time of the
+    //   GradNode function, but also the gradient accumulation performed when
+    //   the output(s) of the corresponding forward OP are shared by other
+    //   OP(s), which may add extra accumulation overhead compared with
+    //   'Local_XXXGradNode'.
+    // * 'Local_XXXGradNode' covers only the execution time of the GradNode
+    //   function.
+    paddle::platform::RecordEvent grad_node_record_event(
+        "Global_" + std::string((*node).name()),
+        paddle::platform::TracerEventType::Operator,
+        1);
+
     // Run Pre Backward Node and get outputs
     paddle::small_vector<std::vector<paddle::Tensor>, kSlotSmallVectorSize>
         grad_output_tensors = (*node)(
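
Note on the two event scopes (illustrative, not part of the patch): the 'Global_XXXGradNode' RecordEvent opened in RunBackward encloses the node's operator() plus any gradient accumulation triggered by shared forward outputs, while the 'Local_XXXGradNode' RecordEvent opened inside operator() covers only the node body, so the Global interval is always at least as long as the Local one. The standalone C++ sketch below mimics that nesting with a hypothetical ScopedEvent RAII timer and placeholder functions (run_grad_node, accumulate_shared_grad); it is not Paddle code and only approximates the semantics of paddle::platform::RecordEvent.

// Minimal sketch of the Local/Global event nesting added by this patch.
// ScopedEvent, run_grad_node, and accumulate_shared_grad are illustrative
// stand-ins, not Paddle APIs.
#include <chrono>
#include <iostream>
#include <string>
#include <thread>

struct ScopedEvent {  // RAII timer playing the role of RecordEvent
  explicit ScopedEvent(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedEvent() {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - start_)
                  .count();
    std::cout << name_ << ": " << us << " us\n";
  }
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

void run_grad_node() {  // stands in for XXXGradNode::operator()
  ScopedEvent local("Local_XXXGradNode");  // covers only the node body
  std::this_thread::sleep_for(std::chrono::milliseconds(2));
}

void accumulate_shared_grad() {  // extra work when forward outputs are shared
  std::this_thread::sleep_for(std::chrono::milliseconds(1));
}

int main() {
  // The backward engine opens the "Global_" scope around the whole step,
  // so its duration is >= the "Local_" duration recorded inside the node.
  ScopedEvent global("Global_XXXGradNode");
  run_grad_node();
  accumulate_shared_grad();
  return 0;
}

Running the sketch prints the Local duration first and a longer Global duration, which is the gap one would expect to see between the two event types on a profiler timeline whenever accumulation for shared outputs occurs.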