PaddlePaddle · MingMingShangTian · Nov 22, 2021 · Nov 1, 2021 · Nov 1, 2021 · Nov 2, 2021
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
@@ -1183,6 +1183,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
       }
       BuildPtenKernelContext(*runtime_ctx, dev_ctx);
       (*pt_kernel_)(pt_kernel_context_.get());
+
+      WriteBackToOutputs(runtime_ctx);
+
       pt_kernel_context_->ClearData();
     } else {
       (*kernel_func_)(
@@ -1808,50 +1811,78 @@ void OperatorWithKernel::BuildPtenKernelContext(
   for (size_t i = 0; i < input_names.size(); ++i) {
     auto& in_def = input_defs.at(i);
     auto& ins_vector = ctx.inputs.at(input_names[i]);
-    if (pt_kernel_context_->InputsSize() <= i) {
+
+    size_t start_idx =
+        (i == 0 ? 0 : pt_kernel_context_->InputRangeAt(i - 1).second);
+    size_t end_idx = start_idx + ins_vector.size();
+
+    if (pt_kernel_context_->InputsSize() == start_idx) {
       paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
       for (auto* var : ins_vector) {
         tmp_inputs.emplace_back(
             experimental::MakePtenTensorBaseFromVar(*var, in_def));
       }
       pt_kernel_context_->EmplaceBackInputs(std::move(tmp_inputs));
-    } else {
+    } else if (pt_kernel_context_->InputsSize() > start_idx) {
       size_t input_size = pt_kernel_context_->InputsSize();
       for (size_t j = 0; j < ins_vector.size(); ++j) {
-        if (input_size > i + j) {
+        if (input_size > start_idx + j) {
           experimental::ReMakePtenDenseTensorFromVar(
               *ins_vector[j], in_def,
-              pt_kernel_context_->MutableInputAt<pten::DenseTensor>(i + j));
+              pt_kernel_context_->MutableInputAt<pten::DenseTensor>(start_idx +
+                                                                    j));
+        } else {
+          pt_kernel_context_->EmplaceBackInputWithoutSetRange(
+              experimental::MakePtenTensorBaseFromVar(*ins_vector[j], in_def));
         }
-        // TODO(chenweihang): adapt multi-input case later
       }
       pt_kernel_context_->MutableInputRangeAt(i) =
-          std::make_pair(i, i + ins_vector.size());
+          std::make_pair(start_idx, end_idx);
+    } else {
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
+          "error start index when trying to set new tensor to inputs, start "
+          "index is `%d`, but current pt_kernel_context_.inputs.size() is "
+          "`%d` ",
+          start_idx, pt_kernel_context_->InputsSize()));
     }
   }
 
   for (size_t i = 0; i < output_names.size(); ++i) {
     auto& out_def = output_defs.at(i);
     auto& outs_vector = ctx.outputs.at(output_names[i]);
-    if (pt_kernel_context_->OutputsSize() <= i) {
+
+    size_t start_idx =
+        (i == 0 ? 0 : pt_kernel_context_->OutputRangeAt(i - 1).second);
+    size_t end_idx = start_idx + outs_vector.size();
+
+    if (pt_kernel_context_->OutputsSize() == start_idx) {
       paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
       for (auto* var : outs_vector) {
         tmp_outputs.emplace_back(
             experimental::MakePtenTensorBaseFromVar(var, out_def));
       }
       pt_kernel_context_->EmplaceBackOutputs(std::move(tmp_outputs));
-    } else {
+    } else if (pt_kernel_context_->OutputsSize() > start_idx) {
       size_t output_size = pt_kernel_context_->OutputsSize();
       for (size_t j = 0; j < outs_vector.size(); ++j) {
-        if (output_size > i + j) {
+        if (output_size > start_idx + j) {
           experimental::ReMakePtenDenseTensorFromVar(
               outs_vector[j], out_def,
-              pt_kernel_context_->MutableOutputAt<pten::DenseTensor>(i + j));
+              pt_kernel_context_->MutableOutputAt<pten::DenseTensor>(start_idx +
+                                                                     j));
+        } else {
+          pt_kernel_context_->EmplaceBackOutputWithoutSetRange(
+              experimental::MakePtenTensorBaseFromVar(outs_vector[j], out_def));
         }
-        // TODO(chenweihang): adapt multi-output case later
       }
       pt_kernel_context_->MutableOutputRangeAt(i) =
-          std::make_pair(i, i + outs_vector.size());
+          std::make_pair(start_idx, end_idx);
+    } else {
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
+          "error start index when trying to set new tensor to inputs, start "
+          "index is `%d`, but current pt_kernel_context_.outputs.size() is "
+          "`%d` ",
+          start_idx, pt_kernel_context_->OutputsSize()));
     }
   }
 
@@ -1882,6 +1913,16 @@ void OperatorWithKernel::BuildPtenKernelContext(
         pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(float, attr));
       } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) {
         pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(bool, attr));
+      } else if (attr_defs[i].type_index ==
+                 std::type_index(typeid(pten::DataType))) {
+        auto data_type = pten::TransToPtenDataType(
+            static_cast<framework::proto::VarType::Type>(
+                BOOST_GET_CONST(int, attr)));
+        pt_kernel_context_->EmplaceBackAttr(data_type);
+      } else if (attr_defs[i].type_index ==
+                 std::type_index(typeid(std::vector<int>))) {
+        pt_kernel_context_->EmplaceBackAttr(
+            BOOST_GET_CONST(std::vector<int>, attr));
       } else if (attr_defs[i].type_index ==
                      std::type_index(typeid(std::vector<int64_t>)) &&
                  std::type_index(attr.type()) ==
@@ -1901,5 +1942,26 @@ void OperatorWithKernel::BuildPtenKernelContext(
   }
 }
 
+void OperatorWithKernel::WriteBackToOutputs(RuntimeContext* ctx) const {
+  // auto& input_names = std::get<0>(pt_kernel_signature_->args);
+  // auto& attr_names = std::get<1>(pt_kernel_signature_->args);
+  auto& output_names = std::get<2>(pt_kernel_signature_->args);
+
+  // pt_kernel_context_
+
+  for (size_t i = 0; i < output_names.size(); ++i) {
+    auto& outs_vector = ctx->outputs.at(output_names[i]);
+
+    auto& range_pair = pt_kernel_context_->OutputRangeAt(i);
+    auto pten_outs =
+        pt_kernel_context_->MutableOutputBetween<pten::DenseTensor>(
+            range_pair.first, range_pair.second);
+
+    for (size_t j = 0; j < pten_outs.size(); ++j) {
+      experimental::MakeVariableFromPtenTensor(pten_outs[j], outs_vector[j]);
+    }
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
@@ -589,6 +589,8 @@ class OperatorWithKernel : public OperatorBase {
   void BuildPtenKernelContext(const RuntimeContext& ctx,
                               platform::DeviceContext* dev_ctx) const;
 
+  void WriteBackToOutputs(RuntimeContext* ctx) const;
+
  protected:
   mutable std::unique_ptr<OpKernelType> kernel_type_;
   mutable std::unique_ptr<OpKernelFunc> kernel_func_;

diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
@@ -209,5 +209,7 @@ void Tensor::ResetHolderWithType(std::shared_ptr<memory::Allocation> holder,
   type_ = type;
 }
 
+void Tensor::setType(const proto::VarType::Type type) { type_ = type; }
+
 }  // namespace framework
 }  // namespace paddle
@@ -273,6 +273,8 @@ class Tensor {
   void ResetHolderWithType(std::shared_ptr<memory::Allocation> holder,
                            const proto::VarType::Type type);
 
+  void setType(const proto::VarType::Type type);
+
   TensorInplaceVersion& InplaceVersionCounter() {
     return *inplace_version_counter_;
   }

diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
@@ -295,52 +295,76 @@ static void BuildDygraphPtenKernelContext(
   for (size_t i = 0; i < input_names.size(); ++i) {
     auto& in_def = input_defs.at(i);
     auto& ins_vector = ins.at(input_names[i]);
-    if (kernel_ctx->InputsSize() <= i) {
+
+    size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second);
+    size_t end_idx = start_idx + ins_vector.size();
+
+    if (kernel_ctx->InputsSize() == start_idx) {
       paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
       for (const auto& var : ins_vector) {
         const auto& variable = var->Var();
         tmp_inputs.emplace_back(
             experimental::MakePtenTensorBaseFromVar(variable, in_def));
       }
       kernel_ctx->EmplaceBackInputs(std::move(tmp_inputs));
-    } else {
+    } else if (kernel_ctx->InputsSize() > start_idx) {
       size_t input_size = kernel_ctx->InputsSize();
       for (size_t j = 0; j < ins_vector.size(); ++j) {
-        if (input_size > i + j) {
+        if (input_size > start_idx + j) {
           experimental::ReMakePtenDenseTensorFromVar(
               ins_vector[j]->Var(), in_def,
-              kernel_ctx->MutableInputAt<pten::DenseTensor>(i + j));
+              kernel_ctx->MutableInputAt<pten::DenseTensor>(start_idx + j));
+        } else {
+          kernel_ctx->EmplaceBackInputWithoutSetRange(
+              experimental::MakePtenTensorBaseFromVar(ins_vector[j]->Var(),
+                                                      in_def));
         }
-        // TODO(chenweihang): adapt multi-input case later
       }
-      kernel_ctx->MutableInputRangeAt(i) =
-          std::make_pair(i, i + ins_vector.size());
+      kernel_ctx->MutableInputRangeAt(i) = std::make_pair(start_idx, end_idx);
+    } else {
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
+          "error start index when trying to set new tensor to inputs, start "
+          "index is `%d`, but current pt_kernel_context_.inputs.size() is "
+          "`%d` ",
+          start_idx, kernel_ctx->InputsSize()));
     }
   }
 
   for (size_t i = 0; i < output_names.size(); ++i) {
     auto& out_def = output_defs.at(i);
     auto& outs_vector = outs.at(output_names[i]);
-    if (kernel_ctx->OutputsSize() <= i) {
+
+    size_t start_idx = (i == 0 ? 0 : kernel_ctx->OutputRangeAt(i - 1).second);
+    size_t end_idx = start_idx + outs_vector.size();
+
+    if (kernel_ctx->OutputsSize() == start_idx) {
       paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
       for (auto& var : outs_vector) {
         auto* variable = var->MutableVar();
         tmp_outputs.emplace_back(
             experimental::MakePtenTensorBaseFromVar(variable, out_def));
       }
       kernel_ctx->EmplaceBackOutputs(std::move(tmp_outputs));
-    } else {
+    } else if (kernel_ctx->OutputsSize() > start_idx) {
       size_t output_size = kernel_ctx->OutputsSize();
       for (size_t j = 0; j < outs_vector.size(); ++j) {
         if (output_size > i + j) {
           experimental::ReMakePtenDenseTensorFromVar(
               outs_vector[j]->MutableVar(), out_def,
               kernel_ctx->MutableOutputAt<pten::DenseTensor>(i + j));
+        } else {
+          kernel_ctx->EmplaceBackOutputWithoutSetRange(
+              experimental::MakePtenTensorBaseFromVar(
+                  outs_vector[j]->MutableVar(), out_def));
         }
-        // TODO(chenweihang): adapt multi-output case later
       }
-      kernel_ctx->MutableOutputRangeAt(i) =
-          std::make_pair(i, i + outs_vector.size());
+      kernel_ctx->MutableOutputRangeAt(i) = std::make_pair(start_idx, end_idx);
+    } else {
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
+          "error start index when trying to set new tensor to inputs, start "
+          "index is `%d`, but current pt_kernel_context_.outputs.size() is "
+          "`%d` ",
+          start_idx, kernel_ctx->OutputsSize()));
     }
   }
 
@@ -371,6 +395,15 @@ static void BuildDygraphPtenKernelContext(
         kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(float, attr));
       } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) {
         kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(bool, attr));
+      } else if (attr_defs[i].type_index ==
+                 std::type_index(typeid(pten::DataType))) {
+        auto data_type = pten::TransToPtenDataType(
+            static_cast<framework::proto::VarType::Type>(
+                BOOST_GET_CONST(int, attr)));
+        kernel_ctx->EmplaceBackAttr(data_type);
+      } else if (attr_defs[i].type_index ==
+                 std::type_index(typeid(std::vector<int>))) {
+        kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::vector<int>, attr));
       } else if (attr_defs[i].type_index ==
                      std::type_index(typeid(std::vector<int64_t>)) &&
                  std::type_index(attr.type()) ==
@@ -390,6 +423,26 @@ static void BuildDygraphPtenKernelContext(
   }
 }
 
+template <typename VarType>
+static void WriteBackToOutputs(
+    const framework::KernelSignature& pt_kernel_signature,
+    const NameVarMap<VarType>& outs, pten::KernelContext* kernel_ctx) {
+  auto& output_names = std::get<2>(pt_kernel_signature.args);
+
+  for (size_t i = 0; i < output_names.size(); ++i) {
+    auto& outs_vector = outs.at(output_names[i]);
+
+    auto& range_pair = kernel_ctx->OutputRangeAt(i);
+    auto pten_outs = kernel_ctx->MutableOutputBetween<pten::DenseTensor>(
+        range_pair.first, range_pair.second);
+
+    for (size_t j = 0; j < pten_outs.size(); ++j) {
+      experimental::MakeVariableFromPtenTensor(pten_outs[j],
+                                               outs_vector[j]->MutableVar());
+    }
+  }
+}
+
 template <typename VarType>
 static void PreparedOpRunImpl(
     const framework::OperatorBase& op, const framework::RuntimeContext& ctx,
@@ -463,6 +516,20 @@ static void PreparedOpRunPtImpl(
 
   pt_kernel(pt_kernel_context);
 
+  if (FLAGS_benchmark) {
+    dev_ctx->Wait();
+#if defined(PADDLE_WITH_CUDA)
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError());
+    VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error";
+#endif
+#if defined(PADDLE_WITH_HIP)
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError());
+    VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error";
+#endif
+  }
+
+  WriteBackToOutputs<VarType>(pt_kernel_signature, outs, pt_kernel_context);
+
   // Ensure that it does not affect the VarBase life cycle management
   pt_kernel_context->ClearData();
 

diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h
@@ -18,6 +18,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/transform.h"
 
+#include "paddle/pten/api/lib/utils/tensor_utils.h"
+#include "paddle/pten/include/core.h"
+#include "paddle/pten/include/manipulation.h"
+
 namespace paddle {
 namespace operators {
 
@@ -53,11 +57,26 @@ class CastOpKernel : public framework::OpKernel<InT> {
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<framework::Tensor>("X");
     auto* out = context.Output<framework::Tensor>("Out");
-    framework::VisitDataType(
-        static_cast<framework::proto::VarType::Type>(
-            context.Attr<int>("out_dtype")),
-        CastOpFunctor<DeviceContext, InT>(
-            in, out, context.template device_context<DeviceContext>()));
+
+    auto out_dtype = context.Attr<int>("out_dtype");
+    // todo: not used in_dtype
+    auto in_dtype = context.Attr<int>("in_dtype");
+
+    auto& dev_ctx = context.device_context<DeviceContext>();
+    out->mutable_data(dev_ctx.GetPlace(),
+                      static_cast<framework::proto::VarType::Type>(out_dtype));
+
+    auto pt_x = paddle::experimental::MakePtenDenseTensor(*in);
+    auto pt_out = paddle::experimental::MakePtenDenseTensor(*out);
+
+    auto pt_out_dtype = pten::TransToPtenDataType(
+        static_cast<framework::proto::VarType::Type>(out_dtype));
+    auto pt_in_dtype = pten::TransToPtenDataType(
+        static_cast<framework::proto::VarType::Type>(in_dtype));
+
+    // call new kernel
+    pten::Cast<InT>(dev_ctx, *pt_x.get(), pt_out_dtype, pt_in_dtype,
+                    pt_out.get());
   }
 };
 

diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
@@ -552,14 +552,13 @@ class Reshape2Op : public ReshapeOp {
       const framework::ExecutionContext &ctx) const override {
     auto multi_inputs = ctx.MultiInput<framework::Tensor>("ShapeTensor");
     if (multi_inputs.size() > 0) {
-      return framework::KernelSignature(
-          "reshape2.mulhost.mid", {"X", "ShapeTensor"}, {}, {"XShape", "Out"});
+      return framework::KernelSignature("reshape2.mulhost",
+                                        {"X", "ShapeTensor"}, {}, {"Out"});
     } else if (ctx.HasInput("Shape")) {
-      return framework::KernelSignature("reshape2.host.mid", {"X", "Shape"}, {},
-                                        {"XShape", "Out"});
+      return framework::KernelSignature("reshape2.host", {"X", "Shape"}, {},
+                                        {"Out"});
     } else {
-      return framework::KernelSignature("reshape2.mid", {"X"}, {"shape"},
-                                        {"XShape", "Out"});
+      return framework::KernelSignature("reshape2", {"X"}, {"shape"}, {"Out"});
     }
   }
 };

diff --git a/paddle/pten/api/include/manipulation.h b/paddle/pten/api/include/manipulation.h
@@ -21,6 +21,8 @@ namespace experimental {
 
 PD_DLL_DECL Tensor flatten(const Tensor& x, int start_axis, int stop_axis);
 
+Tensor cast(const Tensor& x, DataType out_dtype);
+
 PD_DLL_DECL Tensor reshape(const Tensor& x, const std::vector<int64_t>& shape);
 }  // namespace experimental
 }  // namespace paddle