diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/fleet.h
index 697dbb9170f184..be7fe8ea23fac1 100644
--- a/paddle/fluid/distributed/fleet.h
+++ b/paddle/fluid/distributed/fleet.h
@@ -36,7 +36,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-class Tensor;
 class Scope;
 class SelectedRows;
 class Variable;
diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc
index 7e820a38581af6..deb3d26527727e 100644
--- a/paddle/fluid/distributed/fleet_executor/dist_model.cc
+++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc
@@ -145,8 +145,8 @@ bool DistModel::LoadParameters() {
   return true;
 }
 
-void DistModel::Run(const std::vector<framework::Tensor> &input_data,
-                    std::vector<framework::Tensor> *output_data) {
+void DistModel::Run(const std::vector<paddle::framework::Tensor> &input_data,
+                    std::vector<paddle::framework::Tensor> *output_data) {
   /* TODO(fleet exe dev): implement this funct */
 }
diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.h b/paddle/fluid/distributed/fleet_executor/dist_model.h
index 57bfd88147746b..182c5a508098ed 100644
--- a/paddle/fluid/distributed/fleet_executor/dist_model.h
+++ b/paddle/fluid/distributed/fleet_executor/dist_model.h
@@ -18,6 +18,7 @@
 #include <vector>
 
 #include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/place.h"
 
@@ -25,7 +26,6 @@
 namespace paddle {
 namespace framework {
 class ProgramDesc;
 class Scope;
-class Tensor;
 }
 
 namespace distributed {
@@ -45,8 +45,8 @@ class DistModel {
  public:
   explicit DistModel(const DistModelConfig& config) : config_(config) {}
   bool Init();
-  void Run(const std::vector<framework::Tensor>& input_data,
-           std::vector<framework::Tensor>* output_data);
+  void Run(const std::vector<paddle::framework::Tensor>& input_data,
+           std::vector<paddle::framework::Tensor>* output_data);
   ~DistModel() = default;
 
  private:
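Note on the recurring forward-declaration change above and throughout this diff: once `paddle::framework::Tensor` becomes a type alias for `pten::DenseTensor` (see the tensor.h hunk later in this PR), it can no longer be forward-declared with `class Tensor;`, so headers that only name the type forward-declare the underlying pten class instead. A minimal sketch of the pattern (the function below is a hypothetical example, not from this PR):

```cpp
// Before (ill-formed once the alias lands): re-declaring the alias as a class.
// namespace paddle {
// namespace framework {
// class Tensor;
// }  // namespace framework
// }  // namespace paddle

// After: forward-declare the real class in its home namespace.
namespace pten {
class DenseTensor;
}  // namespace pten

// Pointers and references compile against the forward declaration; anything
// needing the complete type includes "paddle/fluid/framework/tensor.h".
void Inspect(const pten::DenseTensor* t);
```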
diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc
index 4d9f84fdc6e0f3..147758abfd5553 100644
--- a/paddle/fluid/distributed/service/brpc_utils.cc
+++ b/paddle/fluid/distributed/service/brpc_utils.cc
@@ -20,10 +20,13 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 class Variable;
-class Tensor;
 }  // namespace framework
 }  // namespace paddle
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
index c0c1fda4c4fca1..f83c7bdb15fa1c 100644
--- a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
+++ b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
@@ -31,11 +31,14 @@ class PSClient;
 class PSServer;
 }  // namespace distributed
 namespace framework {
-class Tensor;
 class Variable;
 }  // namespace framework
 }  // namespace paddle
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace operators = paddle::operators;
diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
index 471750feaefef7..f9c2b55eb4fee2 100644
--- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
+++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
@@ -32,11 +32,14 @@ class PSClient;
 class PSServer;
 }  // namespace distributed
 namespace framework {
-class Tensor;
 class Variable;
 }  // namespace framework
 }  // namespace paddle
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace operators = paddle::operators;
diff --git a/paddle/fluid/eager/legacy/prepared_operator.h b/paddle/fluid/eager/legacy/prepared_operator.h
index 0e00b52e0481aa..87720fd8f005b8 100644
--- a/paddle/fluid/eager/legacy/prepared_operator.h
+++ b/paddle/fluid/eager/legacy/prepared_operator.h
@@ -29,7 +29,6 @@ DECLARE_bool(use_mkldnn);
 namespace paddle {
 namespace framework {
-class Tensor;
 class Variable;
 }  // namespace framework
 namespace platform {
@@ -37,6 +36,10 @@ class DeviceContext;
 }  // namespace platform
 }  // namespace paddle
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace egr {
 namespace legacy {
diff --git a/paddle/fluid/framework/copy_same_tensor_test.cc b/paddle/fluid/framework/copy_same_tensor_test.cc
index 14bef7fe023f63..496991fd6862d1 100644
--- a/paddle/fluid/framework/copy_same_tensor_test.cc
+++ b/paddle/fluid/framework/copy_same_tensor_test.cc
@@ -68,7 +68,7 @@ static bool CopySameTensorTestMain(const DDim &dims,
   if (sync_copy) {
     TensorCopySync(src_tensor, dst_place, &src_tensor);
   } else {
-    TensorCopy(src_tensor, dst_place, &src_tensor);
+    paddle::framework::TensorCopy(src_tensor, dst_place, &src_tensor);
     platform::DeviceContextPool::Instance().Get(src_place)->Wait();
     platform::DeviceContextPool::Instance().Get(dst_place)->Wait();
   }
diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc
index d06f5a0227af74..1a4f283f511da4 100644
--- a/paddle/fluid/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
@@ -28,8 +28,9 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
   // NOTE(zhiqiu): Special case for CPU->NPU, avoid stream sync.
   if (platform::is_cpu_place(in.place()) &&
       platform::is_npu_place(dst_place)) {
-    TensorCopy(in, dst_place,
-               *platform::DeviceContextPool::Instance().Get(dst_place), out);
+    paddle::framework::TensorCopy(
+        in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place),
+        out);
     return;
   }
diff --git a/paddle/fluid/framework/data_device_transform.h b/paddle/fluid/framework/data_device_transform.h
index 60b52a5e7069fb..8ff97646cfce79 100644
--- a/paddle/fluid/framework/data_device_transform.h
+++ b/paddle/fluid/framework/data_device_transform.h
@@ -21,8 +21,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-class Tensor;
-
 void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
                      Tensor* out);
diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h
index 2533acaa6d35ac..313ee9cd68a0b7 100644
--- a/paddle/fluid/framework/data_feed.h
+++ b/paddle/fluid/framework/data_feed.h
@@ -50,12 +50,15 @@ DECLARE_bool(enable_slotrecord_reset_shrink);
 namespace paddle {
 namespace framework {
 class DataFeedDesc;
-class Tensor;
 class Scope;
 class Variable;
 }  // namespace framework
 }  // namespace paddle
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h
index f7b4a36d2f4001..182ffe65c3c7ec 100644
--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -25,7 +25,6 @@
 namespace paddle {
 namespace framework {
 class OpKernelType;
-class Tensor;
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h
index 2bbdac52ee49fd..f8b36b48c308ea 100644
--- a/paddle/fluid/framework/data_transform.h
+++ b/paddle/fluid/framework/data_transform.h
@@ -31,7 +31,6 @@
 namespace paddle {
 namespace framework {
 class OpKernelType;
-class Tensor;
 class Variable;
 
 void TransformData(const OpKernelType &expected_kernel_type,
diff --git a/paddle/fluid/framework/data_type_transform.h b/paddle/fluid/framework/data_type_transform.h
index 678764430f0ffa..76cea64dc47550 100644
--- a/paddle/fluid/framework/data_type_transform.h
+++ b/paddle/fluid/framework/data_type_transform.h
@@ -25,7 +25,6 @@
 namespace paddle {
 namespace framework {
 class OpKernelType;
-class Tensor;
 
 using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
index b8fac755709e76..052860cd0ab404 100644
--- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
@@ -169,7 +169,7 @@ FetchResultType AsyncSSAGraphExecutor::Run(
       std::vector<const LoDTensor *> lodtensor_ptrs;
       lodtensor_ptrs.push_back(&(BOOST_GET(LoDTensor, val.at(fetch_idx))));
       LoDTensor var;
-      var.MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+      MergeLoDTensor(&var, lodtensor_ptrs, platform::CPUPlace());
       ret.emplace_back(var);
     } else {
       auto array = BOOST_GET(LoDTensorArray, val.at(fetch_idx));
@@ -179,7 +179,8 @@ FetchResultType AsyncSSAGraphExecutor::Run(
         std::vector<const LoDTensor *> lodtensor_ptrs;
         lodtensor_ptrs.push_back(&array[i]);
         item_array.emplace_back();
-        item_array.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+        MergeLoDTensor(&(item_array.back()), lodtensor_ptrs,
+                       platform::CPUPlace());
       }
       ret.emplace_back(item_array);
     }
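The `paddle::framework::` qualifications added to `TensorCopy`/`TensorCopySync` call sites above (and throughout the rest of this diff) are needed because the unqualified calls used to be found by argument-dependent lookup: the argument type `framework::Tensor` lived in the same namespace as the helpers. With `Tensor` aliased to `pten::DenseTensor`, ADL now searches `pten` instead, so the calls must be qualified explicitly. A reduced, self-contained sketch of the mechanism (stand-in types, not Paddle code):

```cpp
#include <iostream>

namespace pten {
class DenseTensor {};  // stand-in for the real class
}  // namespace pten

namespace paddle {
namespace framework {
// An alias does not change the type's home namespace for ADL purposes.
using Tensor = pten::DenseTensor;
inline void TensorCopy(const Tensor&) { std::cout << "copied\n"; }
}  // namespace framework
}  // namespace paddle

int main() {
  paddle::framework::Tensor t;
  // TensorCopy(t);                  // error: ADL searches pten, not framework
  paddle::framework::TensorCopy(t);  // OK: explicitly qualified
  return 0;
}
```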
diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.h b/paddle/fluid/framework/details/fetch_async_op_handle.h
index 41df0d90aaf817..3e9563ab1eda47 100644
--- a/paddle/fluid/framework/details/fetch_async_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_async_op_handle.h
@@ -22,14 +22,18 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/device_context.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
-class Tensor;
 namespace ir {
 class Node;
 }  // namespace ir
 }  // namespace framework
+
 namespace platform {
 class DeviceContext;
 }  // namespace platform
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 0a116cd9d8abba..60e58fafa41983 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -81,7 +81,7 @@ void FetchOpHandle::WaitAndMergeCPUFetchVars() const {
     }
     auto &val = BOOST_GET(FetchList, *data_);
     LoDTensor var;
-    var.MergeLoDTensor(tensors_ptr, platform::CPUPlace());
+    MergeLoDTensor(&var, tensors_ptr, platform::CPUPlace());
     val.at(offset_) = std::move(var);
   } else {
     auto &array = BOOST_GET_CONST(LoDTensorArray, tensors_[0]);
@@ -99,7 +99,7 @@ void FetchOpHandle::WaitAndMergeCPUFetchVars() const {
         tensors_ptr.push_back(&element[i]);
       }
       tmp_array.emplace_back();
-      tmp_array.back().MergeLoDTensor(tensors_ptr, platform::CPUPlace());
+      MergeLoDTensor(&(tmp_array.back()), tensors_ptr, platform::CPUPlace());
     }
     auto &val = BOOST_GET(FetchList, *data_);
     val.at(offset_) = std::move(tmp_array);
diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc
index 4d31069dd06eeb..74f5deed45557c 100644
--- a/paddle/fluid/framework/details/gather_op_handle.cc
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@@ -16,11 +16,10 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
 
-namespace paddle {
-namespace framework {
-class Tensor;
-}  // namespace framework
-}  // namespace paddle
+
+namespace pten {
+class DenseTensor;
+}  // namespace pten
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h
index 10b7ab0bc9c534..fa2cbb550339a3 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.h
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h
@@ -19,11 +19,9 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/platform/place.h"
 
-namespace paddle {
-namespace framework {
-class Tensor;
-}  // namespace framework
-}  // namespace paddle
+namespace pten {
+class DenseTensor;
+}  // namespace pten
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index 51063f68d4cbd6..936e84a6c82b9a 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -275,7 +275,7 @@ FetchResultType ParallelSSAGraphExecutor::Run(
       }
       if (lodtensor_ptrs.size() != 0) {
         LoDTensor var;
-        var.MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace());
+        MergeLoDTensor(&var, lodtensor_ptrs, platform::CPUPlace());
         ret.emplace_back(var);
       } else {
         LoDTensorArray var_array(lodtensorarray_ptrs[0]->size());
@@ -285,7 +285,7 @@ FetchResultType ParallelSSAGraphExecutor::Run(
         for (size_t j = 0; j < lodtensorarray_ptrs.size(); ++j) {
           ptrs.push_back(&(lodtensorarray_ptrs[j]->at(i)));
         }
-        var.MergeLoDTensor(ptrs, platform::CPUPlace());
+        MergeLoDTensor(&var, ptrs, platform::CPUPlace());
         var_array[i] = std::move(var);
       }
       ret.emplace_back(var_array);
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index a2f7cc6fcecbf7..3d877dbbde248c 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -18,11 +18,9 @@
 #include "paddle/fluid/platform/profiler.h"
 
-namespace paddle {
-namespace framework {
-class Tensor;
-}  // namespace framework
-}  // namespace paddle
+namespace pten {
+class DenseTensor;
+}  // namespace pten
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
index 1225e2ee025b2e..434ba325ae3acf 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
+++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
@@ -22,7 +22,6 @@
 namespace paddle {
 namespace framework {
 class Scope;
-class Tensor;
 class Variable;
 
 namespace ir {
@@ -31,6 +30,10 @@ class MemOptVarInfo;
 }  // namespace framework
 }  // namespace paddle
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
 namespace details {
diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc
index 82078555013845..be1371542f5306 100644
--- a/paddle/fluid/framework/details/variable_visitor.cc
+++ b/paddle/fluid/framework/details/variable_visitor.cc
@@ -16,9 +16,12 @@
 #include "paddle/fluid/framework/selected_rows.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
-class Tensor;
 class Variable;
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/variable_visitor.h b/paddle/fluid/framework/details/variable_visitor.h
index a882d5120bc668..a689c47a1611f4 100644
--- a/paddle/fluid/framework/details/variable_visitor.h
+++ b/paddle/fluid/framework/details/variable_visitor.h
@@ -19,7 +19,6 @@
 namespace paddle {
 namespace framework {
-class Tensor;
 class Variable;
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc
index 3b70ef737f5bef..e191979c505223 100644
--- a/paddle/fluid/framework/device_worker.cc
+++ b/paddle/fluid/framework/device_worker.cc
@@ -14,10 +14,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/device_worker.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
 
-class Tensor;
 class Scope;
 
 void DeviceWorker::SetRootScope(Scope* root_scope) { root_scope_ = root_scope; }
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 332a5840491274..edb87a378dd4c3 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -43,7 +43,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-class Tensor;
 class ProgramDesc;
 class Scope;
 }  // namespace framework
diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h
index 03ed8884925ce4..ff4cf23da6e965 100644
--- a/paddle/fluid/framework/dlpack_tensor.h
+++ b/paddle/fluid/framework/dlpack_tensor.h
@@ -21,8 +21,6 @@
 namespace paddle {
 namespace framework {
 
-class Tensor;
-
 class DLPackTensor {
  public:
   using LaneType = decltype(::DLTensor::dtype.lanes);  // uint16_t
diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
index cc97af4b1969d2..83d5a2efa342e5 100644
--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -15,9 +15,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/device_worker.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
-class Tensor;
 class Variable;
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
index 0c3aafd85f2835..6454874c028b85 100644
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -19,10 +19,13 @@ limitations under the License. */
 #include <string>
 #include "glog/logging.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
 
-class Tensor;
 class Variable;
 
 void SetFeedVariable(Scope* scope, const LoDTensor& input,
diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h
index dc9310ff5b2632..89c4bff922bbdc 100644
--- a/paddle/fluid/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
@@ -20,10 +20,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/string_array.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
 
-class Tensor;
 class Scope;
 
 void SetFeedVariable(Scope* scope, const LoDTensor& input,
diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
index c883412a9a4c32..30a1de15cb0528 100644
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
@@ -18,9 +18,12 @@
 #include "paddle/fluid/framework/op_version_registry.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
-class Tensor;
 class Scope;
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
index 6443d0594a9c5a..194686825ff2d8 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -19,9 +19,12 @@
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
-class Tensor;
 class Scope;
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc
index c0a4f099e39d42..ae9c873e14113f 100644
--- a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc
+++ b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc
@@ -15,11 +15,9 @@
 #include "paddle/fluid/framework/ir/delete_dropout_op_pass.h"
 
-namespace paddle {
-namespace framework {
-class Tensor;
-}  // namespace framework
-}  // namespace paddle
+namespace pten {
+class DenseTensor;
+}  // namespace pten
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc
index af75646551e285..aecbd8619a67d9 100644
--- a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc
+++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc
@@ -16,11 +16,9 @@
 #include
 
-namespace paddle {
-namespace framework {
-class Tensor;
-}  // namespace framework
-}  // namespace paddle
+namespace pten {
+class DenseTensor;
+}  // namespace pten
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
index 09fd6b8dd11167..74937313d130fe 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
@@ -22,11 +22,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_code.h"
 #include "paddle/fluid/platform/float16.h"
 
-namespace paddle {
-namespace framework {
-class Tensor;
-}  // namespace framework
-}  // namespace paddle
+namespace pten {
+class DenseTensor;
+}  // namespace pten
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
@@ -206,9 +204,11 @@ void TestMainImpl(std::string func_name, std::string code_str,
       for (int64_t i = 0; i < cpu_tensors[id].numel(); ++i) {
         tmp_cpu_ptr[i] = paddle::platform::float16(cpu_ptr[i]);
       }
-      TensorCopySync(tmp_cpu_tensors[id], place, &gpu_tensors[id]);
+      paddle::framework::TensorCopySync(tmp_cpu_tensors[id], place,
+                                        &gpu_tensors[id]);
     } else {
-      TensorCopySync(cpu_tensors[id], place, &gpu_tensors[id]);
+      paddle::framework::TensorCopySync(cpu_tensors[id], place,
+                                        &gpu_tensors[id]);
     }
     args.push_back(&gpu_ptrs[id]);
   }
@@ -234,8 +234,8 @@ void TestMainImpl(std::string func_name, std::string code_str,
       paddle::platform::float16* tmp_cpu_ptr =
          tmp_cpu_tensors[id].mutable_data<paddle::platform::float16>(
              cpu_tensors[id].dims(), paddle::platform::CPUPlace());
-      TensorCopySync(gpu_tensors[id], paddle::platform::CPUPlace(),
-                     &tmp_cpu_tensors[id]);
+      paddle::framework::TensorCopySync(
+          gpu_tensors[id], paddle::platform::CPUPlace(), &tmp_cpu_tensors[id]);
 
       float* cpu_ptr = cpu_tensors[id].mutable_data<float>(
           cpu_tensors[id].dims(), paddle::platform::CPUPlace());
@@ -243,8 +243,8 @@ void TestMainImpl(std::string func_name, std::string code_str,
        cpu_ptr[i] = static_cast<float>(tmp_cpu_ptr[i]);
       }
     } else {
-      TensorCopySync(gpu_tensors[id], paddle::platform::CPUPlace(),
-                     &cpu_tensors[id]);
+      paddle::framework::TensorCopySync(
+          gpu_tensors[id], paddle::platform::CPUPlace(), &cpu_tensors[id]);
     }
   }
 }
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index 4681933a66cd34..48ba7cc0a2a8ac 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -319,14 +319,47 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
   TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
 }
 
-std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
-    const std::vector<platform::Place> places) const {
+LoD ConvertToLengthBasedLoD(const LoD &offset_lod) {
+  LoD length_lod;
+  length_lod.reserve(offset_lod.size());
+  for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) {
+    std::vector<size_t> level;
+    if (offset_lod[lvl].size() > 0) {
+      level.reserve(offset_lod[lvl].size() - 1);
+    }
+    for (size_t idx = 0; idx < offset_lod[lvl].size() - 1; ++idx) {
+      level.push_back(offset_lod[lvl][idx + 1] - offset_lod[lvl][idx]);
+    }
+    length_lod.push_back(level);
+  }
+  return length_lod;
+}
+
+LoD ConvertToOffsetBasedLoD(const LoD &length_lod) {
+  LoD offset_lod;
+  offset_lod.reserve(length_lod.size());
+  for (size_t lvl = 0; lvl < length_lod.size(); ++lvl) {
+    std::vector<size_t> level;
+    level.reserve(length_lod[lvl].size() + 1);
+    size_t tmp = 0;
+    level.push_back(tmp);
+    for (size_t idx = 0; idx < length_lod[lvl].size(); ++idx) {
+      tmp += length_lod[lvl][idx];
+      level.push_back(tmp);
+    }
+    offset_lod.push_back(level);
+  }
+  return offset_lod;
+}
+
+std::vector<LoDTensor> SplitLoDTensor(
+    const LoDTensor &src, const std::vector<platform::Place> places) {
   PADDLE_ENFORCE_GT(places.size(), 0,
                     platform::errors::InvalidArgument(
                         "Place number cannot be empty when splitting."));
-  check_memory_size();
-  size_t batch_size =
-      lod().empty() ? static_cast<size_t>(dims()[0]) : lod()[0].size() - 1;
+  src.check_memory_size();
+  size_t batch_size = src.lod().empty() ? static_cast<size_t>(src.dims()[0])
+                                        : src.lod()[0].size() - 1;
 
   // if batch_size is 0, just return #places.size() copys of empty
   // tensors.
@@ -335,10 +368,10 @@ std::vector<LoDTensor> SplitLoDTensor(
     empty_results.reserve(places.size());
     for (size_t i = 0; i < places.size(); ++i) {
       LoDTensor dst;
-      dst.Resize(dims());
-      dst.mutable_data(places[i], type());
-      if (!lod().empty()) {
-        dst.set_lod(lod());
+      dst.Resize(src.dims());
+      dst.mutable_data(places[i], src.type());
+      if (!src.lod().empty()) {
+        dst.set_lod(src.lod());
       }
       empty_results.emplace_back(std::move(dst));
     }
@@ -360,17 +393,18 @@ std::vector<LoDTensor> SplitLoDTensor(
                           begin, end));
 
     LoDTensor dst;
-    if (lod().empty()) {
-      auto src = Slice(begin, end);
+    if (src.lod().empty()) {
+      auto sliced_src = src.Slice(begin, end);
       auto &dst_place = places[i];
-      framework::TensorCopy(src, dst_place, &dst);
+      framework::TensorCopy(sliced_src, dst_place, &dst);
     } else {
-      auto lod_and_offset = GetSubLoDAndAbsoluteOffset(lod(), begin, end, 0);
+      auto lod_and_offset =
+          GetSubLoDAndAbsoluteOffset(src.lod(), begin, end, 0);
 
       auto &offset = lod_and_offset.second;
-      auto src = Slice(offset.first, offset.second);
+      auto sliced_src = src.Slice(offset.first, offset.second);
       auto &dst_place = places[i];
-      framework::TensorCopy(src, dst_place, &dst);
+      framework::TensorCopy(sliced_src, dst_place, &dst);
 
       LoD my_lod;
       for (auto &l : lod_and_offset.first) {
@@ -388,9 +422,9 @@ std::vector<LoDTensor> SplitLoDTensor(
   return results;
 }
 
-void LoDTensor::MergeLoDTensor(
-    const std::vector<const LoDTensor *> &lod_tensors,
-    platform::Place dst_place) {
+void MergeLoDTensor(LoDTensor *target,
+                    const std::vector<const LoDTensor *> &lod_tensors,
+                    platform::Place dst_place) {
   PADDLE_ENFORCE_EQ(lod_tensors.empty(), false,
                     platform::errors::InvalidArgument(
                         "The LoDTensors to be merged are empty."));
@@ -449,10 +483,10 @@ void MergeLoDTensor(
       }
     }
   }
-  Resize(new_dim);
-  set_layout(new_layout);
-  set_lod(new_lod);
-  mutable_data(dst_place, new_type);
+  target->Resize(new_dim);
+  target->set_layout(new_layout);
+  target->set_lod(new_lod);
+  target->mutable_data(dst_place, new_type);
 
   int begin = 0;
   for (auto *src : lod_tensors) {
@@ -460,44 +494,11 @@ void MergeLoDTensor(
     if (end == begin) {
       continue;
     }
-    auto dst = Slice(begin, end);
+    auto dst = target->Slice(begin, end);
     framework::TensorCopy(*src, dst_place, &dst);
     begin = end;
   }
 }
 
-LoD ConvertToLengthBasedLoD(const LoD &offset_lod) {
-  LoD length_lod;
-  length_lod.reserve(offset_lod.size());
-  for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) {
-    std::vector<size_t> level;
-    if (offset_lod[lvl].size() > 0) {
-      level.reserve(offset_lod[lvl].size() - 1);
-    }
-    for (size_t idx = 0; idx < offset_lod[lvl].size() - 1; ++idx) {
-      level.push_back(offset_lod[lvl][idx + 1] - offset_lod[lvl][idx]);
-    }
-    length_lod.push_back(level);
-  }
-  return length_lod;
-}
-
-LoD ConvertToOffsetBasedLoD(const LoD &length_lod) {
-  LoD offset_lod;
-  offset_lod.reserve(length_lod.size());
-  for (size_t lvl = 0; lvl < length_lod.size(); ++lvl) {
-    std::vector<size_t> level;
-    level.reserve(length_lod[lvl].size() + 1);
-    size_t tmp = 0;
-    level.push_back(tmp);
-    for (size_t idx = 0; idx < length_lod[lvl].size(); ++idx) {
-      tmp += length_lod[lvl][idx];
-      level.push_back(tmp);
-    }
-    offset_lod.push_back(level);
-  }
-  return offset_lod;
-}
-
 }  // namespace framework
 }  // namespace paddle
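`SplitLoDTensor` and `MergeLoDTensor` are now free functions in `paddle::framework` that take the tensor explicitly, matching the declarations added to lod_tensor.h just below. A sketch of the migrated call pattern (the round-trip helper is hypothetical; the signatures come from this diff):

```cpp
#include <vector>

#include "paddle/fluid/framework/lod_tensor.h"

namespace fw = paddle::framework;

// Split `src` across `places`, then merge the pieces back together.
void RoundTrip(const fw::LoDTensor& src,
               const std::vector<paddle::platform::Place>& places) {
  // Previously: src.SplitLoDTensor(places)
  std::vector<fw::LoDTensor> pieces = fw::SplitLoDTensor(src, places);

  std::vector<const fw::LoDTensor*> piece_ptrs;
  for (auto& piece : pieces) piece_ptrs.push_back(&piece);

  fw::LoDTensor merged;
  // Previously: merged.MergeLoDTensor(piece_ptrs, platform::CPUPlace())
  fw::MergeLoDTensor(&merged, piece_ptrs, paddle::platform::CPUPlace());
}
```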
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
index bbb8f8005168ca..41cd6b83fd1d58 100644
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -36,7 +36,15 @@ class DeviceContext;
 namespace paddle {
 namespace framework {
 
-using LoDTensor = paddle::framework::Tensor;
+using LoDTensor = pten::DenseTensor;
+
+// Split Tensor and copy to each place specified in places.
+std::vector<LoDTensor> SplitLoDTensor(
+    const LoDTensor& src, const std::vector<platform::Place> places);
+
+void MergeLoDTensor(LoDTensor* target,
+                    const std::vector<const LoDTensor*>& lod_tensors,
+                    platform::Place dst_place);
 
 /*
  * LoD is short for Level of Details.
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index e3223e67fc94df..917bb7cc096c26 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -147,7 +147,7 @@ TEST(LoD, SplitLoDTensor) {
   lod1.push_back(std::vector<size_t>({0, 1, 2}));
   lod1.push_back(std::vector<size_t>({0, 2, 7}));
 
-  auto lods = lod_tensor.SplitLoDTensor(places);
+  auto lods = SplitLoDTensor(lod_tensor, places);
   EXPECT_EQ(lods[0].lod(), lod0);
   EXPECT_EQ(lods[1].lod(), lod1);
 }
@@ -167,7 +167,7 @@ TEST(LoD, SplitLoDTensorWithZeroBatchSize) {
   LoD lod_res;
   lod_res.push_back(std::vector<size_t>({0}));
 
-  auto lods = lod_tensor.SplitLoDTensor(places);
+  auto lods = SplitLoDTensor(lod_tensor, places);
   EXPECT_EQ(lods[0].lod(), lod_res);
   EXPECT_EQ(lods[1].lod(), lod_res);
 }
@@ -213,7 +213,7 @@ TEST(LoD, MergeLoDTensor) {
   std::vector<const LoDTensor*> lods{&lod_tensor0, &lod_tensor1, &lod_tensor2};
 
   LoDTensor lod_tensor;
-  lod_tensor.MergeLoDTensor(lods, place);
+  MergeLoDTensor(&lod_tensor, lods, place);
   EXPECT_EQ(lod_tensor.lod(), lod);
 }
diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
index f706eabb47988a..a74917e7e69c8f 100644
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@@ -24,6 +24,10 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
 
@@ -31,7 +35,6 @@ namespace framework {
  * Simple, intuitive and effective. Only single thread is supported, and
  * currently designed for inference.
  */
-class Tensor;
 class ProgramDesc;
 class Scope;
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index e3f0fbbdfdc4a5..ea45ef857dfa0b 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -32,11 +32,10 @@ limitations under the License. */
 #include "paddle/pten/common/scalar.h"
 #include "paddle/pten/common/scalar_array.h"
 
-namespace paddle {
-namespace framework {
-class Tensor;
-}  // namespace framework
-}  // namespace paddle
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/device/xpu/xpu_info.h"
 #include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index d6c1c4cb6acc0f..8767b55062cdae 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -1048,7 +1048,7 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
       VLOG(3) << "Split " << (is_persistable ? "persistable" : "no persistable")
              << " data (" << pair.first << "), dim:" << pair.second.dims()
              << ", place: " << pair.second.place();
-      auto lod_tensors = pair.second.SplitLoDTensor(member_->places_);
+      auto lod_tensors = SplitLoDTensor(pair.second, member_->places_);
       bool is_cpu_place = platform::is_cpu_place(member_->places_.front());
       if (!is_persistable && num_places != lod_tensors.size() &&
           !allow_partial_feed) {
"persistable" : "no persistable") << " data (" << pair.first << "), dim:" << pair.second.dims() << ", place: " << pair.second.place(); - auto lod_tensors = pair.second.SplitLoDTensor(member_->places_); + auto lod_tensors = SplitLoDTensor(pair.second, member_->places_); bool is_cpu_place = platform::is_cpu_place(member_->places_.front()); if (!is_persistable && num_places != lod_tensors.size() && !allow_partial_feed) { diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index 62d6ba09735478..acd742a5822e2a 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -14,10 +14,13 @@ limitations under the License. */ #include #include "paddle/fluid/framework/device_worker.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { -class Tensor; class Scope; class Variable; diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 3634ccca95126e..445f446ef2f4ae 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -30,8 +30,6 @@ limitations under the License. */ namespace paddle { namespace framework { -class Tensor; - class SelectedRows { /* * @brief We can use the SelectedRows structure to reproduce a sparse table. diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 6aa10a058081b8..bb8d7df7457501 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -18,105 +18,13 @@ limitations under the License. */ DECLARE_bool(use_stream_safe_cuda_allocator); namespace paddle { -namespace framework { - -Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const { - check_memory_size(); - PADDLE_ENFORCE_GE(begin_idx, 0, - paddle::platform::errors::OutOfRange( - "The start row index must be greater than 0." - "But received the start index is d%.", - begin_idx)); - PADDLE_ENFORCE_LE(end_idx, meta_.dims[0], - paddle::platform::errors::OutOfRange( - "The end row index is out of bound.")); - PADDLE_ENFORCE_LT( - begin_idx, end_idx, - paddle::platform::errors::InvalidArgument( - "The start row index must be less than the end row index." - "But received the start index = %d, the end index = %d.", - begin_idx, end_idx)); - - if (meta_.dims[0] == 1) { - return *this; - } else { - size_t base = numel() / meta_.dims[0]; - Tensor dst; - dst.storage_ = pten::make_intrusive( - storage_->data_shared()); - dst.meta_.layout = meta_.layout; - dst.meta_.dtype = meta_.dtype; - DDim dst_dims = meta_.dims; - dst_dims[0] = end_idx - begin_idx; - dst.Resize(dst_dims); - dst.meta_.offset = meta_.offset + begin_idx * base * SizeOf(dtype()); - return dst; - } -} - -std::vector Tensor::Split(int64_t split_size, int64_t axis) const { - check_memory_size(); - - PADDLE_ENFORCE_GE(meta_.dims.size(), 0, - paddle::platform::errors::OutOfRange( - "split expects at least a 1-dimensional tensor")); - - PADDLE_ENFORCE_GE( - split_size, 0, - paddle::platform::errors::OutOfRange( - "split expects split_size be non-negative, but got split_size is %d", - split_size)); - - int64_t numel_size = meta_.dims[axis]; - - int64_t num_splits = 1; - if (split_size != 0) { - num_splits = - std::max((numel_size + split_size - 1) / split_size, 1); - } - - std::vector splits(num_splits); - int64_t last_split_size = split_size - (split_size * num_splits - numel_size); - - for (int64_t i = 0; i < num_splits; ++i) { - int64_t length = i < num_splits - 1 ? 
-    splits[i] = Slice(i * split_size, i * split_size + length);
-  }
-  return splits;
-}
-
-std::vector<Tensor> Tensor::Chunk(int64_t chunks, int64_t axis) const {
-  check_memory_size();
-  PADDLE_ENFORCE_GE(meta_.dims.size(), 0,
-                    paddle::platform::errors::OutOfRange(
-                        "split expects at least a 1-dimensional tensor"));
-  PADDLE_ENFORCE_GE(
-      chunks, 0,
-      paddle::platform::errors::OutOfRange(
-          "chunks expects to be greater than 0, but got chunks is %d", chunks));
-
-  int64_t numel_size = meta_.dims[axis];
-  int64_t split_size = (numel_size + chunks - 1) / chunks;
-  return Split(split_size, axis);
-}
-
-Tensor& Tensor::ShareDataWith(const Tensor& src) {
-  src.check_memory_size();
-  // Preserve LoD
-  auto lod = meta_.lod;
-  *this = src;
-  meta_.lod = lod;
-  return *this;
-}
-Tensor& Tensor::ShareInplaceVersionCounterWith(const Tensor& src) {
-  PADDLE_ENFORCE_NOT_NULL(
-      inplace_version_counter_,
-      platform::errors::PreconditionNotMet(
-          "Tensor does not hold inplace_version_counter_."));
-
-  inplace_version_counter_ = src.inplace_version_counter_;
-  return *this;
-}
+namespace memory {
+namespace allocation {
+class Allocation;
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
 
-}  // namespace framework
+namespace paddle {
+namespace framework {}  // namespace framework
 }  // namespace paddle
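The method bodies deleted from tensor.cc above are not lost: post-change code still calls `Slice` on `LoDTensor` (see the `src.Slice(...)` and `target->Slice(...)` calls kept in lod_tensor.cc), so `Slice` must now be implemented on `pten::DenseTensor`, and the tensor.h hunk below reduces `Tensor` to an alias of it. A sketch of what keeps compiling unchanged, assuming `Split`/`Chunk` were re-homed to `pten::DenseTensor` the same way:

```cpp
#include "paddle/fluid/framework/tensor.h"

void SliceDemo(const paddle::framework::Tensor& t) {
  // Tensor is now an alias of pten::DenseTensor, so these member calls
  // resolve to the pten implementations instead of the removed
  // framework::Tensor ones.
  auto first_row = t.Slice(0, 1);        // same signature as before
  auto halves = t.Chunk(2, /*axis=*/0);  // assumed to mirror the old API
  (void)first_row;
  (void)halves;
}
```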
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 95405820a48d98..8c7345f3e2f614 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -69,35 +69,7 @@ using LoD = std::vector<std::vector<size_t>>;
    Variable object but not a pointer.
 */
 
-class Tensor : public pten::DenseTensor {
- public:
-  using DenseTensor = pten::DenseTensor;
-  using DenseTensor::DenseTensor;
-
-  // Split Tensor and copy to each place specified in places.
-  std::vector<Tensor> SplitLoDTensor(
-      const std::vector<platform::Place> places) const;
-
-  void MergeLoDTensor(const std::vector<const Tensor*>& lod_tensors,
-                      platform::Place place);
-
-  /*! The internal of two tensors share the same memory block. */
-  Tensor& ShareDataWith(const Tensor& src);
-
-  /*! The internal of two tensors share the same inplace version counter. */
-  Tensor& ShareInplaceVersionCounterWith(const Tensor& src);
-
-  Tensor Slice(int64_t begin_idx, int64_t end_idx) const;
-
-  std::vector<Tensor> Split(int64_t split_size, int64_t axis) const;
-
-  std::vector<Tensor> Chunk(int64_t chunks, int64_t axis) const;
-
-  Tensor& Resize(const DDim& dims) {
-    meta_.dims = dims;
-    return *this;
-  }
-};
+using Tensor = pten::DenseTensor;
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 4298b159ead52f..dff48790960569 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -387,18 +387,10 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 Tensor* dst) {
   TensorCopyImpl<Tensor>(src, dst_place, dst);
 }
-void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place,
-                pten::DenseTensor* dst) {
-  TensorCopyImpl<pten::DenseTensor>(src, dst_place, dst);
-}
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 const platform::DeviceContext& ctx, Tensor* dst) {
   TensorCopyImpl<Tensor>(src, dst_place, ctx, dst);
 }
-void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place,
-                const platform::DeviceContext& ctx, pten::DenseTensor* dst) {
-  TensorCopyImpl<pten::DenseTensor>(src, dst_place, ctx, dst);
-}
 
 void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
                     Tensor* dst) {
@@ -1394,45 +1386,50 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) {
   return os;
 }
 
-std::ostream& operator<<(std::ostream& os, const Tensor& t) {
+}  // namespace framework
+}  // namespace paddle
+
+namespace pten {
+
+std::ostream& operator<<(std::ostream& os, const pten::DenseTensor& t) {
   if (t.lod().size() > 0) {
     os << "  - lod: " << t.lod() << "\n";
   }
 
   os << "  - place: " << t.place() << "\n";
   os << "  - shape: [" << t.dims() << "]\n";
-  os << "  - layout: " << DataLayoutToString(t.layout()) << "\n";
+  os << "  - layout: " << paddle::framework::DataLayoutToString(t.layout())
+     << "\n";
 
 #ifdef PADDLE_WITH_MKLDNN
   os << "  - format: "
     << dnnl_fmt_tag2str(static_cast<dnnl_format_tag_t>(t.format())) << "\n";
 #endif
 
-  Tensor tensor;
+  DenseTensor tensor;
   tensor.Resize(t.dims());
-  if (platform::is_cpu_place(t.place())) {
+  if (paddle::platform::is_cpu_place(t.place())) {
     tensor.ShareDataWith(t);
   } else {
-    platform::CPUPlace place;
-    framework::TensorCopy(t, place, &tensor);
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    paddle::platform::CPUPlace place;
+    paddle::framework::TensorCopy(t, place, &tensor);
+    paddle::platform::DeviceContextPool& pool =
+        paddle::platform::DeviceContextPool::Instance();
     auto& dev_ctx = *pool.Get(t.place());
     dev_ctx.Wait();
   }
 
-#define PrintTensorCallback(cpp_type, proto_type) \
-  do {                                            \
-    if (tensor.type() == proto_type) {            \
-      os << "  - dtype: " << proto_type << "\n";  \
-      print_tensor<cpp_type>(os, tensor);         \
-      return os;                                  \
-    }                                             \
+#define PrintTensorCallback(cpp_type, proto_type)            \
+  do {                                                       \
+    if (tensor.type() == proto_type) {                       \
+      os << "  - dtype: " << proto_type << "\n";             \
+      paddle::framework::print_tensor<cpp_type>(os, tensor); \
+      return os;                                             \
+    }                                                        \
   } while (0)
 
   _ForEachDataType_(PrintTensorCallback);
   VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
   return os;
 }
-
-}  // namespace framework
-}  // namespace paddle
+}
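tensor_util.cc above moves `operator<<` for tensors out of `paddle::framework` and into namespace `pten`, again so that argument-dependent lookup can find it for the aliased type; tensor_util.h below updates the declarations to match. Streaming a tensor therefore keeps working unchanged at call sites, e.g.:

```cpp
#include <iostream>

#include "paddle/fluid/framework/tensor_util.h"

void Dump(const paddle::framework::Tensor& t) {
  // operator<< is now declared in namespace pten, so argument-dependent
  // lookup finds it from the argument type pten::DenseTensor.
  std::cout << t << std::endl;
}
```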
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index 3cb3c733f4042b..3c62f3c5e43d7e 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -39,9 +39,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-std::ostream& operator<<(std::ostream& os, const LoD& lod);
-std::ostream& operator<<(std::ostream& os, const Tensor& t);
-
 class PrintOptions {
  public:
   static PrintOptions& Instance() {
@@ -76,12 +73,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
 // If ctx_place and src_place are the same, src_ctx.Wait() is added
 // after memory::Copy; if ctx_place and dst_place are the same,
 // src_ctx.Wait() is added before memory::Copy.
-class Tensor;
-
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 const platform::DeviceContext& ctx, Tensor* dst);
-void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place,
-                const platform::DeviceContext& ctx, pten::DenseTensor* dst);
 
 // NOTE(zcd): If the src.place() and dst_place are two different GPU,
 // the copy operation is carried out on the dst_place's stream. This is
@@ -92,8 +85,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
 // not completed.
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 Tensor* dst);
-void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place,
-                pten::DenseTensor* dst);
 
 void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
                     Tensor* dst);
@@ -469,5 +460,11 @@ inline void TensorToVector(const Tensor& src, std::vector<bool>* dst) {
   delete[] array;
 }
 
+std::ostream& operator<<(std::ostream& os, const LoD& lod);
+
 }  // namespace framework
 }  // namespace paddle
+
+namespace pten {
+std::ostream& operator<<(std::ostream& os, const DenseTensor& t);
+}
diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h
index 8bba9492a56868..91d618970e30c8 100644
--- a/paddle/fluid/framework/trainer.h
+++ b/paddle/fluid/framework/trainer.h
@@ -40,7 +40,6 @@ namespace paddle {
 namespace framework {
 
 class Dataset;
-class Tensor;
 class ProgramDesc;
 class PullDenseWorker;
 class Scope;
diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h
index 715e7a14c5529d..008b6829f9fe37 100644
--- a/paddle/fluid/framework/var_type_traits.h
+++ b/paddle/fluid/framework/var_type_traits.h
@@ -47,6 +47,10 @@
 #include "xpu/bkcl.h"
 #endif
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 // Users should add forward declarations here
 namespace paddle {
 
@@ -70,7 +74,6 @@ class BKCLCommunicator;
 namespace framework {
 class LoDRankTable;
 class ScopeBase;
-class Tensor;
 class ReaderHolder;
 class Scope;
 class SelectedRows;
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index 22f016e2cadc1a..09cc480fe17326 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -29,9 +29,12 @@
 DECLARE_bool(use_mkldnn);
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
-class Tensor;
 class Variable;
 }  // namespace framework
 namespace platform {
diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc
index 0c9bedf3dca322..ad518eb96062d2 100644
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -16,6 +16,7 @@
 
 #include
 
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/string/string_helper.h"
 
@@ -24,6 +25,7 @@
 
 #include "paddle/fluid/imperative/parallel_context.h"
 
+#include "paddle/pten/core/dense_tensor.h"
 namespace paddle {
 namespace imperative {
 
@@ -975,7 +977,8 @@ void Reducer::ProcessUnusedDenseVars() {
       auto *dest_grad_tensor =
           grad_var_base_tmp->MutableVar()->GetMutable<framework::LoDTensor>();
       const auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_);
-      TensorCopy(src_tensor, place_, *dev_ctx, dest_grad_tensor);
+      paddle::framework::TensorCopy(src_tensor, place_, *dev_ctx,
+                                    dest_grad_tensor);
       dest_grad_tensor->Resize(dest_dims);
     }
   }
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 8bb08b6fdaf2aa..06a353d5622a70 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -90,12 +90,12 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
         temp_tensor.mutable_data<float>(cpu_place);
 
         // Copy the parameter data to a tmp tensor.
-        TensorCopySync(*t, cpu_place, &temp_tensor);
+        paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
         // Reallocation the space on GPU
         t->clear();
 
         // Copy parameter data to newly allocated GPU space.
-        TensorCopySync(temp_tensor, place, t);
+        paddle::framework::TensorCopySync(temp_tensor, place, t);
       }
     }
   }
diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h
index bf67cfed35f892..f0ce652beae11a 100644
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -35,7 +35,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-class Tensor;
 class Scope;
 }  // namespace framework
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h
index 857160ad102828..fa5997d92dd231 100644
--- a/paddle/fluid/inference/api/details/reset_tensor_array.h
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.h
@@ -21,9 +21,12 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 
+namespace pten {
+class DenseTensor;
+}  // namespace pten
+
 namespace paddle {
 namespace framework {
-class Tensor;
 class Scope;
 class SelectedRows;
 }  // namespace framework
diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc
index a8ed703da95c65..b0c7c7448a50ef 100644
--- a/paddle/fluid/inference/lite/test_tensor_utils.cc
+++ b/paddle/fluid/inference/lite/test_tensor_utils.cc
@@ -122,7 +122,7 @@ void test_tensor_copy(const platform::DeviceContext& ctx) {
   }
 #endif
   std::vector<float> result;
-  TensorToVector(lod_tensor_n, ctx, &result);
+  paddle::framework::TensorToVector(lod_tensor_n, ctx, &result);
   ASSERT_EQ(result, vector);
   ASSERT_EQ(lod_tensor_n.lod(), lod_tensor.lod());
 }
@@ -142,7 +142,7 @@ void test_tensor_share(const platform::DeviceContext& ctx) {
   framework::LoDTensor lod_tensor_n;
   TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
   std::vector<float> result;
-  TensorToVector(lod_tensor_n, ctx, &result);
+  paddle::framework::TensorToVector(lod_tensor_n, ctx, &result);
   ASSERT_EQ(result, vector);
   ASSERT_EQ(lod_tensor_n.lod(), lod_tensor.lod());
 }
diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
index 0e661651914741..08e15b22b84cdf 100644
--- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
@@ -82,10 +82,11 @@ class BatchNormOpConverter : public OpConverter {
     platform::CPUPlace cpu_place;
     // copy data from gpu to cpu
-    TensorCopySync((*Bias_t), cpu_place, &bias_tensor);
-    TensorCopySync((*Mean_t), cpu_place, &mean_tensor);
-    TensorCopySync((*Scale_t), cpu_place, &scale_tensor);
-    TensorCopySync((*Variance_t), cpu_place, &variance_tensor);
+    paddle::framework::TensorCopySync((*Bias_t), cpu_place, &bias_tensor);
+    paddle::framework::TensorCopySync((*Mean_t), cpu_place, &mean_tensor);
+    paddle::framework::TensorCopySync((*Scale_t), cpu_place, &scale_tensor);
+    paddle::framework::TensorCopySync((*Variance_t), cpu_place,
+                                      &variance_tensor);
 
     auto* bias_data = bias_tensor.mutable_data<float>(platform::CPUPlace());
     auto* mean_data = mean_tensor.mutable_data<float>(platform::CPUPlace());
diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
index de5d3110e18903..67e7c78b62e9d2 100644
--- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
@@ -55,8 +55,8 @@ class LayerNormOpConverter : public OpConverter {
   scale_tensor->Resize(Scale_t->dims());
 
   platform::CPUPlace cpu_place;
-  TensorCopySync((*Bias_t), cpu_place, &(*bias_tensor));
-  TensorCopySync((*Scale_t), cpu_place, &(*scale_tensor));
+  paddle::framework::TensorCopySync((*Bias_t), cpu_place, &(*bias_tensor));
+  paddle::framework::TensorCopySync((*Scale_t), cpu_place, &(*scale_tensor));
 
   auto* bias_data = bias_tensor->mutable_data<float>(platform::CPUPlace());
   auto* scale_data = scale_tensor->mutable_data<float>(platform::CPUPlace());
diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
index a883d2b5bbb49f..9e81d1177cfe10 100644
--- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
@@ -46,7 +46,8 @@ class PReluOpConverter : public OpConverter {
     std::unique_ptr<framework::LoDTensor> alpha_tensor_temp(
         new framework::LoDTensor());
     alpha_tensor_temp->Resize(alpha_tensor->dims());
-    TensorCopySync(*alpha_tensor, cpu_place, alpha_tensor_temp.get());
+    paddle::framework::TensorCopySync(*alpha_tensor, cpu_place,
+                                      alpha_tensor_temp.get());
     float* alpha_data = alpha_tensor_temp->mutable_data<float>(cpu_place);
 
     nvinfer1::ILayer* layer = nullptr;
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index cfb25eb2ba8276..1e503b83bbd67b 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -63,7 +63,7 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
     *(temp_data + i) = random(0., 1.);
   }
 
-  TensorCopySync(temp_tensor, place, tensor);
+  paddle::framework::TensorCopySync(temp_tensor, place, tensor);
 }
 
 /*
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 2a35f497ed07f1..aa69463674f742 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -370,7 +370,8 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name,
                         name_with_suffix));
   weight_map[name_with_suffix].reset(new framework::Tensor());
   weight_map[name_with_suffix]->Resize(weight_tensor->dims());
-  TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get());
+  paddle::framework::TensorCopySync(*weight_tensor, cpu_place,
+                                    weight_map[name_with_suffix].get());
   float *weight_data = weight_map[name_with_suffix]->mutable_data<float>(cpu_place);
   name_suffix_counter += 1;
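Note that engine.cc above still does `new framework::Tensor()`: the alias names a concrete, constructible type, so only the forward declaration in engine.h (next hunk) has to go. A minimal sketch of the weight-buffer pattern kept by that file (the helper name and includes are mine, not from the PR):

```cpp
#include <memory>

#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/place.h"

std::unique_ptr<paddle::framework::Tensor> MakeWeightBuffer(
    const paddle::framework::DDim& dims) {
  // Constructing through the alias works exactly as before the unification.
  auto buffer = std::make_unique<paddle::framework::Tensor>();
  buffer->Resize(dims);
  buffer->mutable_data<float>(paddle::platform::CPUPlace());
  return buffer;
}
```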
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 849ec07d07ed7a..1f90ff216adbad 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -35,12 +35,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/utils/any.h"
 
-namespace paddle {
-namespace framework {
-class Tensor;
-}  // namespace framework
-}  // namespace paddle
-
 namespace paddle {
 namespace inference {
 namespace tensorrt {
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index c627075bfe95d9..d5cc69ea661d92 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -41,12 +41,12 @@ class TensorRTEngineTest : public ::testing::Test {
 
   void PrepareInputOutput(const std::vector<float> &input,
                           std::vector<int> output_shape) {
-    TensorFromVector(input, *ctx_, &input_);
+    paddle::framework::TensorFromVector(input, *ctx_, &input_);
     output_.Resize(framework::make_ddim(output_shape));
   }
 
   void GetOutput(std::vector<float> *output) {
-    TensorToVector(output_, *ctx_, output);
+    paddle::framework::TensorToVector(output_, *ctx_, output);
   }
 
  protected:
diff --git a/paddle/fluid/operators/abs_op.cu b/paddle/fluid/operators/abs_op.cu
index 86748d4505d287..48e19defd03438 100644
--- a/paddle/fluid/operators/abs_op.cu
+++ b/paddle/fluid/operators/abs_op.cu
@@ -50,9 +50,9 @@ class AbsKernel
     std::vector<const framework::Tensor*> ins = {x};
     std::vector<framework::Tensor*> outs = {out};
     auto functor = CudaAbsFunctor<T>();
-    LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, T,
-                                        math::Real<T>>(dev_ctx, ins, &outs,
-                                                       functor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
+        ElementwiseType::kUnary, T, math::Real<T>>(dev_ctx, ins, &outs,
+                                                   functor);
   }
 };
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 8cced5cd919f24..b4a9386ce0fb8e 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -1368,14 +1368,14 @@ class ELUGradCudaKernel : public framework::OpKernel<T> {
     if (alpha > 0) {
       CudaELUGradFunctor<T> functor;
       functor.alpha = alpha;
-      LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
-          dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
+          ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
     } else {
      CudaELUGradNegativeAlphaFunctor<T> functor;
      functor.alpha = alpha;
      ins.push_back(x);
-      LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
-          dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
+          ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
     }
   }
 };
@@ -1451,8 +1451,8 @@ class ActivationCudaKernel
     for (auto& attr : attrs) {
       *attr.second = ctx.Attr<float>(attr.first);
     }
-    LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
-        dev_ctx, ins, &outs, functor);
+    paddle::operators::LaunchSameDimsElementwiseCudaKernel<
+        ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
   }
 };
@@ -1481,17 +1481,17 @@ class ActivationGradCudaKernel
     if (static_cast<int>(Functor::FwdDeps()) == static_cast<int>(kDepOut)) {
       // Only need forward output Out
       ins.push_back(out);
-      LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
-          dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
+          ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
     } else if (static_cast<int>(Functor::FwdDeps()) ==
               static_cast<int>(kDepX)) {
       // Only need forward input X
       ins.push_back(x);
-      LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
-          dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
+          ElementwiseType::kBinary, T, T>(dev_ctx, ins, &outs, functor);
     } else {
-      LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
-          dev_ctx, ins, &outs, functor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
+          ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor);
     }
   }
 };
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 6e32860d69c62f..a089f6b4a3c19a 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -2696,8 +2696,8 @@ class PowKernel : public framework::OpKernel<T> {
     auto* factor_data = factor_tensor->data<float>();
     framework::Tensor cpu_factor_tensor;
     if (platform::is_gpu_place(factor_tensor->place())) {
-      TensorCopySync(*factor_tensor, platform::CPUPlace(),
-                     &cpu_factor_tensor);
+      framework::TensorCopySync(*factor_tensor, platform::CPUPlace(),
+                                &cpu_factor_tensor);
       factor_data = cpu_factor_tensor.data<float>();
     }
     auto factor =
@@ -2751,8 +2751,8 @@ class PowGradKernel
     auto* factor_data = factor_tensor->data<float>();
     framework::Tensor cpu_factor_tensor;
     if (platform::is_gpu_place(factor_tensor->place())) {
-      TensorCopySync(*factor_tensor, platform::CPUPlace(),
-                     &cpu_factor_tensor);
+      framework::TensorCopySync(*factor_tensor, platform::CPUPlace(),
+                                &cpu_factor_tensor);
       factor_data = cpu_factor_tensor.data<float>();
     }
     auto factor =
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
index 6582be7354f636..0a710dd842fd49 100644
--- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
@@ -50,7 +50,7 @@ void Update(const platform::NPUDeviceContext& ctx,
     runner_p2.Run(stream);
 
     std::vector<int> bad_out_data;
-    TensorToVector(*bad_out_tensor, ctx, &bad_out_data);
+    paddle::framework::TensorToVector(*bad_out_tensor, ctx, &bad_out_data);
     if (bad_out_data[0] >= decr_every_n_nan_or_inf) {
       const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
                                           {*updated_loss_scaling_tensor},
@@ -61,7 +61,8 @@ void Update(const platform::NPUDeviceContext& ctx,
       runner_p3.Run(stream);
 
       std::vector<T> new_loss_scaling;
-      TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
+      paddle::framework::TensorToVector(*updated_loss_scaling_tensor, ctx,
+                                        &new_loss_scaling);
       float min_value = 1.0;
       if (FLAGS_min_loss_scaling > 1) {
         min_value = static_cast<float>(FLAGS_min_loss_scaling);
@@ -98,7 +99,7 @@ void Update(const platform::NPUDeviceContext& ctx,
     runner_p2.Run(stream);
 
     std::vector<int> good_out_data;
-    TensorToVector(*good_out_tensor, ctx, &good_out_data);
+    paddle::framework::TensorToVector(*good_out_tensor, ctx, &good_out_data);
     if (good_out_data[0] >= incr_every_n_steps) {
       const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
@@ -109,7 +110,8 @@ void Update(const platform::NPUDeviceContext& ctx,
       runner_p3.Run(stream);
 
       std::vector<T> new_loss_scaling;
-      TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
+      paddle::framework::TensorToVector(*updated_loss_scaling_tensor, ctx,
+                                        &new_loss_scaling);
       if (!std::isfinite(new_loss_scaling[0])) {
         // updated_loss_scaling_data = pre_loss_scaling_data
         const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
@@ -209,7 +211,8 @@ class UpdateLossScalingNPUKernel : public framework::OpKernel<T> {
                           "FoundInfinite must has only one element."));
 
     std::vector<bool> found_inf_vec;
-    TensorToVector(*found_inf, ctx.device_context(), &found_inf_vec);
paddle::framework::TensorToVector(*found_inf, ctx.device_context(), + &found_inf_vec); LazyZerosNPU{}(dev_ctx, found_inf_vec, xs, outs); const bool stop_update = ctx.Attr("stop_update"); diff --git a/paddle/fluid/operators/assert_op.cc b/paddle/fluid/operators/assert_op.cc index 466e0e793e4e3b..215f6ad4be9ff1 100644 --- a/paddle/fluid/operators/assert_op.cc +++ b/paddle/fluid/operators/assert_op.cc @@ -16,10 +16,13 @@ #include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/operators/tensor_formatter.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { class InferShapeContext; -class Tensor; class OpDesc; class Scope; class Variable; diff --git a/paddle/fluid/operators/assign_op.h b/paddle/fluid/operators/assign_op.h index d9648c9617255e..1dd28c9389daf5 100644 --- a/paddle/fluid/operators/assign_op.h +++ b/paddle/fluid/operators/assign_op.h @@ -25,9 +25,12 @@ class DeviceContext; } // namespace platform } // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { -class Tensor; class Variable; } // namespace framework } // namespace paddle @@ -76,7 +79,7 @@ class AssignFunctor { framework::LoDTensor *out) const { if (lod_tensor.numel() == 0) return; auto &out_tensor = *out; - TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); + paddle::framework::TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); out_tensor.set_lod(lod_tensor.lod()); } diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc index 792d01a5efe430..049cfb8046f80e 100644 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -47,7 +47,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, init.push_back(static_cast(3.0)); init.push_back(static_cast(4.0)); - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({4}); ctx.Wait(); @@ -62,7 +62,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, op->Run(*scope, place); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index e3dc54e17cd7fd..5f32d697bae408 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -382,7 +382,8 @@ class BatchNormKernel if (ctx.HasInput("MomentumTensor")) { const auto *mom_tensor = ctx.Input("MomentumTensor"); Tensor mom_cpu; - TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); + paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), + &mom_cpu); momentum = mom_cpu.data()[0]; } diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc index be4847da51f187..aa8ceca5416200 100644 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -86,7 +86,8 @@ class NPUBatchNormOpKernel : public framework::OpKernel { if (ctx.HasInput("MomentumTensor")) { const auto *mom_tensor = ctx.Input("MomentumTensor"); Tensor mom_cpu; - TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); + paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), + &mom_cpu); momentum = mom_cpu.data()[0]; } diff --git a/paddle/fluid/operators/batch_norm_op_xpu.cc 
b/paddle/fluid/operators/batch_norm_op_xpu.cc index d232891f3d6840..505acbbdbde1b0 100644 --- a/paddle/fluid/operators/batch_norm_op_xpu.cc +++ b/paddle/fluid/operators/batch_norm_op_xpu.cc @@ -87,7 +87,8 @@ class BatchNormXPUKernel : public framework::OpKernel { if (ctx.HasInput("MomentumTensor")) { const auto *mom_tensor = ctx.Input("MomentumTensor"); Tensor mom_cpu; - TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); + paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), + &mom_cpu); momentum = mom_tensor->data()[0]; } diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu index da96aa92cd25a9..d493dad132992a 100644 --- a/paddle/fluid/operators/bce_loss_op.cu +++ b/paddle/fluid/operators/bce_loss_op.cu @@ -91,8 +91,8 @@ class BCELossGradCUDAKernel : public framework::OpKernel { std::vector outs = {dx}; auto& dev_ctx = ctx.template device_context(); auto functor = BCELossGradFunctor(); - LaunchSameDimsElementwiseCudaKernel( - dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kTernary, T, T>(dev_ctx, ins, &outs, functor); } }; diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc index 603eec4d52232e..ace2b656e8efb3 100644 --- a/paddle/fluid/operators/benchmark/op_tester.cc +++ b/paddle/fluid/operators/benchmark/op_tester.cc @@ -308,7 +308,7 @@ void OpTester::SetupTensor(framework::LoDTensor *tensor, } if (!platform::is_cpu_place(place_)) { - TensorCopySync(cpu_tensor, place_, tensor); + paddle::framework::TensorCopySync(cpu_tensor, place_, tensor); } } diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu index 34facf1ea1fa90..cf189193d1c11a 100644 --- a/paddle/fluid/operators/bincount_op.cu +++ b/paddle/fluid/operators/bincount_op.cu @@ -77,8 +77,10 @@ void BincountCUDAInner(const framework::ExecutionContext& context) { input_min_scala.device(*place) = input_x.minimum(); Tensor input_min_cpu, input_max_cpu; - TensorCopySync(input_max_t, platform::CPUPlace(), &input_max_cpu); - TensorCopySync(input_min_t, platform::CPUPlace(), &input_min_cpu); + paddle::framework::TensorCopySync(input_max_t, platform::CPUPlace(), + &input_max_cpu); + paddle::framework::TensorCopySync(input_min_t, platform::CPUPlace(), + &input_min_cpu); InputT input_min = input_min_cpu.data()[0]; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index e10fdf522ff7c5..849cdb715049ba 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -100,7 +100,7 @@ void CopyInputDataToPlace(const framework::Scope& scope, for (const auto& var_name : scope.LocalVarNames()) { const auto& src_tensor = scope.GetVar(var_name)->Get(); auto* dst_tensor = dst_scope->Var(var_name)->GetMutable(); - TensorCopySync(src_tensor, dst_place, dst_tensor); + paddle::framework::TensorCopySync(src_tensor, dst_place, dst_tensor); } } @@ -135,10 +135,12 @@ TEST(CinnLaunchOpTest, TestElementwiseAddPass) { elementwise_add_op->Run(scope, run_place); LoDTensor test_out, expected_out; - TensorCopySync(scope.Var(test_out_name)->Get(), - platform::CPUPlace(), &test_out); - TensorCopySync(scope.Var(expected_out_name)->Get(), - platform::CPUPlace(), &expected_out); + paddle::framework::TensorCopySync( + scope.Var(test_out_name)->Get(), platform::CPUPlace(), + &test_out); + paddle::framework::TensorCopySync( + 
scope.Var(expected_out_name)->Get(), platform::CPUPlace(), + &expected_out); ASSERT_TRUE(test_out.IsInitialized()); ASSERT_TRUE(expected_out.IsInitialized()); diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h index 3672fa983e495c..fb41dc16d65129 100644 --- a/paddle/fluid/operators/clip_op.h +++ b/paddle/fluid/operators/clip_op.h @@ -64,7 +64,8 @@ class ClipKernel : public framework::OpKernel { auto* max_t = context.Input("Max"); auto* max_data = max_t->data(); if (platform::is_gpu_place(max_t->place())) { - TensorCopySync(*max_t, platform::CPUPlace(), &max_cpu); + paddle::framework::TensorCopySync(*max_t, platform::CPUPlace(), + &max_cpu); max_data = max_cpu.data(); } max = max_data[0]; @@ -77,7 +78,8 @@ class ClipKernel : public framework::OpKernel { auto* min_t = context.Input("Min"); auto* min_data = min_t->data(); if (platform::is_gpu_place(min_t->place())) { - TensorCopySync(*min_t, platform::CPUPlace(), &min_cpu); + paddle::framework::TensorCopySync(*min_t, platform::CPUPlace(), + &min_cpu); min_data = min_cpu.data(); } min = min_data[0]; @@ -101,7 +103,8 @@ class ClipKernel : public framework::OpKernel { std::vector ins = {x}; std::vector outs = {out}; auto functor = ClipFunctor(min, max); - LaunchSameDimsElementwiseCudaKernel( + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, T, T>( context.template device_context(), ins, &outs, functor); #endif @@ -141,7 +144,8 @@ class ClipGradKernel : public framework::OpKernel { auto* max_t = context.Input("Max"); auto* max_data = max_t->data(); if (platform::is_gpu_place(max_t->place())) { - TensorCopySync(*max_t, platform::CPUPlace(), &max_cpu); + paddle::framework::TensorCopySync(*max_t, platform::CPUPlace(), + &max_cpu); max_data = max_cpu.data(); } max = max_data[0]; @@ -154,7 +158,8 @@ class ClipGradKernel : public framework::OpKernel { auto* min_t = context.Input("Min"); auto* min_data = min_t->data(); if (platform::is_gpu_place(min_t->place())) { - TensorCopySync(*min_t, platform::CPUPlace(), &min_cpu); + paddle::framework::TensorCopySync(*min_t, platform::CPUPlace(), + &min_cpu); min_data = min_cpu.data(); } min = min_data[0]; diff --git a/paddle/fluid/operators/clip_op_xpu.cc b/paddle/fluid/operators/clip_op_xpu.cc index 7d4b02af418bef..c53bb2d9e4d0cb 100644 --- a/paddle/fluid/operators/clip_op_xpu.cc +++ b/paddle/fluid/operators/clip_op_xpu.cc @@ -36,7 +36,8 @@ class ClipXPUKernel : public framework::OpKernel { auto* max_t = ctx.Input("Max"); auto* max_data = max_t->data(); if (platform::is_xpu_place(max_t->place())) { - TensorCopySync(*max_t, platform::CPUPlace(), &max_cpu); + paddle::framework::TensorCopySync(*max_t, platform::CPUPlace(), + &max_cpu); max_data = max_cpu.data(); } max = max_data[0]; @@ -48,7 +49,8 @@ class ClipXPUKernel : public framework::OpKernel { auto* min_t = ctx.Input("Min"); auto* min_data = min_t->data(); if (platform::is_xpu_place(min_t->place())) { - TensorCopySync(*min_t, platform::CPUPlace(), &min_cpu); + paddle::framework::TensorCopySync(*min_t, platform::CPUPlace(), + &min_cpu); min_data = min_cpu.data(); } min = min_data[0]; diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc index b1e09e487fb3fb..ecf682aa52432a 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc @@ -139,7 +139,7 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { } 
PrintDebugInfo("input data", init); - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({num1, num2}); ctx.Wait(); @@ -165,7 +165,7 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); PrintDebugInfo("output data", out_vec); diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc index b998aaa3e689ce..fa134b60e28deb 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -139,7 +139,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { } PrintDebugInfo("input data", init); - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({num1, num2}); ctx.Wait(); @@ -164,7 +164,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); PrintDebugInfo("output data", out_vec); diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 714dc4e19f9b13..0e4210ea7304ae 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -144,7 +144,7 @@ inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx, try { const auto& runner_mean = paddle::operators::NpuOpRunner( "ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}}); - TensorToVector(mean, dev_ctx, &vec); + paddle::framework::TensorToVector(mean, dev_ctx, &vec); } catch (...) 
{ LOG(WARNING) << "ContainsNan catch exception"; return true; diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc index edbc19eea23b61..3e91220423e6a5 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -146,7 +146,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, auto place = ctx.GetPlace(); - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({num1, num2}); ctx.Wait(); @@ -170,7 +170,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, ctx.Wait(); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); PrintDebugInfo("output data", out_vec); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc index 2ea217afb776fd..1ea34c8200333f 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -133,7 +133,7 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { } PrintDebugInfo("input data", init); - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({num, num}); ctx.Wait(); @@ -159,7 +159,7 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); PrintDebugInfo("output data", out_vec); diff --git a/paddle/fluid/operators/collective/c_embedding_op_npu.cc b/paddle/fluid/operators/collective/c_embedding_op_npu.cc index 1ad809dbfc469c..3e96f15d5d3dd6 100644 --- a/paddle/fluid/operators/collective/c_embedding_op_npu.cc +++ b/paddle/fluid/operators/collective/c_embedding_op_npu.cc @@ -71,8 +71,8 @@ void shard_index(const Tensor &table_t, const Tensor &ids_t, int64_t start_idx, #if (CANN_VERSION_CODE >= 503003) Tensor factor_tensor(ids_t.type()); factor_tensor.mutable_data({1}, context.GetPlace()); - TensorFromVector(std::vector{static_cast(start_idx)}, - context.device_context(), &factor_tensor); + paddle::framework::TensorFromVector(std::vector{static_cast(start_idx)}, + context.device_context(), &factor_tensor); sub_runner.SetType("Sub") .AddInput(ids_t) .AddInput(factor_tensor) diff --git a/paddle/fluid/operators/collective/c_identity_op.h b/paddle/fluid/operators/collective/c_identity_op.h index c8577a96174898..a5e28235c168c1 100644 --- a/paddle/fluid/operators/collective/c_identity_op.h +++ b/paddle/fluid/operators/collective/c_identity_op.h @@ -48,7 +48,7 @@ class CIdentityOpKernel : public framework::OpKernel { "The ring_id (%d) for c_identity op must be non-negative.", rid)); out->mutable_data(ctx.GetPlace()); - TensorCopy(*x, out->place(), out); + paddle::framework::TensorCopy(*x, out->place(), out); } }; diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc index 1919b8ee35edf5..d589d0a25e694c 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -137,7 +137,7 @@ void TestHCCLReduceOp(f::Scope* scope, 
const p::DeviceContext& ctx, int iter) { auto place = ctx.GetPlace(); - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({num1, num2}); ctx.Wait(); @@ -161,7 +161,7 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { ctx.Wait(); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); PrintDebugInfo("output data", out_vec); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc index 5fa0df97c655f3..db78652f87980e 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -137,7 +137,7 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { } PrintDebugInfo("input data", init); - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({num1, num2}); ctx.Wait(); @@ -166,7 +166,7 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { } std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); PrintDebugInfo("output data", out_vec); diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc index 45613715b8260c..5778a270f19926 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc @@ -56,9 +56,9 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { init_y.push_back(static_cast(2.0)); } - TensorFromVector(init_x, ctx, tensor_x); + paddle::framework::TensorFromVector(init_x, ctx, tensor_x); tensor_x->Resize({10, 10}); - TensorFromVector(init_y, ctx, tensor_y); + paddle::framework::TensorFromVector(init_y, ctx, tensor_y); tensor_y->Resize({10, 10}); f::AttributeMap attrs; @@ -85,7 +85,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { sync_op->Run(*scope, place); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); // sync op copy auto sync_op2 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc index a6c5149ed283ad..e7017835686940 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -136,7 +136,7 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { } std::cout << std::endl; - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({num, num}); ctx.Wait(); @@ -169,7 +169,7 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { // ctx.Wait(); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); EXPECT_EQ(out_vec.size(), init.size()); for (uint32_t i = 0; i < out_vec.size(); i++) { diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc 
index ac1559f87d1f66..2be37cc456b973 100644 --- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc +++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc @@ -64,7 +64,7 @@ bool Check(T value, int size = 2 * 512 * 8192) { init.push_back(static_cast(value)); } - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); bool result = paddle::operators::ContainsNan(ctx, ctx.stream(), tensor_x); return result; } diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc index c1f55df5e8d860..edd4b18b35a6d3 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -145,7 +145,7 @@ void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) { } VLOG(3) << "Run op recv_v2"; std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); EXPECT_EQ(out_vec == init, true); diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc index e47ae646b148ec..b2470ab4c0570e 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -119,7 +119,7 @@ void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) { std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); int rank_id = atoi(getenv("RANK_ID")); VLOG(3) << "rank id:" << rank_id; - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({num, num}); ctx.Wait(); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cu b/paddle/fluid/operators/controlflow/bitwise_op.cu index 2f4098c2608220..3a4d5303953ac4 100644 --- a/paddle/fluid/operators/controlflow/bitwise_op.cu +++ b/paddle/fluid/operators/controlflow/bitwise_op.cu @@ -35,8 +35,9 @@ class BinaryBitwiseOpKernel std::vector outs = {out}; const auto& cuda_ctx = ctx.template device_context(); - LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, -1, functor); + paddle::operators::LaunchElementwiseCudaKernel(cuda_ctx, ins, &outs, -1, + functor); } }; @@ -56,8 +57,8 @@ class UnaryBitwiseOpKernel std::vector outs = {out}; const auto& cuda_ctx = ctx.template device_context(); - LaunchSameDimsElementwiseCudaKernel( - cuda_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, T, T>(cuda_ctx, ins, &outs, functor); } }; diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cu b/paddle/fluid/operators/controlflow/compare_all_op.cu index 64a96ae9e8ee12..54f59c40a205d7 100644 --- a/paddle/fluid/operators/controlflow/compare_all_op.cu +++ b/paddle/fluid/operators/controlflow/compare_all_op.cu @@ -55,8 +55,8 @@ class CompareReduceOpKernel context.template device_context(); std::vector ins = {x, y}; std::vector outs = {&tmp}; - LaunchSameDimsElementwiseCudaKernel( - cuda_ctx, ins, &outs, Functor()); + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kBinary, T, bool>(cuda_ctx, ins, &outs, Functor()); // Reduce by 'bitwise and' operator std::vector reduce_dims; diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu index fc7dce208c4869..f03a11d906f4e7 100644 --- 
a/paddle/fluid/operators/controlflow/compare_op.cu +++ b/paddle/fluid/operators/controlflow/compare_op.cu @@ -35,7 +35,8 @@ class CompareOpKernel ctx.template device_context(); int axis = PackTensorsIntoVector(ctx, &ins, &outs); - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( cuda_ctx, ins, &outs, axis, functor); } }; diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index 99b16d9b692538..ed4995d4fbeda2 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -39,12 +39,13 @@ static void DataCopy(const framework::LoDTensor &src_item, : paddle::platform::MKLDNNDeviceContext::tls() .get_cur_paddle_data_layout(), src_item, &out, platform::CPUPlace()); - TensorCopySync(out, platform::CPUPlace(), dst_item); + paddle::framework::TensorCopySync(out, platform::CPUPlace(), dst_item); } else { - TensorCopySync(src_item, platform::CPUPlace(), dst_item); + paddle::framework::TensorCopySync(src_item, platform::CPUPlace(), + dst_item); } #else - TensorCopySync(src_item, platform::CPUPlace(), dst_item); + paddle::framework::TensorCopySync(src_item, platform::CPUPlace(), dst_item); #endif } else { // Not copy, if the src tensor is empty. diff --git a/paddle/fluid/operators/controlflow/fetch_v2_op.cc b/paddle/fluid/operators/controlflow/fetch_v2_op.cc index 9bb9e481034bd5..d7f74c44bd522a 100644 --- a/paddle/fluid/operators/controlflow/fetch_v2_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_v2_op.cc @@ -50,12 +50,13 @@ static void DeepCopy(const framework::LoDTensor &src_item, : paddle::platform::MKLDNNDeviceContext::tls() .get_cur_paddle_data_layout(), src_item, &out, platform::CPUPlace()); - TensorCopySync(out, platform::CPUPlace(), dst_item); + paddle::framework::TensorCopySync(out, platform::CPUPlace(), dst_item); } else { - TensorCopySync(src_item, platform::CPUPlace(), dst_item); + paddle::framework::TensorCopySync(src_item, platform::CPUPlace(), + dst_item); } #else - TensorCopySync(src_item, platform::CPUPlace(), dst_item); + paddle::framework::TensorCopySync(src_item, platform::CPUPlace(), dst_item); #endif } else { // Not copy, if the src tensor is empty. 
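
Note on the recurring pattern in the hunks above and below: this patch does two things in lockstep. Headers that only needed a forward declaration swap `class Tensor;` in `paddle::framework` for a `pten::DenseTensor` forward declaration, and call sites of helpers such as `TensorCopy`, `TensorCopySync`, `TensorFromVector`, `TensorToVector`, and the `Launch*ElementwiseCudaKernel` functions gain full namespace qualification. The two go together: once `framework::Tensor` stops being a class of its own and the real argument type lives in namespace `pten`, argument-dependent lookup (ADL) searches `pten` instead of `paddle::framework`, so the formerly unqualified calls either stop resolving or become ambiguous. The standalone sketch below (not Paddle code; it assumes `framework::Tensor` becomes an alias of `pten::DenseTensor`, which is what the header changes suggest, and every name in it is illustrative) reproduces the effect:

    // adl_sketch.cc -- why the unqualified helper calls stop compiling.
    namespace pten {
    class DenseTensor {};  // the concrete tensor type after the refactor
    }  // namespace pten

    namespace paddle {
    namespace framework {
    // Tensor is now only an alias. ADL is computed from the underlying
    // type pten::DenseTensor, so it no longer pulls in this namespace.
    using Tensor = pten::DenseTensor;

    // Stand-in for the real TensorCopySync(); the body is irrelevant here.
    inline void TensorCopySync(const Tensor&, int /*place*/, Tensor*) {}
    }  // namespace framework
    }  // namespace paddle

    namespace paddle {
    namespace operators {
    inline void Demo() {
      framework::Tensor src, dst;
      // TensorCopySync(src, 0, &dst);  // error: ordinary lookup searches
      //                                // operators -> paddle -> ::, and ADL
      //                                // searches pten -- never framework.
      paddle::framework::TensorCopySync(src, 0, &dst);  // OK: qualified.
    }
    }  // namespace operators
    }  // namespace paddle

    int main() { paddle::operators::Demo(); }

The same reasoning plausibly covers the `paddle::operators::Launch*ElementwiseCudaKernel` qualifications: those helpers are called with vectors of tensor pointers, so once ADL starts searching `pten`, spelling out the namespace keeps the calls unambiguous against any same-named helpers reachable there.
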
diff --git a/paddle/fluid/operators/controlflow/logical_op.cu b/paddle/fluid/operators/controlflow/logical_op.cu index 4a3fc6c895174c..53261160205350 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cu +++ b/paddle/fluid/operators/controlflow/logical_op.cu @@ -33,10 +33,12 @@ class BinaryLogicalOpKernel int axis = PackTensorsIntoVector(ctx, &ins, &outs); if (ins.size() == 1) { - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( cuda_ctx, ins, &outs, axis, functor); } else { - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( cuda_ctx, ins, &outs, axis, functor); } } diff --git a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc index c4451c3b583c72..8e46c7acf09181 100644 --- a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc +++ b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc @@ -55,7 +55,7 @@ class WriteToArrayOp : public ArrayOp { platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); - TensorCopy(x_tensor, place, dev_ctx, out_tensor); + paddle::framework::TensorCopy(x_tensor, place, dev_ctx, out_tensor); } else { VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " "nothing has been written to output array[" diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h index 8ef12ca05e36a6..46c61842914a78 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.h +++ b/paddle/fluid/operators/controlflow/while_op_helper.h @@ -22,9 +22,12 @@ #include "paddle/fluid/operators/controlflow/op_variant.h" #include "paddle/fluid/platform/variant.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { -class Tensor; class ProgramDesc; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/copy_cross_scope_op.cc b/paddle/fluid/operators/copy_cross_scope_op.cc index 721354954c7035..08f900884d612f 100644 --- a/paddle/fluid/operators/copy_cross_scope_op.cc +++ b/paddle/fluid/operators/copy_cross_scope_op.cc @@ -65,7 +65,8 @@ class CopyCrossScopeOp : public framework::OperatorBase { auto id_tensor = id_var->GetMutable(); auto it = scope.kids().begin(); framework::Tensor cpu_id_tensor; - TensorCopySync(*id_tensor, platform::CPUPlace(), &cpu_id_tensor); + paddle::framework::TensorCopySync(*id_tensor, platform::CPUPlace(), + &cpu_id_tensor); auto id_value = cpu_id_tensor.data(); for (auto i = 0; i < *id_value; i++) { it++; @@ -87,7 +88,8 @@ class CopyCrossScopeOp : public framework::OperatorBase { x_name)); auto dst_tensor = dst_var->GetMutable(); auto main_tensor = main_var->GetMutable(); - TensorCopySync(*dst_tensor, main_tensor->place(), main_tensor); + paddle::framework::TensorCopySync(*dst_tensor, main_tensor->place(), + main_tensor); } return; } @@ -107,7 +109,8 @@ class CopyCrossScopeOp : public framework::OperatorBase { "No variable with name %s found in destination scope.", x_name)); auto src_tensor = source_var->GetMutable(); auto dst_tensor = dst_var->GetMutable(); - TensorCopySync(*src_tensor, dst_tensor->place(), dst_tensor); + paddle::framework::TensorCopySync(*src_tensor, dst_tensor->place(), + dst_tensor); if (ToM) { auto* main_var = scope.FindVar(x_name); @@ -116,7 +119,8 @@ class CopyCrossScopeOp : public framework::OperatorBase { platform::errors::NotFound( "No variable with name %s found in destination scope.", x_name)); 
auto main_tensor = main_var->GetMutable(); - TensorCopySync(*dst_tensor, main_tensor->place(), main_tensor); + paddle::framework::TensorCopySync(*dst_tensor, main_tensor->place(), + main_tensor); } } }; diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc index 37bc32d745edab..5f951ad337e8e7 100644 --- a/paddle/fluid/operators/copy_cross_scope_test.cc +++ b/paddle/fluid/operators/copy_cross_scope_test.cc @@ -43,18 +43,18 @@ void Compare1(f::Scope* scope, const p::DeviceContext& ctx, auto var_x = scope->Var("tmp"); auto x = var_x->GetMutable(); std::vector main_x = {1.0}; - TensorFromVector(main_x, ctx, x); + paddle::framework::TensorFromVector(main_x, ctx, x); auto var_id = scope->Var("Id"); auto id = var_id->GetMutable(); std::vector main_id = {1}; - TensorFromVector(main_id, ctx, id); + paddle::framework::TensorFromVector(main_id, ctx, id); for (int i = 0; i < 3; i++) { auto& child_scope = scope->NewScope(); auto child_var = child_scope.Var("tmp"); auto tensor_x = child_var->GetMutable(); std::vector init_x = {static_cast(i)}; - TensorFromVector(init_x, ctx, tensor_x); + paddle::framework::TensorFromVector(init_x, ctx, tensor_x); } ctx.Wait(); @@ -78,7 +78,7 @@ void Compare1(f::Scope* scope, const p::DeviceContext& ctx, auto* tensor_out = dst_var->GetMutable(); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); int expected = 1; EXPECT_EQ(static_cast(out_vec[0]), expected); @@ -91,18 +91,18 @@ void Compare2(f::Scope* scope, const p::DeviceContext& ctx, auto var_x = scope->Var("tmp"); auto x = var_x->GetMutable(); std::vector main_x = {1.0}; - TensorFromVector(main_x, ctx, x); + paddle::framework::TensorFromVector(main_x, ctx, x); auto var_id = scope->Var("Id"); auto id = var_id->GetMutable(); std::vector main_id = {0}; - TensorFromVector(main_id, ctx, id); + paddle::framework::TensorFromVector(main_id, ctx, id); for (int i = 0; i < 3; i++) { auto& child_scope = scope->NewScope(); auto child_var = child_scope.Var("tmp"); auto tensor_x = child_var->GetMutable(); std::vector init_x = {static_cast(i)}; - TensorFromVector(init_x, ctx, tensor_x); + paddle::framework::TensorFromVector(init_x, ctx, tensor_x); } ctx.Wait(); @@ -121,7 +121,7 @@ void Compare2(f::Scope* scope, const p::DeviceContext& ctx, auto* tensor_out = dst_var->GetMutable(); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); int expected = 0; EXPECT_EQ(static_cast(out_vec[0]), expected); diff --git a/paddle/fluid/operators/crop_op_npu.cc b/paddle/fluid/operators/crop_op_npu.cc index 4096d872cd8e1e..013ad5dd8cb86c 100644 --- a/paddle/fluid/operators/crop_op_npu.cc +++ b/paddle/fluid/operators/crop_op_npu.cc @@ -29,7 +29,8 @@ class CropNPUKernel : public framework::OpKernel { std::vector offset_list; if (ctx.HasInput("Offsets")) { auto* offsets_tensor = ctx.Input("Offsets"); - TensorToVector(*offsets_tensor, ctx.device_context(), &offset_list); + paddle::framework::TensorToVector(*offsets_tensor, ctx.device_context(), + &offset_list); if (offset_list.empty()) { offset_list.resize(x->dims().size(), 0); } diff --git a/paddle/fluid/operators/crop_tensor_op.h b/paddle/fluid/operators/crop_tensor_op.h index 54666c8482c021..9ee10e49fcb5ac 100644 --- a/paddle/fluid/operators/crop_tensor_op.h +++ b/paddle/fluid/operators/crop_tensor_op.h @@ -42,7 +42,7 @@ inline std::vector get_new_data( tensor->dims())); if 
(platform::is_gpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_data.push_back(static_cast(*temp.data())); } else { @@ -111,7 +111,8 @@ static std::vector GetShape(const framework::ExecutionContext& ctx) { auto* shape_data = shape_tensor->data(); framework::Tensor cpu_shape_tensor; if (platform::is_gpu_place(shape_tensor->place())) { - TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); + paddle::framework::TensorCopySync(*shape_tensor, platform::CPUPlace(), + &cpu_shape_tensor); shape_data = cpu_shape_tensor.data(); } res = std::vector(shape_data, shape_data + shape_tensor->numel()); diff --git a/paddle/fluid/operators/dequantize_abs_max_op.h b/paddle/fluid/operators/dequantize_abs_max_op.h index 0d9d20fc120ca4..6f4a76f9b4af1c 100644 --- a/paddle/fluid/operators/dequantize_abs_max_op.h +++ b/paddle/fluid/operators/dequantize_abs_max_op.h @@ -20,11 +20,9 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/dequantize_log_op.h b/paddle/fluid/operators/dequantize_log_op.h index 67ce9cc84d3a85..75b7f04645fc3c 100644 --- a/paddle/fluid/operators/dequantize_log_op.h +++ b/paddle/fluid/operators/dequantize_log_op.h @@ -19,11 +19,9 @@ limitations under the License. */ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/dequeue_op.cc b/paddle/fluid/operators/dequeue_op.cc index f8ab97040ee7c2..fb5d53dacf0ed6 100644 --- a/paddle/fluid/operators/dequeue_op.cc +++ b/paddle/fluid/operators/dequeue_op.cc @@ -70,7 +70,8 @@ class DequeueOp : public framework::OperatorBase { "Op(dequeue), but poped %d element.", lod_tensor_vec.size())); for (size_t j = 0; j < lod_tensor_vec.size(); ++j) { - TensorCopySync(lod_tensor_vec[j], dev_place, out_tensor); + paddle::framework::TensorCopySync(lod_tensor_vec[j], dev_place, + out_tensor); } } } diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index 60cb16ce6c0470..eddb25d57b47cc 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -93,7 +93,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { auto score_in = score_ins[i]; if (multi_rois_num.size() > 0) { framework::Tensor temp; - TensorCopySync(*multi_rois_num[i], platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*multi_rois_num[i], + platform::CPUPlace(), &temp); const int* length_in = temp.data(); lod_size = multi_rois_num[i]->numel(); for (size_t n = 0; n < lod_size; ++n) { diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index a9a6dcea1bbe5f..355a35d4dd21b1 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ 
b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -200,7 +200,8 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { } if (multi_rois_num.size() > 0) { Tensor* rois_num_t = multi_rois_num[i]; - TensorCopySync(sub_lod, dev_ctx.GetPlace(), rois_num_t); + paddle::framework::TensorCopySync(sub_lod, dev_ctx.GetPlace(), + rois_num_t); rois_num_t->Resize({lod_size}); } framework::LoD lod; diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h index e3c125b0a68885..f1b454913f7424 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -33,7 +33,8 @@ inline std::vector GetLodFromRoisNum(const Tensor* rois_num) { auto* rois_num_data = rois_num->data(); Tensor cpu_tensor; if (platform::is_gpu_place(rois_num->place())) { - TensorCopySync(*rois_num, platform::CPUPlace(), &cpu_tensor); + paddle::framework::TensorCopySync(*rois_num, platform::CPUPlace(), + &cpu_tensor); rois_num_data = cpu_tensor.data(); } rois_lod.push_back(static_cast(0)); diff --git a/paddle/fluid/operators/detection/locality_aware_nms_op.cc b/paddle/fluid/operators/detection/locality_aware_nms_op.cc index 8422a1fa6ccbfd..8cc0ebcab61f7b 100644 --- a/paddle/fluid/operators/detection/locality_aware_nms_op.cc +++ b/paddle/fluid/operators/detection/locality_aware_nms_op.cc @@ -320,8 +320,10 @@ class LocalityAwareNMSKernel : public framework::OpKernel { LoDTensor scores; LoDTensor boxes; - TensorCopySync(*scores_input, platform::CPUPlace(), &scores); - TensorCopySync(*boxes_input, platform::CPUPlace(), &boxes); + paddle::framework::TensorCopySync(*scores_input, platform::CPUPlace(), + &scores); + paddle::framework::TensorCopySync(*boxes_input, platform::CPUPlace(), + &boxes); std::vector>> all_indices; std::vector batch_starts = {0}; int64_t batch_size = score_dims[0]; diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index 2ddcc7a06f6797..fbf631f75b61f9 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -384,7 +384,8 @@ class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel { roi2image_data[j] = i; } } - TensorCopySync(roi2image, ctx.GetPlace(), &roi2image_dev); + paddle::framework::TensorCopySync(roi2image, ctx.GetPlace(), + &roi2image_dev); int out_size = rois_num * transformed_height * transformed_width * channels; auto stream = ctx.cuda_device_context().stream(); diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h index 33fa7a092768c4..d7db7dddce3887 100644 --- a/paddle/fluid/operators/dropout_impl_util.h +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -30,7 +30,8 @@ inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, if (seed) { framework::Tensor seed_cpu_tensor; - TensorCopySync(*seed, platform::CPUPlace(), &seed_cpu_tensor); + paddle::framework::TensorCopySync(*seed, platform::CPUPlace(), + &seed_cpu_tensor); *seed_data = static_cast(seed_cpu_tensor.data()[0]); *increment = offset; } else if (gen_cuda->GetIsInitPy() && (!is_fix_seed)) { diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index cf6401db926007..98a38a07dadaac 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ 
b/paddle/fluid/operators/dropout_op_npu.cc @@ -73,7 +73,8 @@ class DropoutNPUKernel : public framework::OpKernel { float keep_prob = 1. - dropout_prob; if (seed_tensor) { std::vector seed_data; - TensorToVector(*seed_tensor, ctx.device_context(), &seed_data); + paddle::framework::TensorToVector(*seed_tensor, ctx.device_context(), + &seed_data); seed = seed_data[0]; } else { seed = ctx.Attr("fix_seed") ? ctx.Attr("seed") : 0; diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc index 3e401d1c4f9f4f..5c9be588419e34 100644 --- a/paddle/fluid/operators/dropout_op_test.cc +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -45,7 +45,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { init.push_back(1.0); } - TensorFromVector(init, ctx, tensor); + paddle::framework::TensorFromVector(init, ctx, tensor); auto place = ctx.GetPlace(); auto out_var = scope->Var("Out"); @@ -70,7 +70,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { dropout_op->Run(*scope, place); std::vector out_vec; - TensorToVector(*out_tensor, ctx, &out_vec); + paddle::framework::TensorToVector(*out_tensor, ctx, &out_vec); std::vector std_out = { 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu index 3202b0a7d254bb..9b146fe7279dea 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu @@ -28,7 +28,8 @@ class ElementwiseFloorDivKernel ctx.template device_context(); int axis = PackTensorsIntoVector(ctx, &ins, &outs); - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( cuda_ctx, ins, &outs, axis, FloorDivFunctor()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu index eaf77744285657..7433c505f472a2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu @@ -28,8 +28,9 @@ class ElementwiseMaxKernel ctx.template device_context(); int axis = PackTensorsIntoVector(ctx, &ins, &outs); - LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, axis, MaxFunctor()); + paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, axis, + MaxFunctor()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu index a733b4a66f1294..5af985567d898d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu @@ -28,8 +28,9 @@ class ElementwiseMinKernel ctx.template device_context(); int axis = PackTensorsIntoVector(ctx, &ins, &outs); - LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, axis, MinFunctor()); + paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, axis, + MinFunctor()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu index 4ef957c617870e..379684aa9ba63b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu @@ -30,8 +30,9 @@ class ElementwiseModKernel const auto& cuda_ctx = ctx.template device_context(); int axis = PackTensorsIntoVector(ctx, &ins, &outs); - LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, 
ModFunctor()); + paddle::operators::LaunchElementwiseCudaKernel(cuda_ctx, ins, &outs, + axis, ModFunctor()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 5ece5cadc603fa..86a803106347d2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -38,7 +38,8 @@ class ElementwiseMulKernel std::vector outs; int axis = PackTensorsIntoVector(ctx, &ins, &outs, &x_for_selectedrows); - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( cuda_ctx, ins, &outs, axis, MulFunctor()); } else if (x_var->IsType()) { auto* x_lod = ctx.Input("X"); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 7cd04318d3f49c..3fddb553e117e6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -211,8 +211,9 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx, const auto &dev_ctx = ctx.template device_context(); - LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, axis, func); + paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, + axis, func); #endif return; } @@ -1271,8 +1272,8 @@ void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, outs = {&tmp_dx, &tmp_dy}; } - LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, - axis, func); + paddle::operators::LaunchElementwiseCudaKernel( + dev_ctx, ins, &outs, axis, func); if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) { ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); @@ -1301,7 +1302,8 @@ void GetGradXOrYOut(const platform::CUDADeviceContext &dev_ctx, outs = {dxy}; } - LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, axis, func); + paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, + axis, func); if (dxy->dims() != dout->dims()) { ReduceWrapper(dev_ctx, axis, &tmp_dxy, dxy); } diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index f06dbd26873a60..3cd9729d3443c5 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -57,9 +57,9 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx, init_y.push_back(static_cast(2.0)); } - TensorFromVector(init_x, ctx, tensor_x); + paddle::framework::TensorFromVector(init_x, ctx, tensor_x); tensor_x->Resize({10, 10}); - TensorFromVector(init_y, ctx, tensor_y); + paddle::framework::TensorFromVector(init_y, ctx, tensor_y); tensor_y->Resize({10, 10}); auto place = ctx.GetPlace(); @@ -74,7 +74,7 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx, op->Run(*scope, place); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); float expected; @@ -116,7 +116,7 @@ void CompareGrad(f::Scope *scope, const p::DeviceContext &ctx, init_dout.push_back(static_cast(1.0)); } - TensorFromVector(init_dout, ctx, tensor_dout); + paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout); tensor_dout->Resize({2, 3, 5}); // run @@ -129,10 +129,10 @@ void CompareGrad(f::Scope *scope, const p::DeviceContext &ctx, op->Run(*scope, place); std::vector dx_vec; - TensorToVector(*tensor_dx, ctx, &dx_vec); + paddle::framework::TensorToVector(*tensor_dx, 
ctx, &dx_vec); std::vector dy_vec; - TensorToVector(*tensor_dy, ctx, &dy_vec); + paddle::framework::TensorToVector(*tensor_dy, ctx, &dy_vec); ctx.Wait(); float expected_x, expected_y; diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu index 722a53d188061b..1b24d5be3442fd 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu @@ -27,8 +27,9 @@ class ElementwisePowKernel ctx.template device_context(); int axis = PackTensorsIntoVector(ctx, &ins, &outs); - LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, PowFunctor()); + paddle::operators::LaunchElementwiseCudaKernel(cuda_ctx, ins, &outs, + axis, PowFunctor()); } }; diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index 809bad1d6c1eec..05cd893b057af7 100644 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -32,18 +32,21 @@ inline std::vector get_expand_times( auto* expand_data = expand_tensor->data(); framework::Tensor cpu_expand_tensor; if (platform::is_gpu_place(expand_tensor->place())) { - TensorCopySync(*expand_tensor, platform::CPUPlace(), &cpu_expand_tensor); + paddle::framework::TensorCopySync(*expand_tensor, platform::CPUPlace(), + &cpu_expand_tensor); expand_data = cpu_expand_tensor.data(); } #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(expand_tensor->place())) { - TensorCopySync(*expand_tensor, platform::CPUPlace(), &cpu_expand_tensor); + paddle::framework::TensorCopySync(*expand_tensor, platform::CPUPlace(), + &cpu_expand_tensor); expand_data = cpu_expand_tensor.data(); } #endif #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(expand_tensor->place())) { - TensorCopySync(*expand_tensor, platform::CPUPlace(), &cpu_expand_tensor); + paddle::framework::TensorCopySync(*expand_tensor, platform::CPUPlace(), + &cpu_expand_tensor); expand_data = cpu_expand_tensor.data(); } #endif @@ -61,13 +64,13 @@ inline std::vector get_expand_times( auto tensor = list_expand_times_tensor[i]; if (platform::is_gpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_epxand_times.push_back(*temp.data()); } #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(tensor->place())) { // NOLINT framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_epxand_times.push_back(*temp.data()); } #endif diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc index 880eb341f2093b..7de2bf2e6990db 100644 --- a/paddle/fluid/operators/expand_op_npu_test.cc +++ b/paddle/fluid/operators/expand_op_npu_test.cc @@ -46,8 +46,9 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { auto expand_times_t = expand_times->GetMutable(); auto place = ctx.GetPlace(); - TensorFromVector(std::vector(3 * 1 * 7, 1), ctx, in_t); - TensorFromVector(std::vector({1, 10, 1}), ctx, expand_times_t); + paddle::framework::TensorFromVector(std::vector(3 * 1 * 7, 1), ctx, in_t); + paddle::framework::TensorFromVector(std::vector({1, 10, 1}), ctx, + expand_times_t); in_t->Resize(f::make_ddim({3, 1, 7})); expand_times_t->Resize(f::make_ddim({3})); diff --git a/paddle/fluid/operators/expand_v2_op.h b/paddle/fluid/operators/expand_v2_op.h index fd7c6b3f27e75a..dd1625013444b6 
100644 --- a/paddle/fluid/operators/expand_v2_op.h +++ b/paddle/fluid/operators/expand_v2_op.h @@ -33,18 +33,21 @@ inline std::vector get_expand_shape( auto* shape_data = shape_tensor->data(); framework::Tensor cpu_shape_tensor; if (platform::is_gpu_place(shape_tensor->place())) { - TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); + paddle::framework::TensorCopySync(*shape_tensor, platform::CPUPlace(), + &cpu_shape_tensor); shape_data = cpu_shape_tensor.data(); } #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(shape_tensor->place())) { - TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); + paddle::framework::TensorCopySync(*shape_tensor, platform::CPUPlace(), + &cpu_shape_tensor); shape_data = cpu_shape_tensor.data(); } #endif #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(shape_tensor->place())) { - TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); + paddle::framework::TensorCopySync(*shape_tensor, platform::CPUPlace(), + &cpu_shape_tensor); shape_data = cpu_shape_tensor.data(); } #endif @@ -62,20 +65,20 @@ inline std::vector get_expand_shape( auto tensor = list_expand_shapes_tensor[i]; if (platform::is_gpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_epxand_shape.push_back(*temp.data()); } #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(tensor->place())) { // NOLINT framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_epxand_shape.push_back(*temp.data()); } #endif #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(tensor->place())) { // NOLINT framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_epxand_shape.push_back(*temp.data()); } #endif diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index fe631500a3de52..9f44c39a92c5ef 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -409,9 +409,9 @@ class TestFeedForward { void CheckOut(const T diff, bool is_relative_atol = false) { std::vector out(size_output_); std::vector bias_out(size_output_); - TensorToVector(out_, *ctx_, &out); + paddle::framework::TensorToVector(out_, *ctx_, &out); if (has_bias_) { - TensorToVector(bias_out_, *ctx_, &bias_out); + paddle::framework::TensorToVector(bias_out_, *ctx_, &bias_out); } ctx_->Wait(); @@ -437,7 +437,7 @@ class TestFeedForward { // check backward correctness between baseline and results of feedforward. 
void CheckGrad(const T diff, bool is_relative_atol = false) { std::vector h_dinput(size_src_); - TensorToVector(dinput_, *ctx_, &h_dinput); + paddle::framework::TensorToVector(dinput_, *ctx_, &h_dinput); for (int i = 0; i < size_src_; i++) { if (is_relative_atol) { EXPECT_LT( @@ -448,7 +448,7 @@ class TestFeedForward { } } std::vector h_dweight(size_weight_); - TensorToVector(dweight_, *ctx_, &h_dweight); + paddle::framework::TensorToVector(dweight_, *ctx_, &h_dweight); for (int i = 0; i < size_weight_; i++) { if (is_relative_atol) { EXPECT_LT(std::abs((h_dweight[i] - base_dweight_vec_[i]) / @@ -460,7 +460,7 @@ class TestFeedForward { } if (has_bias_) { std::vector h_dbias(size_bias_); - TensorToVector(dbias_, *ctx_, &h_dbias); + paddle::framework::TensorToVector(dbias_, *ctx_, &h_dbias); for (int i = 0; i < size_bias_; i++) { if (is_relative_atol) { EXPECT_LT( diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index 17c7321122b174..32cd07c916b330 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -81,7 +81,8 @@ class FillConstantKernel : public framework::OpKernel { auto tmp_place = value_tensor->place(); if (platform::is_gpu_place(tmp_place) || platform::is_xpu_place(tmp_place)) { - TensorCopySync(*value_tensor, platform::CPUPlace(), &cpu_tensor); + paddle::framework::TensorCopySync(*value_tensor, platform::CPUPlace(), + &cpu_tensor); tensor_data = cpu_tensor.data(); } value = tensor_data[0]; diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h index e0873608fa2814..b9b881cf83e0bd 100644 --- a/paddle/fluid/operators/fused/attn_gemm.h +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -67,7 +67,8 @@ class AttnMatMul { ins.emplace_back(bias); outs.emplace_back(bias_out); int elewise_add_axis = -1; - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( dev_ctx_, ins, &outs, elewise_add_axis, AddFunctor()); } } diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index c5995fe3554b4e..74307c3ba79175 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -164,11 +164,11 @@ void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, scope.Var("ReserveSpace")->GetMutable(); auto place = ctx.GetPlace(); - TensorCopySync(cpu_x, place, x); - TensorCopySync(cpu_scale, place, scale); - TensorCopySync(cpu_bias, place, bias); - TensorCopySync(*cpu_mean, place, mean); - TensorCopySync(*cpu_var, place, var); + paddle::framework::TensorCopySync(cpu_x, place, x); + paddle::framework::TensorCopySync(cpu_scale, place, scale); + paddle::framework::TensorCopySync(cpu_bias, place, bias); + paddle::framework::TensorCopySync(*cpu_mean, place, mean); + paddle::framework::TensorCopySync(*cpu_var, place, var); int64_t channels = x->dims()[3]; scale->Resize({channels}); @@ -195,11 +195,13 @@ void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, attrs); op->Run(scope, ctx.GetPlace()); - TensorCopySync(*y, platform::CPUPlace(), cpu_y); - TensorCopySync(*mean, platform::CPUPlace(), cpu_mean); - TensorCopySync(*var, platform::CPUPlace(), cpu_var); - TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean); - TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var); + paddle::framework::TensorCopySync(*y, platform::CPUPlace(), cpu_y); + 
paddle::framework::TensorCopySync(*mean, platform::CPUPlace(), cpu_mean); + paddle::framework::TensorCopySync(*var, platform::CPUPlace(), cpu_var); + paddle::framework::TensorCopySync(*saved_mean, platform::CPUPlace(), + cpu_saved_mean); + paddle::framework::TensorCopySync(*saved_var, platform::CPUPlace(), + cpu_saved_var); // reserved_space will stay on GPU and used in grad op. saved_reserve_space->ShareDataWith(*reserve_space); } @@ -226,12 +228,12 @@ void ComputeFusedBNAddReluForward(const platform::CUDADeviceContext &ctx, scope.Var("ReserveSpace")->GetMutable(); auto place = ctx.GetPlace(); - TensorCopySync(cpu_x, place, x); - TensorCopySync(cpu_z, place, z); - TensorCopySync(cpu_scale, place, scale); - TensorCopySync(cpu_bias, place, bias); - TensorCopySync(*cpu_mean, place, mean); - TensorCopySync(*cpu_var, place, var); + paddle::framework::TensorCopySync(cpu_x, place, x); + paddle::framework::TensorCopySync(cpu_z, place, z); + paddle::framework::TensorCopySync(cpu_scale, place, scale); + paddle::framework::TensorCopySync(cpu_bias, place, bias); + paddle::framework::TensorCopySync(*cpu_mean, place, mean); + paddle::framework::TensorCopySync(*cpu_var, place, var); int64_t channels = x->dims()[3]; scale->Resize({channels}); @@ -253,11 +255,13 @@ void ComputeFusedBNAddReluForward(const platform::CUDADeviceContext &ctx, attrs); op->Run(scope, ctx.GetPlace()); - TensorCopySync(*y, platform::CPUPlace(), cpu_y); - TensorCopySync(*mean, platform::CPUPlace(), cpu_mean); - TensorCopySync(*var, platform::CPUPlace(), cpu_var); - TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean); - TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var); + paddle::framework::TensorCopySync(*y, platform::CPUPlace(), cpu_y); + paddle::framework::TensorCopySync(*mean, platform::CPUPlace(), cpu_mean); + paddle::framework::TensorCopySync(*var, platform::CPUPlace(), cpu_var); + paddle::framework::TensorCopySync(*saved_mean, platform::CPUPlace(), + cpu_saved_mean); + paddle::framework::TensorCopySync(*saved_var, platform::CPUPlace(), + cpu_saved_var); // reserved_space will stay on GPU and used in grad op. 
saved_reserve_space->ShareDataWith(*reserve_space); } @@ -285,13 +289,13 @@ void ComputeFusedBNAddReluBackward( auto *dbias = scope.Var("Bias@GRAD")->GetMutable(); auto place = ctx.GetPlace(); - TensorCopySync(cpu_x, place, x); - TensorCopySync(cpu_y, place, y); - TensorCopySync(cpu_dy, place, dy); - TensorCopySync(cpu_scale, place, scale); - TensorCopySync(cpu_bias, place, bias); - TensorCopySync(cpu_saved_mean, place, saved_mean); - TensorCopySync(cpu_saved_var, place, saved_var); + paddle::framework::TensorCopySync(cpu_x, place, x); + paddle::framework::TensorCopySync(cpu_y, place, y); + paddle::framework::TensorCopySync(cpu_dy, place, dy); + paddle::framework::TensorCopySync(cpu_scale, place, scale); + paddle::framework::TensorCopySync(cpu_bias, place, bias); + paddle::framework::TensorCopySync(cpu_saved_mean, place, saved_mean); + paddle::framework::TensorCopySync(cpu_saved_var, place, saved_var); reserve_space->ShareDataWith(saved_reserve_space); int64_t channels = x->dims()[3]; @@ -324,10 +328,10 @@ void ComputeFusedBNAddReluBackward( attrs); op->Run(scope, ctx.GetPlace()); - TensorCopySync(*dx, platform::CPUPlace(), cpu_dx); - TensorCopySync(*dz, platform::CPUPlace(), cpu_dz); - TensorCopySync(*dscale, platform::CPUPlace(), cpu_dscale); - TensorCopySync(*dbias, platform::CPUPlace(), cpu_dbias); + paddle::framework::TensorCopySync(*dx, platform::CPUPlace(), cpu_dx); + paddle::framework::TensorCopySync(*dz, platform::CPUPlace(), cpu_dz); + paddle::framework::TensorCopySync(*dscale, platform::CPUPlace(), cpu_dscale); + paddle::framework::TensorCopySync(*dbias, platform::CPUPlace(), cpu_dbias); } template @@ -527,10 +531,10 @@ class CudnnBNAddReluTester { ComputeSumAndSquareSum(cpu_x, &cpu_sum, &cpu_sum_of_square); auto place = ctx.GetPlace(); - TensorCopySync(cpu_sum, place, sum); - TensorCopySync(cpu_sum_of_square, place, sum_of_square); - TensorCopySync(cpu_bn_scale, place, bn_scale); - TensorCopySync(cpu_bn_bias, place, bn_bias); + paddle::framework::TensorCopySync(cpu_sum, place, sum); + paddle::framework::TensorCopySync(cpu_sum_of_square, place, sum_of_square); + paddle::framework::TensorCopySync(cpu_bn_scale, place, bn_scale); + paddle::framework::TensorCopySync(cpu_bn_bias, place, bn_bias); bn_scale->Resize({1, 1, 1, channels_}); bn_bias->Resize({1, 1, 1, channels_}); @@ -572,9 +576,9 @@ class CudnnBNAddReluTester { framework::Tensor bn_bias_z; auto place = ctx.GetPlace(); - TensorCopySync(cpu_x_, place, &x); + paddle::framework::TensorCopySync(cpu_x_, place, &x); if (fuse_add_ || has_shortcut_) { - TensorCopySync(cpu_z_, place, &z); + paddle::framework::TensorCopySync(cpu_z_, place, &z); } framework::Tensor mean_x; @@ -595,12 +599,12 @@ class CudnnBNAddReluTester { framework::Tensor bitmask; InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x); - TensorCopySync(*cpu_mean_x, place, &mean_x); - TensorCopySync(*cpu_var_x, place, &var_x); + paddle::framework::TensorCopySync(*cpu_mean_x, place, &mean_x); + paddle::framework::TensorCopySync(*cpu_var_x, place, &var_x); if (has_shortcut_) { InitMeanVar(cpu_mean_z, cpu_var_z, cpu_saved_mean_z, cpu_saved_var_z); - TensorCopySync(*cpu_mean_z, place, &mean_z); - TensorCopySync(*cpu_var_z, place, &var_z); + paddle::framework::TensorCopySync(*cpu_mean_z, place, &mean_z); + paddle::framework::TensorCopySync(*cpu_var_z, place, &var_z); } // 1. 
BN Stats Finalize @@ -634,18 +638,24 @@ class CudnnBNAddReluTester { sbar_op.Forward(ctx, x, equiv_scale_x, equiv_bias_x, &z, &equiv_scale_z, &equiv_bias_z, &y, &bitmask); - TensorCopySync(mean_x, platform::CPUPlace(), cpu_mean_x); - TensorCopySync(var_x, platform::CPUPlace(), cpu_var_x); - TensorCopySync(saved_mean_x, platform::CPUPlace(), cpu_saved_mean_x); - TensorCopySync(saved_var_x, platform::CPUPlace(), cpu_saved_var_x); + paddle::framework::TensorCopySync(mean_x, platform::CPUPlace(), cpu_mean_x); + paddle::framework::TensorCopySync(var_x, platform::CPUPlace(), cpu_var_x); + paddle::framework::TensorCopySync(saved_mean_x, platform::CPUPlace(), + cpu_saved_mean_x); + paddle::framework::TensorCopySync(saved_var_x, platform::CPUPlace(), + cpu_saved_var_x); if (has_shortcut_) { - TensorCopySync(mean_z, platform::CPUPlace(), cpu_mean_z); - TensorCopySync(var_z, platform::CPUPlace(), cpu_var_z); - TensorCopySync(saved_mean_z, platform::CPUPlace(), cpu_saved_mean_z); - TensorCopySync(saved_var_z, platform::CPUPlace(), cpu_saved_var_z); + paddle::framework::TensorCopySync(mean_z, platform::CPUPlace(), + cpu_mean_z); + paddle::framework::TensorCopySync(var_z, platform::CPUPlace(), cpu_var_z); + paddle::framework::TensorCopySync(saved_mean_z, platform::CPUPlace(), + cpu_saved_mean_z); + paddle::framework::TensorCopySync(saved_var_z, platform::CPUPlace(), + cpu_saved_var_z); } - TensorCopySync(y, platform::CPUPlace(), cpu_y); - TensorCopySync(bitmask, platform::CPUPlace(), cpu_bitmask); + paddle::framework::TensorCopySync(y, platform::CPUPlace(), cpu_y); + paddle::framework::TensorCopySync(bitmask, platform::CPUPlace(), + cpu_bitmask); } // Get backward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu @@ -664,13 +674,13 @@ class CudnnBNAddReluTester { framework::Tensor dbias; auto place = ctx.GetPlace(); - TensorCopySync(cpu_dy_, place, &dy); - TensorCopySync(cpu_x_, place, &x); - TensorCopySync(cpu_bn_scale_x_, place, &bn_scale); - TensorCopySync(cpu_bn_bias_x_, place, &bn_bias); - TensorCopySync(cpu_saved_mean_x_, place, &saved_mean); - TensorCopySync(cpu_saved_var_x_, place, &saved_var); - TensorCopySync(cpu_bitmask_, place, &bitmask); + paddle::framework::TensorCopySync(cpu_dy_, place, &dy); + paddle::framework::TensorCopySync(cpu_x_, place, &x); + paddle::framework::TensorCopySync(cpu_bn_scale_x_, place, &bn_scale); + paddle::framework::TensorCopySync(cpu_bn_bias_x_, place, &bn_bias); + paddle::framework::TensorCopySync(cpu_saved_mean_x_, place, &saved_mean); + paddle::framework::TensorCopySync(cpu_saved_var_x_, place, &saved_var); + paddle::framework::TensorCopySync(cpu_bitmask_, place, &bitmask); bn_scale.Resize({1, 1, 1, channels_}); bn_bias.Resize({1, 1, 1, channels_}); @@ -692,10 +702,10 @@ class CudnnBNAddReluTester { sbar_op.Backward(ctx, dy, x, bn_scale, bn_bias, saved_mean, saved_var, &bitmask, &dx, &dz, &dscale, &dbias, eps_); - TensorCopySync(dx, platform::CPUPlace(), cpu_dx); - TensorCopySync(dz, platform::CPUPlace(), cpu_dz); - TensorCopySync(dscale, platform::CPUPlace(), cpu_dscale); - TensorCopySync(dbias, platform::CPUPlace(), cpu_dbias); + paddle::framework::TensorCopySync(dx, platform::CPUPlace(), cpu_dx); + paddle::framework::TensorCopySync(dz, platform::CPUPlace(), cpu_dz); + paddle::framework::TensorCopySync(dscale, platform::CPUPlace(), cpu_dscale); + paddle::framework::TensorCopySync(dbias, platform::CPUPlace(), cpu_dbias); } private: diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc 
index 23983d447e4788..425782d7900b48 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -101,8 +101,8 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, auto *output = scope.Var("Output")->GetMutable(); auto place = ctx.GetPlace(); - TensorCopySync(cpu_input, place, input); - TensorCopySync(cpu_filter, place, filter); + paddle::framework::TensorCopySync(cpu_input, place, input); + paddle::framework::TensorCopySync(cpu_filter, place, filter); framework::AttributeMap attrs; bool use_cudnn = true; @@ -119,7 +119,7 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, {{"Output", {"Output"}}}, attrs); op->Run(scope, ctx.GetPlace()); - TensorCopySync(*output, platform::CPUPlace(), cpu_output); + paddle::framework::TensorCopySync(*output, platform::CPUPlace(), cpu_output); } // Use Paddle conv2d_grad op results as baseline @@ -140,9 +140,9 @@ void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, scope.Var("Filter@GRAD")->GetMutable(); auto place = ctx.GetPlace(); - TensorCopySync(cpu_input, place, input); - TensorCopySync(cpu_filter, place, filter); - TensorCopySync(cpu_output_grad, place, output_grad); + paddle::framework::TensorCopySync(cpu_input, place, input); + paddle::framework::TensorCopySync(cpu_filter, place, filter); + paddle::framework::TensorCopySync(cpu_output_grad, place, output_grad); framework::AttributeMap attrs; bool use_cudnn = true; @@ -172,8 +172,10 @@ void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, attrs); op->Run(scope, ctx.GetPlace()); - TensorCopySync(*input_grad, platform::CPUPlace(), cpu_input_grad); - TensorCopySync(*filter_grad, platform::CPUPlace(), cpu_filter_grad); + paddle::framework::TensorCopySync(*input_grad, platform::CPUPlace(), + cpu_input_grad); + paddle::framework::TensorCopySync(*filter_grad, platform::CPUPlace(), + cpu_filter_grad); } template @@ -313,8 +315,8 @@ class CudnnNormConvolutionTester { framework::Tensor sum_of_square; auto place = ctx.GetPlace(); - TensorCopySync(cpu_input_, place, &input); - TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); + paddle::framework::TensorCopySync(cpu_input_, place, &input); + paddle::framework::TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); output.Resize(framework::make_ddim( {batch_size_, out_height_, out_width_, output_channels_})); @@ -329,9 +331,10 @@ class CudnnNormConvolutionTester { dilation_, group_); conv_op.Forward(ctx, input, filter_nhwc, &output, &sum, &sum_of_square); - TensorCopySync(output, platform::CPUPlace(), cpu_output); - TensorCopySync(sum, platform::CPUPlace(), cpu_sum); - TensorCopySync(sum_of_square, platform::CPUPlace(), cpu_sum_of_square); + paddle::framework::TensorCopySync(output, platform::CPUPlace(), cpu_output); + paddle::framework::TensorCopySync(sum, platform::CPUPlace(), cpu_sum); + paddle::framework::TensorCopySync(sum_of_square, platform::CPUPlace(), + cpu_sum_of_square); } void FusedBackward(const platform::CUDADeviceContext &ctx, @@ -344,9 +347,9 @@ class CudnnNormConvolutionTester { framework::Tensor filter_grad; auto place = ctx.GetPlace(); - TensorCopySync(cpu_input_, place, &input); - TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); - TensorCopySync(cpu_output_grad_, place, &output_grad); + paddle::framework::TensorCopySync(cpu_input_, place, &input); + paddle::framework::TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); + paddle::framework::TensorCopySync(cpu_output_grad_, place, &output_grad); 
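An alternative to qualifying every call site would be a using-declaration at function scope, which restores the unqualified spelling through ordinary lookup rather than ADL; the diff opts for explicit qualification instead, plausibly because it keeps the change mechanical and avoids injecting names into widely included headers (a guess, not something the diff states). A sketch of that alternative, again with toy stand-ins:

#include <cassert>

namespace pten {
struct DenseTensor { int v = 0; };
}  // namespace pten

namespace paddle {
namespace framework {
using Tensor = pten::DenseTensor;
inline void TensorCopySync(const Tensor& src, Tensor* dst) { dst->v = src.v; }
}  // namespace framework
}  // namespace paddle

void Caller() {
  // Re-enable the unqualified spelling via ordinary lookup, not ADL.
  using paddle::framework::TensorCopySync;
  paddle::framework::Tensor src, dst;
  src.v = 42;
  TensorCopySync(src, &dst);  // found through the using-declaration
  assert(dst.v == 42);
}

int main() { Caller(); }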
input_grad.Resize(input.dims()); filter_grad.Resize(filter_nhwc.dims()); @@ -360,8 +363,10 @@ class CudnnNormConvolutionTester { conv_grad_op.Backward(ctx, input, filter_nhwc, output_grad, &input_grad, &filter_grad); - TensorCopySync(input_grad, platform::CPUPlace(), cpu_input_grad); - TensorCopySync(filter_grad, platform::CPUPlace(), cpu_filter_grad); + paddle::framework::TensorCopySync(input_grad, platform::CPUPlace(), + cpu_input_grad); + paddle::framework::TensorCopySync(filter_grad, platform::CPUPlace(), + cpu_filter_grad); } private: diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 066e7e15e88312..8c080f97cba82c 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -119,7 +119,8 @@ class FMHARef { ins.emplace_back(src_mask_tensor); outs.emplace_back(src_mask_out_tensor); int elewise_add_axis = -1; - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( dev_ctx_, ins, &outs, elewise_add_axis, AddFunctor()); SoftmaxForwardCUDAKernelDriver(dev_ctx_, *src_mask_out_tensor, diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 173ef48b83dc2d..581fc45e268c2c 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -494,7 +494,8 @@ class FusedAttentionGradKernel : public framework::OpKernel { ins.emplace_back(d_x); outs.emplace_back(d_x); int elewise_add_axis = -1; - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( ctx.cuda_device_context(), ins, &outs, elewise_add_axis, AddFunctor()); } diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index a241e3c3027250..934ce78e715bbe 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -308,7 +308,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { ins[1] = d_x; outs[0] = d_x; int elewise_add_axis = -1; - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( ctx, ins, &outs, elewise_add_axis, AddFunctor()); } diff --git a/paddle/fluid/operators/fused/fusion_group_op_test.cc b/paddle/fluid/operators/fused/fusion_group_op_test.cc index 55b4dce4929b8c..e49a71cf720be5 100644 --- a/paddle/fluid/operators/fused/fusion_group_op_test.cc +++ b/paddle/fluid/operators/fused/fusion_group_op_test.cc @@ -110,7 +110,8 @@ void CheckOutputs(framework::Scope* scope, for (size_t j = 0; j < output_names.size(); ++j) { auto* var = scope->Var(output_names[j]); const auto& dev_tensor = var->Get(); - TensorCopySync(dev_tensor, platform::CPUPlace(), &(cpu_outputs[j])); + paddle::framework::TensorCopySync(dev_tensor, platform::CPUPlace(), + &(cpu_outputs[j])); cpu_tensors->at(num_inputs + j) .mutable_data(dev_tensor.dims(), platform::CPUPlace()); @@ -159,7 +160,7 @@ void TestMain(const std::vector& input_names, SetupRandomCPUTensor(&(cpu_tensors[i]), input_shapes[i]); framework::Tensor* dev_tensor = CreateTensor(&scope, place, input_names[i], input_shapes[i]); - TensorCopySync(cpu_tensors[i], place, dev_tensor); + paddle::framework::TensorCopySync(cpu_tensors[i], place, dev_tensor); } // Create output tensors. 
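Note that `LaunchElementwiseCudaKernel` lives in `paddle::operators` itself, so ordinary lookup still finds it at these call sites and the explicit `paddle::operators::` prefix looks redundant. One plausible reason, though the diff does not say so, is that ADL on `pten::DenseTensor*` arguments can pull a same-named entry point from namespace `pten` into overload resolution and make the unqualified call ambiguous. A toy program showing that failure mode (all names here are hypothetical):

#include <vector>

namespace pten {
struct DenseTensor {};
template <typename T>
void LaunchKernel(const std::vector<DenseTensor*>&) {}  // pten-side entry
}  // namespace pten

namespace paddle {
namespace operators {
using Tensor = pten::DenseTensor;
template <typename T>
void LaunchKernel(const std::vector<Tensor*>&) {}  // fluid-side entry

void Caller() {
  std::vector<Tensor*> ins;
  // LaunchKernel<float>(ins);            // error: ambiguous, because ADL on
  //                                      // vector<pten::DenseTensor*> also
  //                                      // finds pten::LaunchKernel
  paddle::operators::LaunchKernel<float>(ins);  // qualification disambiguates
}
}  // namespace operators
}  // namespace paddle

int main() { paddle::operators::Caller(); }

Either way, qualifying the call pins down exactly one candidate, which is all these hunks need.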
std::vector empty_shape; diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc index 31e19d8f600c39..f50c4f5528e741 100644 --- a/paddle/fluid/operators/gather_op_npu_test.cc +++ b/paddle/fluid/operators/gather_op_npu_test.cc @@ -54,7 +54,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, } // [[1, 2],[3, 4],[5, 6]] - TensorFromVector(init_x, ctx, tensor_x); + paddle::framework::TensorFromVector(init_x, ctx, tensor_x); tensor_x->Resize(paddle::framework::make_ddim({3, 2})); std::vector init_index = {1, 2}; @@ -75,7 +75,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, op->Run(*scope, place); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); @@ -114,11 +114,11 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, tensor_index->Resize(paddle::framework::make_ddim({2})); std::vector init_x = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - TensorFromVector(init_x, ctx, tensor_x); + paddle::framework::TensorFromVector(init_x, ctx, tensor_x); tensor_x->Resize(paddle::framework::make_ddim({3, 2})); std::vector init_dout = {5.0, 10.0, 2.0, 3.0}; - TensorFromVector(init_dout, ctx, tensor_dout); + paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout); tensor_dout->Resize(paddle::framework::make_ddim({2, 2})); ctx.Wait(); @@ -136,7 +136,7 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, op->Run(*scope, place); std::vector dx_vec; - TensorToVector(*tensor_dx, ctx, &dx_vec); + paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec); ctx.Wait(); diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu index 2f72fbff2668b7..da7ed05ddf55c9 100644 --- a/paddle/fluid/operators/gelu_op.cu +++ b/paddle/fluid/operators/gelu_op.cu @@ -219,10 +219,12 @@ class GeluKernel } } #endif - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor()); } else { - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor()); } } @@ -291,10 +293,12 @@ class GeluGradKernel } } #endif - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor()); } else { - LaunchElementwiseCudaKernel( + paddle::operators::LaunchElementwiseCudaKernel( dev_ctx, ins, &outs, 0, GeluWithoutApproximateGradFunctor()); } } diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc index 830dcd59839015..f47250c96817a7 100644 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ b/paddle/fluid/operators/gelu_op_npu_test.cc @@ -46,7 +46,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { init_x.push_back(static_cast(1.0)); } - TensorFromVector(init_x, ctx, tensor_x); + paddle::framework::TensorFromVector(init_x, ctx, tensor_x); tensor_x->Resize({10, 10}); auto out = scope->Var("Out"); @@ -82,7 +82,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { // eval value std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); float expected = 0.841192; for (uint32_t i = 0; i < out_vec.size(); i++) { @@ -108,9 +108,9 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { init_x.push_back(static_cast(1.0)); } - TensorFromVector(init_dout, ctx, tensor_dout); + 
paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout); tensor_dout->Resize({10, 10}); - TensorFromVector(init_x, ctx, tensor_x); + paddle::framework::TensorFromVector(init_x, ctx, tensor_x); tensor_x->Resize({10, 10}); auto dx = scope->Var("DX"); @@ -147,7 +147,7 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { // eval value std::vector dx_vec; - TensorToVector(*tensor_dx, ctx, &dx_vec); + paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec); float expected = 1.082964; for (uint32_t i = 0; i < dx_vec.size(); i++) { diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index 080dadeacaae71..73fc79004b97cb 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -18,11 +18,9 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/group_norm_op_npu.cc b/paddle/fluid/operators/group_norm_op_npu.cc index 3fd4e27de3251b..08f3169c283097 100644 --- a/paddle/fluid/operators/group_norm_op_npu.cc +++ b/paddle/fluid/operators/group_norm_op_npu.cc @@ -136,7 +136,7 @@ class GroupNormNPUKernel : public framework::OpKernel { xnorm.Resize({x->dims()[0], x->dims()[3], x->dims()[1], x->dims()[2]}); F.Transpose(x, &xnorm, std::vector{0, 3, 1, 2}); } else { - TensorCopy(*x, platform::NPUPlace(), &xnorm); + paddle::framework::TensorCopy(*x, platform::NPUPlace(), &xnorm); } auto N = xnorm.dims()[0]; auto C = xnorm.dims()[1]; diff --git a/paddle/fluid/operators/gumbel_softmax_op.cu b/paddle/fluid/operators/gumbel_softmax_op.cu index 51d912f451b925..4e91e689fa58c6 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cu +++ b/paddle/fluid/operators/gumbel_softmax_op.cu @@ -98,7 +98,7 @@ struct OneHotGenerator { Tensor input_tensor; input_tensor.mutable_data(Out->dims(), platform::CUDAPlace()); - TensorCopy(*Out, context.GetPlace(), &input_tensor); + paddle::framework::TensorCopy(*Out, context.GetPlace(), &input_tensor); math::set_constant(context, Out, 0.0); OneHotCUDAKernel< T, thread_size><<>>( diff --git a/paddle/fluid/operators/histogram_op.cu b/paddle/fluid/operators/histogram_op.cu index b9419cbcc57b58..2bf259f7d7a7a3 100644 --- a/paddle/fluid/operators/histogram_op.cu +++ b/paddle/fluid/operators/histogram_op.cu @@ -108,8 +108,10 @@ class HistogramCUDAKernel : public framework::OpKernel { input_max_scala.device(*place) = input_x.maximum(); Tensor input_min_cpu, input_max_cpu; - TensorCopySync(input_min_t, platform::CPUPlace(), &input_min_cpu); - TensorCopySync(input_max_t, platform::CPUPlace(), &input_max_cpu); + paddle::framework::TensorCopySync(input_min_t, platform::CPUPlace(), + &input_min_cpu); + paddle::framework::TensorCopySync(input_max_t, platform::CPUPlace(), + &input_max_cpu); output_min = input_min_cpu.data()[0]; output_max = input_max_cpu.data()[0]; diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h index 760d6a63de13ac..39ff7ea40aaa8c 100644 --- a/paddle/fluid/operators/im2sequence_op.h +++ b/paddle/fluid/operators/im2sequence_op.h @@ -53,7 +53,8 @@ class Im2SequenceKernel : public framework::OpKernel { const Tensor* imgrealsize = ctx.Input("Y"); auto out_stride = 
ctx.Attr>("out_stride"); Tensor cpu_shape_tensor; - TensorCopySync(*imgrealsize, platform::CPUPlace(), &cpu_shape_tensor); + paddle::framework::TensorCopySync(*imgrealsize, platform::CPUPlace(), + &cpu_shape_tensor); std::vector imgreal_h; std::vector imgreal_w; std::vector output_height; diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc index bde349b0a33b9d..ca9420c04a2933 100644 --- a/paddle/fluid/operators/increment_op_npu_test.cc +++ b/paddle/fluid/operators/increment_op_npu_test.cc @@ -45,7 +45,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, std::vector init; init.push_back(static_cast(1.0)); - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({1}); ctx.Wait(); @@ -61,7 +61,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, op->Run(*scope, place); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); diff --git a/paddle/fluid/operators/index_sample_op.h b/paddle/fluid/operators/index_sample_op.h index 1e6b4a457ed936..ab03c17fd1a6fc 100644 --- a/paddle/fluid/operators/index_sample_op.h +++ b/paddle/fluid/operators/index_sample_op.h @@ -44,8 +44,10 @@ void IndexSampleInner(const framework::ExecutionContext &context, std::vector input_vec; std::vector index_vec; - TensorToVector(input, context.device_context(), &input_vec); - TensorToVector(index, context.device_context(), &index_vec); + paddle::framework::TensorToVector(input, context.device_context(), + &input_vec); + paddle::framework::TensorToVector(index, context.device_context(), + &index_vec); std::vector res(index_ids_num); for (int i = 0; i < index_ids_num; i++) { @@ -117,8 +119,10 @@ void IndexSampleGradInner(const framework::ExecutionContext &context, LoDTensor *x_grad) { std::vector out_grad_vec; std::vector index_vec; - TensorToVector(out_grad, context.device_context(), &out_grad_vec); - TensorToVector(index, context.device_context(), &index_vec); + paddle::framework::TensorToVector(out_grad, context.device_context(), + &out_grad_vec); + paddle::framework::TensorToVector(index, context.device_context(), + &index_vec); auto index_dims = index.dims(); auto x_grad_dims = x_grad->dims(); diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index 0c90a3869a2a20..baa292319d36e4 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -39,7 +39,7 @@ inline std::vector get_new_shape( tensor->dims())); if (platform::is_gpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(static_cast(*temp.data())); } else { vec_new_shape.push_back(static_cast(*tensor->data())); @@ -55,7 +55,8 @@ inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { auto* new_data = new_data_tensor->data(); framework::Tensor cpu_starts_tensor; if (platform::is_gpu_place(new_data_tensor->place())) { - TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); + paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), + &cpu_starts_tensor); new_data = cpu_starts_tensor.data(); } vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); diff --git a/paddle/fluid/operators/interpolate_op_xpu.cc 
b/paddle/fluid/operators/interpolate_op_xpu.cc index 882edc00f231b6..6d98e0220d3d13 100644 --- a/paddle/fluid/operators/interpolate_op_xpu.cc +++ b/paddle/fluid/operators/interpolate_op_xpu.cc @@ -35,7 +35,7 @@ inline std::vector get_new_shape_xpu( platform::errors::InvalidArgument("shape of dim tensor should be [1]")); if (platform::is_xpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(static_cast(*temp.data())); } else { vec_new_shape.push_back(static_cast(*tensor->data())); @@ -52,7 +52,8 @@ inline std::vector get_new_data_from_tensor_xpu( auto* new_data = new_data_tensor->data(); framework::Tensor cpu_starts_tensor; if (platform::is_xpu_place(new_data_tensor->place())) { - TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); + paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), + &cpu_starts_tensor); new_data = cpu_starts_tensor.data(); } vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h index 0af799eca0c55c..a5afb18b3ff6f4 100644 --- a/paddle/fluid/operators/interpolate_v2_op.h +++ b/paddle/fluid/operators/interpolate_v2_op.h @@ -39,7 +39,7 @@ inline std::vector get_new_shape( tensor->dims())); if (platform::is_gpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(static_cast(*temp.data())); } else { vec_new_shape.push_back(static_cast(*tensor->data())); @@ -55,12 +55,14 @@ inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { auto* new_data = new_data_tensor->data(); framework::Tensor cpu_starts_tensor; if (platform::is_gpu_place(new_data_tensor->place())) { - TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); + paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), + &cpu_starts_tensor); new_data = cpu_starts_tensor.data(); } #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(new_data_tensor->place())) { - TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); + paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), + &cpu_starts_tensor); new_data = cpu_starts_tensor.data(); } #endif diff --git a/paddle/fluid/operators/interpolate_v2_op_xpu.cc b/paddle/fluid/operators/interpolate_v2_op_xpu.cc index c960f9a58be07f..33f49297a16893 100644 --- a/paddle/fluid/operators/interpolate_v2_op_xpu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_xpu.cc @@ -34,7 +34,7 @@ inline std::vector get_new_shape_xpu( tensor->dims(), framework::make_ddim({1}), platform::errors::InvalidArgument("shape of dim tensor should be [1]")); framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(static_cast(*temp.data())); } @@ -46,7 +46,8 @@ inline std::vector get_new_data_from_tensor_xpu( const Tensor* new_data_tensor) { std::vector vec_new_data; framework::Tensor cpu_starts_tensor; - TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); + paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), + &cpu_starts_tensor); auto* new_data = cpu_starts_tensor.data(); 
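The other recurring change, visible in the isfinite headers just below and in many files above, swaps the old `class Tensor;` forward declaration for `namespace pten { class DenseTensor; }`. An alias cannot be forward-declared, so headers that only pass the type by pointer or reference must now name the underlying class in its real namespace. A compilable sketch of the constraint:

// A class can be forward-declared in its real namespace:
namespace pten {
class DenseTensor;
}  // namespace pten

// The old spelling cannot survive the refactor: an alias is not a class, and
// declaring "class Tensor;" inside paddle::framework would conflict with
// "using Tensor = pten::DenseTensor;" the moment tensor.h is included.
//
//   namespace paddle { namespace framework { class Tensor; } }  // ill-formed now

// Pointer and reference parameters need only the forward declaration:
void Inspect(const pten::DenseTensor* t);

int main() {}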
vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); return vec_new_data; diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index a54134910d0b86..99db1c7e081dad 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -22,11 +22,9 @@ #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/transform.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/isfinite_v2_op.h b/paddle/fluid/operators/isfinite_v2_op.h index 332c50d75513f8..9edf3493b678c9 100644 --- a/paddle/fluid/operators/isfinite_v2_op.h +++ b/paddle/fluid/operators/isfinite_v2_op.h @@ -23,11 +23,9 @@ #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/transform.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/label_smooth_op.cu b/paddle/fluid/operators/label_smooth_op.cu index 2c7a08de0f65b8..7979d3a74bb7d0 100644 --- a/paddle/fluid/operators/label_smooth_op.cu +++ b/paddle/fluid/operators/label_smooth_op.cu @@ -87,8 +87,8 @@ class LabelSmoothGPUKernel : public framework::OpKernel { std::vector ins = {in_t}; std::vector outs = {out_t}; auto functor = LabelSmoothFunctor(epsilon, label_dim); - LaunchSameDimsElementwiseCudaKernel( - dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor); } } }; @@ -107,8 +107,8 @@ class LabelSmoothGradGPUKernel : public framework::OpKernel { std::vector ins = {d_out_t}; std::vector outs = {d_in_t}; auto functor = LabelSmoothGradFunctor(epsilon); - LaunchSameDimsElementwiseCudaKernel( - dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor); } }; } // namespace operators diff --git a/paddle/fluid/operators/lgamma_op.cu b/paddle/fluid/operators/lgamma_op.cu index da40518d9b4b2c..64d1a479627ce1 100644 --- a/paddle/fluid/operators/lgamma_op.cu +++ b/paddle/fluid/operators/lgamma_op.cu @@ -39,8 +39,8 @@ class LgammaKernel std::vector ins = {x}; std::vector outs = {out}; auto functor = CudaLgammaFunctor(); - LaunchSameDimsElementwiseCudaKernel( - dev_ctx, ins, &outs, functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, T, T>(dev_ctx, ins, &outs, functor); } }; diff --git a/paddle/fluid/operators/lite/ut_helper.h b/paddle/fluid/operators/lite/ut_helper.h index 08dd41e7b341b2..210bb7b3f0bc9a 100644 --- a/paddle/fluid/operators/lite/ut_helper.h +++ b/paddle/fluid/operators/lite/ut_helper.h @@ -95,7 +95,7 @@ void RandomizeTensor(framework::LoDTensor* tensor, for (size_t i = 0; i < num_elements; i++) { *(temp_data + i) = random(0., 1.); } - TensorCopySync(temp_tensor, place, tensor); + paddle::framework::TensorCopySync(temp_tensor, place, tensor); } void CreateTensor(framework::Scope* scope, const std::string& name, diff --git a/paddle/fluid/operators/load_combine_op.h b/paddle/fluid/operators/load_combine_op.h index a02b0e61d9278e..3a97b1cd848192 100644 --- a/paddle/fluid/operators/load_combine_op.h +++ b/paddle/fluid/operators/load_combine_op.h @@ -106,7 
+106,7 @@ class LoadCombineOpKernel : public framework::OpKernel { auto *tensor = out_vars[i]->GetMutable(); // Get data from fin to tensor - DeserializeFromStream(*buffer, tensor, dev_ctx); + paddle::framework::DeserializeFromStream(*buffer, tensor, dev_ctx); auto in_dtype = tensor->type(); auto out_dtype = diff --git a/paddle/fluid/operators/load_op.h b/paddle/fluid/operators/load_op.h index 3e58a6462d860d..66160695c3d5aa 100644 --- a/paddle/fluid/operators/load_op.h +++ b/paddle/fluid/operators/load_op.h @@ -75,9 +75,10 @@ class LoadOpKernel : public framework::OpKernel { platform::errors::InvalidArgument( "seek witn tensor must great than or equal to 0")); auto shape = ctx.Attr>("shape"); - DeserializeFromStream(fin, tensor, dev_ctx, seek, shape); + paddle::framework::DeserializeFromStream(fin, tensor, dev_ctx, seek, + shape); } else { - DeserializeFromStream(fin, tensor, dev_ctx); + paddle::framework::DeserializeFromStream(fin, tensor, dev_ctx); } auto load_as_fp16 = ctx.Attr("load_as_fp16"); diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index f241caa857a07a..f78c5b9d36187e 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -86,7 +86,7 @@ void SetValueCompute(const framework::ExecutionContext& ctx, // be two ops points to the output in graph: op1 -> output <- set_value. // In this case, we have to find a way to handle the running order of // set_value is what we want. - TensorCopy(*in, place, out); + paddle::framework::TensorCopy(*in, place, out); Tensor slice_tensor(dtype), pad_tensor(dtype); slice_tensor.mutable_data(slice_dims, place); diff --git a/paddle/fluid/operators/masked_select_op_npu.cc b/paddle/fluid/operators/masked_select_op_npu.cc index cb21c687e9982f..828a3b002c20d1 100644 --- a/paddle/fluid/operators/masked_select_op_npu.cc +++ b/paddle/fluid/operators/masked_select_op_npu.cc @@ -60,7 +60,7 @@ class MaskedSelectedNPUKernel : public framework::OpKernel { sum_runner.AddOutput(out_size); sum_runner.AddAttr("keep_dims", false); sum_runner.Run(stream); - TensorToVector(out_size, dev_ctx, &out_size_vec); + paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec); } out->Resize({out_size_vec[0]}); @@ -135,7 +135,7 @@ class MaskedSelectedGradNPUKernel : public framework::OpKernel { sum_runner.AddOutput(out_size); sum_runner.AddAttr("keep_dims", false); sum_runner.Run(stream); - TensorToVector(out_size, dev_ctx, &out_size_vec); + paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec); } Tensor topkv2_out, indices; diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc index c52ba68331580c..410abd265430c9 100644 --- a/paddle/fluid/operators/math/beam_search.cc +++ b/paddle/fluid/operators/math/beam_search.cc @@ -14,10 +14,12 @@ limitations under the License. */ #include "paddle/fluid/operators/math/beam_search.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { -namespace framework { -class Tensor; -} // namespace framework +namespace framework {} // namespace framework namespace platform { class CPUDeviceContext; } // namespace platform diff --git a/paddle/fluid/operators/math/beam_search_npu.cc b/paddle/fluid/operators/math/beam_search_npu.cc index 5aede02263dd53..2d5a3dae33b32f 100644 --- a/paddle/fluid/operators/math/beam_search_npu.cc +++ b/paddle/fluid/operators/math/beam_search_npu.cc @@ -15,10 +15,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/beam_search.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { -namespace framework { -class Tensor; -} // namespace framework +namespace framework {} // namespace framework namespace platform { class NPUDeviceContext; } // namespace platform diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc index 0df06621d9bab7..ec2e9516fcd4b9 100644 --- a/paddle/fluid/operators/math/beam_search_test.cc +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -80,10 +80,10 @@ void TestBeamSearch() { PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores); - TensorCopySync(cpu_ids, *place, &ids); - TensorCopySync(cpu_scores, *place, &scores); - TensorCopySync(cpu_pre_ids, *place, &pre_ids); - TensorCopySync(cpu_pre_scores, *place, &pre_scores); + paddle::framework::TensorCopySync(cpu_ids, *place, &ids); + paddle::framework::TensorCopySync(cpu_scores, *place, &scores); + paddle::framework::TensorCopySync(cpu_pre_ids, *place, &pre_ids); + paddle::framework::TensorCopySync(cpu_pre_scores, *place, &pre_scores); ids.set_lod(cpu_ids.lod()); scores.set_lod(cpu_scores.lod()); @@ -110,10 +110,10 @@ void TestBeamSearch() { cpu_selected_ids = selected_ids; cpu_selected_scores = selected_scores; } else { - TensorCopySync(selected_ids, paddle::platform::CPUPlace(), - &cpu_selected_ids); - TensorCopySync(selected_scores, paddle::platform::CPUPlace(), - &cpu_selected_scores); + paddle::framework::TensorCopySync( + selected_ids, paddle::platform::CPUPlace(), &cpu_selected_ids); + paddle::framework::TensorCopySync( + selected_scores, paddle::platform::CPUPlace(), &cpu_selected_scores); cpu_selected_ids.set_lod(selected_ids.lod()); cpu_selected_scores.set_lod(selected_scores.lod()); } diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 2be7695e6a8c47..f17cc3094f7fc0 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -20,7 +20,6 @@ namespace paddle { namespace framework { class ExecutionContext; -class Tensor; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 45effd404cfb32..2d23f52c0b27b0 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -17,10 +17,12 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { -namespace framework { -class Tensor; -} // namespace framework +namespace framework {} // namespace framework namespace platform { class CPUDeviceContext; struct bfloat16; diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index 0122e6cdeb4744..1400b9d105ce10 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -63,7 +63,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - TensorCopySync(input_tmp, *place, &input); + paddle::framework::TensorCopySync(input_tmp, *place, &input); } output_cfo.mutable_data( {1, filter_size, filter_size, output_height, output_width}, *place); @@ -88,7 +88,8 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output_cfo.data(); } else { - TensorCopySync(output_cfo, paddle::platform::CPUPlace(), &output_tmp); + paddle::framework::TensorCopySync(output_cfo, paddle::platform::CPUPlace(), + &output_tmp); out_cfo_ptr = output_tmp.data(); } for (int i = 0; i < 6; ++i) { @@ -99,7 +100,8 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_ocf_ptr = output_ocf.data(); } else { - TensorCopySync(output_ocf, paddle::platform::CPUPlace(), &output_tmp); + paddle::framework::TensorCopySync(output_ocf, paddle::platform::CPUPlace(), + &output_tmp); out_ocf_ptr = output_tmp.data(); } @@ -120,7 +122,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - TensorCopySync(input_tmp, *place, &input); + paddle::framework::TensorCopySync(input_tmp, *place, &input); } col2im(*context, output_cfo, dilation, stride, padding, &input); @@ -129,7 +131,8 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp); + paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(), + &input_tmp); in_ptr = input_tmp.data(); } for (int i = 0; i < 6; ++i) { @@ -141,7 +144,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - TensorCopySync(input_tmp, *place, &input); + paddle::framework::TensorCopySync(input_tmp, *place, &input); } col2im_ocf(*context, output_ocf, dilation, stride, padding, &input); @@ -149,7 +152,8 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp); + paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(), + &input_tmp); in_ptr = input_tmp.data(); } for (int i = 0; i < 6; ++i) { diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 6ca3abe0f05a57..f2d1e79f03524a 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -284,13 +284,6 @@ struct ElementwiseAddTo { auto& place = *(ctx->eigen_device()); out.device(place) = out + in; } - void operator()(platform::CPUDeviceContext* ctx, const pten::DenseTensor& src, - pten::DenseTensor* dst) { - auto in = pten::EigenVector::Flatten(src); - auto out = pten::EigenVector::Flatten(*dst); - auto& place = *(ctx->eigen_device()); - out.device(place) = out + in; - } }; template struct ElementwiseAddTo; diff --git 
a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 6e2547145cfed2..960453dbe65ddf 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -283,13 +283,6 @@ struct ElementwiseAddTo { auto& place = *(ctx->eigen_device()); out.device(place) = out + in; } - void operator()(platform::CUDADeviceContext* ctx, - const pten::DenseTensor& src, pten::DenseTensor* dst) { - auto in = pten::EigenVector::Flatten(src); - auto out = pten::EigenVector::Flatten(*dst); - auto& place = *(ctx->eigen_device()); - out.device(place) = out + in; - } }; template struct ElementwiseAddTo diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc index f0b41f98dc0cd7..8aaac0295c818d 100644 --- a/paddle/fluid/operators/math/matrix_solve.cu.cc +++ b/paddle/fluid/operators/math/matrix_solve.cu.cc @@ -68,7 +68,7 @@ class MatrixSolveFunctor { Tensor tmp_a(a.type()); tmp_a.Resize(a.dims()); tmp_a.mutable_data(context.GetPlace()); - TensorCopy(a, context.GetPlace(), &tmp_a); + framework::TensorCopy(a, context.GetPlace(), &tmp_a); // copy input B to a temporary tensor tmp_b, and transpose tmp_b, // because cuBlas assumes column-major while Paddle uses row-majar. diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index 491d40d3ae5676..8f533c446026b1 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -14,10 +14,12 @@ limitations under the License. */ #include "paddle/fluid/operators/math/sequence_padding.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { -namespace framework { -class Tensor; -} // namespace framework +namespace framework {} // namespace framework namespace platform { class CPUDeviceContext; } // namespace platform diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index 19c3af03411b8c..65bf77f0d152b9 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -133,7 +133,8 @@ class UnpaddingLoDTensorFunctor { step_width, layout); /* if (!norm_by_times && seq_num == 1UL && pad_seq_len == max_seq_len) { - TensorCopy(pad_tensor, context.GetPlace(), context, seq_tensor); + paddle::framework::TensorCopy(pad_tensor, context.GetPlace(), context, + seq_tensor); seq_tensor->Resize(seq_tensor_dims); return; } diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc index ea31b10c5558f6..6e7aae2ec7d9f6 100644 --- a/paddle/fluid/operators/math/sequence_padding_test.cc +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -42,7 +42,7 @@ void TestSequencePadding(const DeviceContext &context, if (paddle::platform::is_cpu_place(place)) { seq = cpu_seq; } else { - TensorCopySync(cpu_seq, place, &seq); + paddle::framework::TensorCopySync(cpu_seq, place, &seq); seq.set_lod(lod); } @@ -62,7 +62,7 @@ void TestSequencePadding(const DeviceContext &context, if (paddle::platform::is_cpu_place(place)) { pad_value = cpu_pad_value; } else { - TensorCopySync(cpu_pad_value, place, &pad_value); + paddle::framework::TensorCopySync(cpu_pad_value, place, &pad_value); } paddle::operators::math::PaddingLoDTensorFunctor()( @@ -78,7 +78,8 @@ void TestSequencePadding(const DeviceContext &context, if (paddle::platform::is_cpu_place(place)) { cpu_seq_back = 
seq_back; } else { - TensorCopySync(seq_back, paddle::platform::CPUPlace(), &cpu_seq_back); + paddle::framework::TensorCopySync(seq_back, paddle::platform::CPUPlace(), + &cpu_seq_back); cpu_seq_back.set_lod(lod); } diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index 775d8029bfd3ac..38db6b7b7e527b 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -39,7 +39,7 @@ void TestSequencePoolingSum(const DeviceContext &context, if (paddle::platform::is_cpu_place(place)) { out_grad = cpu_out_grad; } else { - TensorCopySync(cpu_out_grad, place, &out_grad); + paddle::framework::TensorCopySync(cpu_out_grad, place, &out_grad); } // construct in_grad @@ -73,7 +73,8 @@ void TestSequencePoolingSum(const DeviceContext &context, if (paddle::platform::is_cpu_place(place)) { cpu_in_grad = in_grad; } else { - TensorCopySync(in_grad, paddle::platform::CPUPlace(), &cpu_in_grad); + paddle::framework::TensorCopySync(in_grad, paddle::platform::CPUPlace(), + &cpu_in_grad); cpu_in_grad.set_lod(in_grad.lod()); } diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc index f4193bb71fabb6..3c2956f20889f5 100644 --- a/paddle/fluid/operators/math/sequence_scale.cc +++ b/paddle/fluid/operators/math/sequence_scale.cc @@ -14,11 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/math/sequence_scale.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/sequence_scale.h b/paddle/fluid/operators/math/sequence_scale.h index c6c84bb55dfa7a..574c2945dc2050 100644 --- a/paddle/fluid/operators/math/sequence_scale.h +++ b/paddle/fluid/operators/math/sequence_scale.h @@ -17,12 +17,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device_context.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/sparse.h b/paddle/fluid/operators/math/sparse.h index 4ac68a3bdc4c66..7a5880bbfe7da1 100644 --- a/paddle/fluid/operators/math/sparse.h +++ b/paddle/fluid/operators/math/sparse.h @@ -20,7 +20,6 @@ namespace paddle { namespace framework { class ExecutionContext; -class Tensor; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index 5a8e7fcc2a76c2..8cd2824465879d 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -85,7 +85,8 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output.data(); } else { - TensorCopySync(output, paddle::platform::CPUPlace(), &output_tmp); + paddle::framework::TensorCopySync(output, paddle::platform::CPUPlace(), + &output_tmp); out_cfo_ptr = output_tmp.data(); } @@ -99,7 +100,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - TensorCopySync(input_tmp, *place, &input); + paddle::framework::TensorCopySync(input_tmp, *place, &input); } paddle::operators::math::Col2VolFunctor col2vol; @@ -109,7 +110,8 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp); + paddle::framework::TensorCopySync(input, paddle::platform::CPUPlace(), + &input_tmp); in_ptr = input_tmp.data(); } diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu index 87c8abc1c432ee..1891a7be24e456 100644 --- a/paddle/fluid/operators/matrix_rank_op.cu +++ b/paddle/fluid/operators/matrix_rank_op.cu @@ -81,7 +81,7 @@ class MatrixRankGPUKernel : public framework::OpKernel { // Must Copy X once, because the gesvdj will destory the content when exit. 
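A few hunks, such as `matrix_solve.cu.cc` above and `matrix_rank_op.cu` here, use the shorter `framework::TensorCopy` spelling rather than the fully qualified name. Both work for the same reason: a qualified name is looked up directly and ADL never participates, so qualifying relative to the enclosing `paddle` namespace is already enough. A toy sketch:

namespace pten {
struct DenseTensor {};
}  // namespace pten

namespace paddle {
namespace framework {
using Tensor = pten::DenseTensor;
inline void TensorCopy(const Tensor& src, int /*place*/, Tensor* dst) {
  *dst = src;
}
}  // namespace framework
}  // namespace paddle

namespace paddle {
namespace operators {
void Caller() {
  framework::Tensor a, b;
  // Qualified relative to the enclosing ::paddle namespace: direct lookup,
  // no ADL involved, so the alias change is harmless here.
  framework::TensorCopy(a, 0, &b);
}
}  // namespace operators
}  // namespace paddle

int main() { paddle::operators::Caller(); }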
Tensor x_tmp; - TensorCopy(*x, context.GetPlace(), &x_tmp); + paddle::framework::TensorCopy(*x, context.GetPlace(), &x_tmp); auto info = memory::Alloc(dev_ctx, sizeof(int) * batches); int* info_ptr = reinterpret_cast(info->ptr()); diff --git a/paddle/fluid/operators/memcpy_d2h_op.h b/paddle/fluid/operators/memcpy_d2h_op.h index 94eed5cf83fee5..fb5610dda70d9b 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.h +++ b/paddle/fluid/operators/memcpy_d2h_op.h @@ -22,9 +22,12 @@ class DeviceContext; } // namespace platform } // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { -class Tensor; class Variable; class SelectedRows; } // namespace framework diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index cc6e771d105ae0..e84dedd9112b74 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -23,9 +23,12 @@ class DeviceContext; } // namespace platform } // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { -class Tensor; class Variable; class SelectedRows; } // namespace framework diff --git a/paddle/fluid/operators/memcpy_op.h b/paddle/fluid/operators/memcpy_op.h index b270d87ad00ea2..d2a081ac3c2ade 100644 --- a/paddle/fluid/operators/memcpy_op.h +++ b/paddle/fluid/operators/memcpy_op.h @@ -25,9 +25,12 @@ class DeviceContext; } // namespace platform } // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { -class Tensor; class Variable; class SelectedRows; } // namespace framework diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index dae598ef64220b..653283b604f072 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -14,10 +14,13 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { class InferShapeContext; -class Tensor; class OpDesc; class Scope; } // namespace framework diff --git a/paddle/fluid/operators/meshgrid_op.h b/paddle/fluid/operators/meshgrid_op.h index e01469f26d74fa..39a62384a2740a 100644 --- a/paddle/fluid/operators/meshgrid_op.h +++ b/paddle/fluid/operators/meshgrid_op.h @@ -94,8 +94,9 @@ class MeshgridKernel : public framework::OpKernel { view_shape[i] = shape[i]; framework::Tensor reshape_ins_tensor; - TensorCopy(*ins[i], context.GetPlace(), context.device_context(), - &reshape_ins_tensor); + paddle::framework::TensorCopy(*ins[i], context.GetPlace(), + context.device_context(), + &reshape_ins_tensor); framework::DDim out_dims_reshape = framework::make_ddim(view_shape); reshape_ins_tensor.Resize(out_dims_reshape); framework::DDim out_dims = framework::make_ddim(shape); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 8630515a9fdafb..0cb074beb60d79 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -16,10 +16,12 @@ #include "paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { -namespace framework { -class Tensor; -} // namespace framework +namespace framework {} // namespace framework namespace platform { class MKLDNNDeviceContext; } // namespace platform diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index bf95ffdc11eccc..07f9183d9f8001 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -15,10 +15,12 @@ limitations under the License. */ #include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { -namespace framework { -class Tensor; -} // namespace framework +namespace framework {} // namespace framework namespace platform { class MKLDNNDeviceContext; } // namespace platform diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 754b46c823b28f..f6a6c6940a79d3 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -17,10 +17,12 @@ limitations under the License. */ #include "paddle/fluid/operators/fc_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { -namespace framework { -class Tensor; -} // namespace framework +namespace framework {} // namespace framework namespace platform { class MKLDNNDeviceContext; } // namespace platform diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index 49c896ef80fcc2..1b9d9b8f31d357 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -17,10 +17,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/mul_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { -namespace framework { -class Tensor; -} // namespace framework +namespace framework {} // namespace framework namespace platform { class MKLDNNDeviceContext; } // namespace platform diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 5a19584ae380b9..0c442f2fe4d596 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -27,10 +27,12 @@ #include "paddle/fluid/operators/sum_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { -namespace framework { -class Tensor; -} // namespace framework +namespace framework {} // namespace framework namespace platform { class CPUDeviceContext; class MKLDNNDeviceContext; diff --git a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc index 6e392bcc75e824..9da3a4c48728e7 100644 --- a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc +++ b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc @@ -49,7 +49,7 @@ void Compare(fw::Scope* scope, const plat::DeviceContext& ctx, for (int64_t i = 0; i < num * num; ++i) { init_x.push_back(static_cast(i - 50)); } - TensorFromVector(init_x, ctx, tensor_x); + paddle::framework::TensorFromVector(init_x, ctx, tensor_x); tensor_x->Resize({num, num}); auto place = ctx.GetPlace(); @@ -80,7 +80,7 @@ void Compare(fw::Scope* scope, const plat::DeviceContext& ctx, // eval value std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); @@ -108,9 +108,9 @@ void CompareGrad(fw::Scope* scope, const plat::DeviceContext& ctx, init_out.push_back(static_cast(i - 50)); } - TensorFromVector(init_dout, ctx, tensor_dout); + paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout); tensor_dout->Resize({num, num}); - TensorFromVector(init_out, ctx, tensor_out); + paddle::framework::TensorFromVector(init_out, ctx, tensor_out); tensor_out->Resize({num, num}); auto dx = scope->Var("DX"); @@ -143,7 +143,7 @@ void CompareGrad(fw::Scope* scope, const plat::DeviceContext& ctx, // eval value std::vector dx_vec; - TensorToVector(*tensor_dx, ctx, &dx_vec); + paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec); ctx.Wait(); diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu index 5a212bcacae50d..0a32ee96fb6938 100644 --- a/paddle/fluid/operators/multiplex_op.cu +++ b/paddle/fluid/operators/multiplex_op.cu @@ -41,7 +41,7 @@ class MultiplexGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); + paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); platform::CUDAPlace place = ctx.GetPlace(); @@ -84,7 +84,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto cols = d_ins[idx]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); + paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = 
ctx.cuda_device_context().stream(); diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu index 3da7a3afcc93dc..2b021748048c76 100644 --- a/paddle/fluid/operators/one_hot_op.cu +++ b/paddle/fluid/operators/one_hot_op.cu @@ -68,7 +68,8 @@ class OneHotCUDAKernel : public framework::OpKernel { auto* depth_tensor = context.Input("depth_tensor"); if (platform::is_gpu_place(depth_tensor->place())) { framework::Tensor temp; - TensorCopySync(*depth_tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*depth_tensor, platform::CPUPlace(), + &temp); depth = *temp.data(); } else { depth = *depth_tensor->data(); diff --git a/paddle/fluid/operators/one_hot_v2_op.cu b/paddle/fluid/operators/one_hot_v2_op.cu index 22eb6c81845d15..115c9460846838 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cu +++ b/paddle/fluid/operators/one_hot_v2_op.cu @@ -69,7 +69,8 @@ class OneHotV2CUDAKernel : public framework::OpKernel { auto* depth_tensor = context.Input("depth_tensor"); if (platform::is_gpu_place(depth_tensor->place())) { framework::Tensor temp; - TensorCopySync(*depth_tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*depth_tensor, platform::CPUPlace(), + &temp); depth = *temp.data(); } else { depth = *depth_tensor->data(); diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu index 3b9cf159f1b6b1..1ef46ef085c5d7 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -176,8 +176,8 @@ class AdamOpCUDAKernel : public framework::OpKernel { "Input(SkipUpdate) size must be 1, but get %d", skip_update_tensor->numel())); std::vector skip_update_vec; - TensorToVector(*skip_update_tensor, ctx.device_context(), - &skip_update_vec); + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); skip_update = skip_update_vec[0]; } // skip_update=true, just copy input to output, and TensorCopy will call diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 233bef22d83af0..bb044b4b4986e3 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -33,11 +33,13 @@ static inline float GetAttrFromTensor(const framework::Tensor* tensor) { const float* tensor_data = tensor->data(); framework::Tensor cpu_tensor; if (platform::is_gpu_place(tensor->place())) { - TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), + &cpu_tensor); tensor_data = cpu_tensor.data(); } if (platform::is_xpu_place(tensor->place())) { - TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), + &cpu_tensor); tensor_data = cpu_tensor.data(); } return tensor_data[0]; @@ -431,8 +433,8 @@ class AdamOpKernel : public framework::OpKernel { "Input(SkipUpdate) size must be 1, but get %d", skip_update_tensor->numel())); std::vector skip_update_vec; - TensorToVector(*skip_update_tensor, ctx.device_context(), - &skip_update_vec); + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); skip_update = skip_update_vec[0]; } // skip_update=true, just copy input to output, and TensorCopy will call diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc index 744fcd3b412c45..c1846f148fd920 100644 --- 
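`GetAttrFromTensor` in the Adam hunks illustrates the pattern behind many of these call sites: an attribute scalar may live on GPU or XPU, so the kernel synchronously copies the one-element tensor to the CPU before dereferencing it. A toy stand-in, with plain vectors playing the role of tensors and the copy standing in for `paddle::framework::TensorCopySync`:

#include <vector>

// illustrative only, not Paddle's signature
float GetAttrFromTensorSketch(const std::vector<float>& tensor,
                              bool on_device) {
  const float* data = tensor.data();
  std::vector<float> cpu_tensor;
  if (on_device) {
    cpu_tensor = tensor;       // "TensorCopySync(*tensor, CPUPlace(), &cpu_tensor)"
    data = cpu_tensor.data();  // only the host copy is safe to dereference
  }
  return data[0];
}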
a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -66,8 +66,8 @@ class AdamNPUKernel : public framework::OpKernel { "Input(SkipUpdate) size must be 1, but get %d", skip_update_tensor->numel())); std::vector skip_update_vec; - TensorToVector(*skip_update_tensor, ctx.device_context(), - &skip_update_vec); + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); skip_update = skip_update_vec[0]; } // skip_update=true, just copy input to output, and TensorCopy will call @@ -239,8 +239,8 @@ class AdamWNPUKernel : public AdamNPUKernel { "Input(SkipUpdate) size must be 1, but get %d", skip_update_tensor->numel())); std::vector skip_update_vec; - TensorToVector(*skip_update_tensor, ctx.device_context(), - &skip_update_vec); + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); skip_update = skip_update_vec[0]; } VLOG(3) << "Skip update" << skip_update; diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc index cb06b06824be15..0a653c40117194 100644 --- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -68,8 +68,8 @@ class AdamOpXPUKernel : public framework::OpKernel { "Input(SkipUpdate) size must be 1, but get %d", skip_update_tensor->numel())); std::vector skip_update_vec; - TensorToVector(*skip_update_tensor, ctx.device_context(), - &skip_update_vec); + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); skip_update = skip_update_vec[0]; } // skip_update=true, just copy input to output, and TensorCopy will call @@ -138,8 +138,10 @@ class AdamOpXPUKernel : public framework::OpKernel { Tensor xpu_beta2_pow; if (beta1_pow.place() == platform::CPUPlace() && beta2_pow.place() == platform::CPUPlace()) { - TensorCopy(beta1_pow, ctx.GetPlace(), dev_ctx, &xpu_beta1_pow); - TensorCopy(beta2_pow, ctx.GetPlace(), dev_ctx, &xpu_beta2_pow); + paddle::framework::TensorCopy(beta1_pow, ctx.GetPlace(), dev_ctx, + &xpu_beta1_pow); + paddle::framework::TensorCopy(beta2_pow, ctx.GetPlace(), dev_ctx, + &xpu_beta2_pow); dev_ctx.Wait(); beta1_pow_ptr = xpu_beta1_pow.template data(); beta2_pow_ptr = xpu_beta2_pow.template data(); diff --git a/paddle/fluid/operators/optimizers/adamw_op.cu b/paddle/fluid/operators/optimizers/adamw_op.cu index 8b152bc67a30bd..a8b16e73dbfffe 100644 --- a/paddle/fluid/operators/optimizers/adamw_op.cu +++ b/paddle/fluid/operators/optimizers/adamw_op.cu @@ -192,8 +192,8 @@ class AdamWOpCUDAKernel : public framework::OpKernel { "Input(SkipUpdate) size must be 1, but get %d", skip_update_tensor->numel())); std::vector skip_update_vec; - TensorToVector(*skip_update_tensor, ctx.device_context(), - &skip_update_vec); + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); skip_update = skip_update_vec[0]; } diff --git a/paddle/fluid/operators/optimizers/adamw_op.h b/paddle/fluid/operators/optimizers/adamw_op.h index 1904db4f7d6116..efd3a2b691f72d 100644 --- a/paddle/fluid/operators/optimizers/adamw_op.h +++ b/paddle/fluid/operators/optimizers/adamw_op.h @@ -177,8 +177,8 @@ class AdamWOpKernel : public AdamOpKernel { "Input(SkipUpdate) size must be 1, but get %d", skip_update_tensor->numel())); std::vector skip_update_vec; - TensorToVector(*skip_update_tensor, ctx.device_context(), - &skip_update_vec); + 
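The XPU Adam hunks pair `TensorCopy` with an immediate `dev_ctx.Wait()` because, as the explicit wait suggests, the copy is queued asynchronously on the device stream and the `beta*_pow` pointers may not hold valid data until the wait returns. A rough analogue with a background task standing in for the stream (the asynchrony here is an assumption drawn from that call pattern):

#include <future>
#include <vector>

int main() {
  std::vector<float> beta1_pow{0.9f}, xpu_beta1_pow(1);
  // plays the role of TensorCopy(beta1_pow, place, dev_ctx, &xpu_beta1_pow)
  auto copy = std::async(std::launch::async,
                         [&] { xpu_beta1_pow[0] = beta1_pow[0]; });
  copy.wait();  // plays the role of dev_ctx.Wait()
  return xpu_beta1_pow[0] == 0.9f ? 0 : 1;  // data is only now safe to read
}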
paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); skip_update = skip_update_vec[0]; } VLOG(3) << "Skip update" << skip_update; diff --git a/paddle/fluid/operators/optimizers/adamw_op_xpu.cc b/paddle/fluid/operators/optimizers/adamw_op_xpu.cc index c20bd6a9fadc0e..56fa11d2b08576 100644 --- a/paddle/fluid/operators/optimizers/adamw_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adamw_op_xpu.cc @@ -68,8 +68,8 @@ class AdamwOpXPUKernel : public framework::OpKernel { "Input(SkipUpdate) size must be 1, but get %d", skip_update_tensor->numel())); std::vector skip_update_vec; - TensorToVector(*skip_update_tensor, ctx.device_context(), - &skip_update_vec); + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); skip_update = skip_update_vec[0]; } auto& dev_ctx = ctx.template device_context(); @@ -129,8 +129,10 @@ class AdamwOpXPUKernel : public framework::OpKernel { Tensor xpu_beta2_pow; if (beta1_pow.place() == platform::CPUPlace() && beta2_pow.place() == platform::CPUPlace()) { - TensorCopy(beta1_pow, ctx.GetPlace(), dev_ctx, &xpu_beta1_pow); - TensorCopy(beta2_pow, ctx.GetPlace(), dev_ctx, &xpu_beta2_pow); + paddle::framework::TensorCopy(beta1_pow, ctx.GetPlace(), dev_ctx, + &xpu_beta1_pow); + paddle::framework::TensorCopy(beta2_pow, ctx.GetPlace(), dev_ctx, + &xpu_beta2_pow); dev_ctx.Wait(); beta1_pow_ptr = xpu_beta1_pow.template data(); beta2_pow_ptr = xpu_beta2_pow.template data(); diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc index a3a39e36e8244c..6a962b241fafb5 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc @@ -26,7 +26,8 @@ static inline float GetAttrFromTensor(const framework::Tensor* tensor) { framework::Tensor cpu_tensor; if (platform::is_gpu_place(tensor->place()) || platform::is_xpu_place(tensor->place())) { - TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), + &cpu_tensor); tensor_data = cpu_tensor.data(); } return tensor_data[0]; diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index b2a9ca6f937427..88e94ba039ac27 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -133,8 +133,8 @@ class PnormCUDAKernel : public framework::OpKernel { const auto& cuda_ctx = ctx.template device_context(); - LaunchSameDimsElementwiseCudaKernel>( + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, MT, T, UnsignedPowFunctor>( cuda_ctx, ins, &outs, func); framework::Tensor tmp_y; tmp_y.mutable_data(ndim, ctx.GetPlace()); @@ -145,8 +145,8 @@ class PnormCUDAKernel : public framework::OpKernel { outs = {out_norm}; auto func_inverse = UnsignedPowFunctor(1. / porder); - LaunchSameDimsElementwiseCudaKernel>( + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, MT, T, UnsignedPowFunctor>( cuda_ctx, ins, &outs, func_inverse); } } @@ -215,14 +215,14 @@ class PnormGradCUDAKernel : public framework::OpKernel { std::vector ins = {in_norm}; std::vector outs = {&tmp_norm}; auto pow_functor = PowFunctor(1. 
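The p_norm changes qualify a function template that is invoked with explicit template arguments; writing `paddle::operators::LaunchSameDimsElementwiseCudaKernel<...>` makes the lookup independent of which namespaces the argument types live in. A compilable toy with the same call shape (names are illustrative, not Paddle's):

#include <cstdio>
#include <vector>

namespace ops {
// stand-in for LaunchSameDimsElementwiseCudaKernel
template <int Arity, typename InT, typename OutT>
void LaunchKernel(const std::vector<InT>& in, std::vector<OutT>* out) {
  out->assign(in.begin(), in.end());
  std::printf("launched, arity=%d, n=%zu\n", Arity, out->size());
}
}  // namespace ops

int main() {
  std::vector<float> ins{1.f, 2.f}, outs;
  ops::LaunchKernel<1, float, float>(ins, &outs);  // fully qualified call
}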
- porder); - LaunchSameDimsElementwiseCudaKernel>(cuda_ctx, ins, &outs, - pow_functor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, T, T, PowFunctor>(cuda_ctx, ins, &outs, + pow_functor); ins = {in_x}; outs = {out_dx}; auto unsigned_pow = UnsignedPowFunctor(porder - 1.); - LaunchSameDimsElementwiseCudaKernel>( + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, T, T, UnsignedPowFunctor>( cuda_ctx, ins, &outs, unsigned_pow); const framework::Tensor* tmp_norm_const = &tmp_norm; LaunchReduceGradKernel>( diff --git a/paddle/fluid/operators/pad3d_op_npu.cc b/paddle/fluid/operators/pad3d_op_npu.cc index 37dbebd0762583..12501c9bebfdf6 100644 --- a/paddle/fluid/operators/pad3d_op_npu.cc +++ b/paddle/fluid/operators/pad3d_op_npu.cc @@ -26,7 +26,8 @@ static inline std::vector GetPaddings( std::vector paddings(6); auto* paddings_t = context.Input("Paddings"); if (paddings_t) { - TensorToVector(*paddings_t, context.device_context(), &paddings); + paddle::framework::TensorToVector(*paddings_t, context.device_context(), + &paddings); } else { auto pads = context.Attr>("paddings"); std::copy(pads.begin(), pads.end(), paddings.data()); diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc index cef2993fc30d5f..11ec3c2842b791 100644 --- a/paddle/fluid/operators/print_op.cc +++ b/paddle/fluid/operators/print_op.cc @@ -16,10 +16,13 @@ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/tensor_formatter.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { class InferShapeContext; -class Tensor; class OpDesc; class Scope; } // namespace framework diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index 9eddb03828b5d4..c8b6404830cdac 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -74,7 +74,7 @@ class QrGPUKernel : public framework::OpKernel { context.GetPlace(), size_t(batch_size * m * n * sizeof(math::Real))); // BatchedGeqrf performs computation in-place and 'qr' must be a copy of // input - TensorCopy(x, context.GetPlace(), &qr); + paddle::framework::TensorCopy(x, context.GetPlace(), &qr); // Prepare tau auto tau_dims_vec = framework::vectorize(x_dims); diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc index f2f395314c0cc8..081cafdf67b99b 100644 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ b/paddle/fluid/operators/range_op_npu_test.cc @@ -43,21 +43,21 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, auto tensor_start = start->GetMutable(); std::vector init_start; init_start.push_back(static_cast(1)); - TensorFromVector(init_start, ctx, tensor_start); + paddle::framework::TensorFromVector(init_start, ctx, tensor_start); tensor_start->Resize({1}); auto end = scope->Var("End"); auto tensor_end = end->GetMutable(); std::vector init_end; init_end.push_back(static_cast(10)); - TensorFromVector(init_end, ctx, tensor_end); + paddle::framework::TensorFromVector(init_end, ctx, tensor_end); tensor_end->Resize({1}); auto step = scope->Var("Step"); auto tensor_step = step->GetMutable(); std::vector init_step; init_step.push_back(static_cast(2)); - TensorFromVector(init_step, ctx, tensor_step); + paddle::framework::TensorFromVector(init_step, ctx, tensor_step); tensor_step->Resize({1}); ctx.Wait(); @@ -74,7 +74,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, op->Run(*scope, 
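`HANDLE_DIM` is a macro, which makes the qualification doubly important: a macro body is expanded verbatim at the call site, so an unqualified `ReduceFunctor` would be looked up in whatever namespace the expansion happens to land in. A small demonstration of the hazard:

#include <cstdio>

namespace lib {
inline void Run() { std::printf("lib::Run\n"); }
}  // namespace lib

#define HANDLE_UNQUALIFIED() Run()       // resolves wherever it is expanded
#define HANDLE_QUALIFIED() ::lib::Run()  // always names the intended function

namespace client {
inline void Demo() {
  // HANDLE_UNQUALIFIED();  // error: no client::Run in scope here
  HANDLE_QUALIFIED();       // fine from any namespace
}
}  // namespace client

int main() { client::Demo(); }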
place); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); EXPECT_EQ(static_cast(out_vec.size()), static_cast(5)); diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 7adf7962e1987c..fcfdb8b72ecde6 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -14,10 +14,13 @@ limitations under the License. */ #include "paddle/fluid/operators/recurrent_op.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { class InferShapeContext; -class Tensor; class OpDesc; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.h b/paddle/fluid/operators/reduce_ops/logsumexp_op.h index a478690976bd39..06c9f23dd2c26f 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.h +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.h @@ -23,7 +23,8 @@ namespace operators { #define HANDLE_DIM(NDIM, RDIM) \ if (ndim == NDIM && rdim == RDIM) { \ - ReduceFunctor( \ + paddle::operators::ReduceFunctor( \ context.template device_context(), *input, output, \ axis, keepdim); \ } diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index eb4d4a5c1680ec..25603b07c7ad3a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -37,7 +37,8 @@ namespace operators { #define HANDLE_DIM(NDIM, RDIM) \ if (ndim == NDIM && rdim == RDIM) { \ - ReduceFunctor( \ + paddle::operators::ReduceFunctor( \ context.template device_context(), *input, output, \ dims, keep_dim); \ } @@ -131,7 +132,7 @@ void HandleLargeDim(const framework::ExecutionContext& context, shuffled_input.Resize({unreduced, reduced}); DDim output_dim = output->dims(); output->Resize({unreduced}); - ReduceFunctor( + paddle::operators::ReduceFunctor( context.template device_context(), shuffled_input, output, {1}, keep_dim); output->Resize(output_dim); diff --git a/paddle/fluid/operators/renorm_op.cu b/paddle/fluid/operators/renorm_op.cu index b21b9fde56f247..1d76eaf27e8189 100644 --- a/paddle/fluid/operators/renorm_op.cu +++ b/paddle/fluid/operators/renorm_op.cu @@ -151,8 +151,8 @@ class CUDARenormKernel : public framework::OpKernel { const auto& cuda_ctx = context.template device_context(); - LaunchSameDimsElementwiseCudaKernel>( + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kUnary, MT, T, UnsignedPowFunctor>( cuda_ctx, ins, &outs, func); std::vector reduce_axis = {0, 2}; TensorReduceFunctorImpl>( diff --git a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc index 4ba071032162a4..fe2c38850fba07 100644 --- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc +++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc @@ -15,10 +15,13 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { class LoDRankTable; -class Tensor; class OpDesc; class Scope; } // namespace framework diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 9e343517e3fbf6..01e13cd1dcf836 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -56,7 +56,7 @@ inline std::vector get_new_shape( if (platform::is_gpu_place(tensor->place()) || platform::is_xpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(static_cast(*temp.data())); } else { @@ -410,7 +410,8 @@ class ReshapeKernel { if (platform::is_gpu_place(tensor->place()) || platform::is_xpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), + &temp); pt_vec_shape.push_back( std::move(*(paddle::experimental::MakePtenDenseTensor(temp)))); } else { @@ -424,7 +425,8 @@ class ReshapeKernel { if (platform::is_gpu_place(shape_tensor->place()) || platform::is_xpu_place(shape_tensor->place())) { framework::Tensor temp; - TensorCopySync(*shape_tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*shape_tensor, platform::CPUPlace(), + &temp); pt_shape = paddle::experimental::MakePtenDenseTensor(temp); } else { pt_shape = paddle::experimental::MakePtenDenseTensor(*shape_tensor); diff --git a/paddle/fluid/operators/reverse_op.h b/paddle/fluid/operators/reverse_op.h index bf91e2f57a6676..d5e331e2fe5f69 100644 --- a/paddle/fluid/operators/reverse_op.h +++ b/paddle/fluid/operators/reverse_op.h @@ -66,7 +66,7 @@ class ReverseKernel : public framework::OpKernel { auto* out_tensor = &out_array->at(out_offset); out_tensor->set_lod(x_tensor.lod()); - TensorCopy(x_tensor, context.GetPlace(), out_tensor); + paddle::framework::TensorCopy(x_tensor, context.GetPlace(), out_tensor); } return; } diff --git a/paddle/fluid/operators/roll_op.h b/paddle/fluid/operators/roll_op.h index affb5f226ed555..413c7bcfc15eb1 100644 --- a/paddle/fluid/operators/roll_op.h +++ b/paddle/fluid/operators/roll_op.h @@ -100,7 +100,8 @@ class RollKernel : public framework::OpKernel { std::vector dims = context.Attr>("axis"); std::vector out_vec; - TensorToVector(input, context.device_context(), &out_vec); + paddle::framework::TensorToVector(input, context.device_context(), + &out_vec); size_t nums = shifts.size(); DDim input_dim = input.dims(); @@ -143,7 +144,8 @@ class RollGradKernel : public framework::OpKernel { std::vector dims = context.Attr>("axis"); std::vector out_vec; - TensorToVector(input, context.device_context(), &out_vec); + paddle::framework::TensorToVector(input, context.device_context(), + &out_vec); size_t nums = shifts.size(); DDim input_dim = input.dims(); diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index da7c8c607a92a9..4faa23b6c16048 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -30,7 +30,8 @@ static inline T GetAttrFromTensor(const framework::Tensor* tensor) { framework::Tensor cpu_tensor; if (platform::is_gpu_place(tensor->place()) || platform::is_npu_place(tensor->place())) { - TensorCopySync(*tensor, platform::CPUPlace(), 
&cpu_tensor); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), + &cpu_tensor); tensor_data = cpu_tensor.data(); } return tensor_data[0]; diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h index 2ce0b02d437b77..a9de2d683fee92 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h @@ -86,7 +86,8 @@ class SequenceMaskKernel : public framework::OpKernel { "But received Input(MaxLenTensor) is NULL")); if (platform::is_gpu_place(max_len_tensor->place())) { framework::Tensor temp; - TensorCopySync(*max_len_tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*max_len_tensor, platform::CPUPlace(), + &temp); maxlen = *temp.data(); } else { maxlen = *max_len_tensor->data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc index a69961afe02214..675ea175a16a96 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc @@ -36,7 +36,8 @@ class SequenceMaskNPUKernel : public framework::OpKernel { "Input(MaxLenTensor) should not be NULL." "But received Input(MaxLenTensor) is NULL")); framework::Tensor temp; - TensorCopySync(*max_len_tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*max_len_tensor, platform::CPUPlace(), + &temp); maxlen = *temp.data(); PADDLE_ENFORCE_GT( maxlen, 0, diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 71eb03895404d2..1580ef140ada1c 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -233,7 +233,7 @@ class SetValueKernel : public framework::OpKernel { // be two ops points to the output in graph: op1 -> output <- set_value. // In this case, we have to find a way to handle the running order of // set_value is what we want. 
- TensorCopy(*in, place, out); + paddle::framework::TensorCopy(*in, place, out); Tensor slice_tensor(dtype), pad_tensor(dtype); slice_tensor.mutable_data(slice_dims, place); @@ -441,7 +441,7 @@ class SetValueGradKernel : public framework::OpKernel { if (grad_input) { // Set gradient of `Input` - TensorCopy(*in, context.GetPlace(), grad_input); + paddle::framework::TensorCopy(*in, context.GetPlace(), grad_input); auto grad_input_t = framework::EigenTensor { slice_dims_for_assign = framework::make_ddim(slice_dims_with_none); } - TensorCopy(*in, ctx.GetPlace(), out); + paddle::framework::TensorCopy(*in, ctx.GetPlace(), out); auto starts_indices = std::vector(in_dims.size(), 0); auto ends_indices = std::vector(in_dims.size(), 0); diff --git a/paddle/fluid/operators/size_op.h b/paddle/fluid/operators/size_op.h index e8c53d6e683305..8840fde287d662 100644 --- a/paddle/fluid/operators/size_op.h +++ b/paddle/fluid/operators/size_op.h @@ -36,7 +36,7 @@ class SizeKernel : public framework::OpKernel { auto cpu_data = cpu_tensor.mutable_data(out_t->dims(), cpu_place); cpu_data[0] = in_t->numel(); - TensorCopy(cpu_tensor, place, out_t); + paddle::framework::TensorCopy(cpu_tensor, place, out_t); } } }; diff --git a/paddle/fluid/operators/size_op_npu.cc b/paddle/fluid/operators/size_op_npu.cc index 08b73c2040a626..5826d2b4a8742b 100644 --- a/paddle/fluid/operators/size_op_npu.cc +++ b/paddle/fluid/operators/size_op_npu.cc @@ -30,8 +30,9 @@ class SizeNPUKernel : public framework::OpKernel { auto cpu_data = cpu_tensor.mutable_data(out->dims(), platform::CPUPlace()); cpu_data[0] = x->numel(); - TensorCopy(cpu_tensor, ctx.GetPlace(), - ctx.template device_context(), out); + paddle::framework::TensorCopy( + cpu_tensor, ctx.GetPlace(), + ctx.template device_context(), out); ctx.template device_context().Wait(); } }; diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index 658939a91f39a7..15d52880ed9ca1 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -59,7 +59,7 @@ inline void DealTensorArray(const framework::ExecutionContext& ctx, auto in_tensor = in_array->at(i + start); out_tensor->set_lod(in_tensor.lod()); if (in_tensor.memory_size() > 0) { - TensorCopy(in_tensor, ctx.GetPlace(), out_tensor); + paddle::framework::TensorCopy(in_tensor, ctx.GetPlace(), out_tensor); } else { VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " "nothing has been written to output array[" @@ -69,7 +69,7 @@ inline void DealTensorArray(const framework::ExecutionContext& ctx, } else { auto out = ctx.Output("Out"); auto in_tensor = in_array->at(start); - TensorCopy(in_tensor, ctx.GetPlace(), out); + paddle::framework::TensorCopy(in_tensor, ctx.GetPlace(), out); } } @@ -309,12 +309,13 @@ class SliceGradKernel : public framework::OpKernel { ctx.Input(framework::GradVarName("Out")); int d_out_size = d_out_arr->size(); for (int i = 0; i < d_out_size; ++i) { - TensorCopy(d_out_arr->at(i), ctx.GetPlace(), - &(d_in_arr->at(start + i))); + paddle::framework::TensorCopy(d_out_arr->at(i), ctx.GetPlace(), + &(d_in_arr->at(start + i))); } } else { auto* d_out = ctx.Input(framework::GradVarName("Out")); - TensorCopy(*d_out, ctx.GetPlace(), &(d_in_arr->at(start))); + paddle::framework::TensorCopy(*d_out, ctx.GetPlace(), + &(d_in_arr->at(start))); } return; } diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index d20b3ac04bf95c..8e9e077b845cea 100644 --- 
a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -44,7 +44,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { init.push_back(static_cast(i)); } - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({2, 3}); ctx.Wait(); @@ -70,7 +70,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); for (int i = 0; i < static_cast(out_vec.size()); ++i) { VLOG(3) << "out_vec[" << i << "] : " << out_vec[i]; @@ -96,7 +96,7 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { out_init.push_back(static_cast(0.4112)); out_init.push_back(static_cast(0.5457)); - TensorFromVector(out_init, ctx, tensor_out); + paddle::framework::TensorFromVector(out_init, ctx, tensor_out); tensor_out->Resize({2, 3}); ctx.Wait(); @@ -109,7 +109,7 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { dout_init.push_back(static_cast(1.0)); } - TensorFromVector(dout_init, ctx, tensor_dout); + paddle::framework::TensorFromVector(dout_init, ctx, tensor_dout); tensor_dout->Resize({2, 3}); ctx.Wait(); @@ -144,7 +144,7 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); std::vector out_vec; - TensorToVector(*tensor_dx, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_dx, ctx, &out_vec); ctx.Wait(); diff --git a/paddle/fluid/operators/spectral_norm_op.h b/paddle/fluid/operators/spectral_norm_op.h index 954edc796914c8..b8a15579e5345a 100644 --- a/paddle/fluid/operators/spectral_norm_op.h +++ b/paddle/fluid/operators/spectral_norm_op.h @@ -138,15 +138,15 @@ class SpectralNormKernel : public framework::OpKernel { for (int i = 0; i < rank; i++) { real_dims.push_back(i); } - TensorCopySync(*weight, ctx.GetPlace(), &weight_mat); + paddle::framework::TensorCopySync(*weight, ctx.GetPlace(), &weight_mat); } weight_mat = weight_mat.Resize({h, w}); Tensor sigma; sigma.mutable_data(weight_mat.dims(), ctx.GetPlace()); Tensor uu, vv; - TensorCopySync(*u, ctx.GetPlace(), &uu); - TensorCopySync(*v, ctx.GetPlace(), &vv); + paddle::framework::TensorCopySync(*u, ctx.GetPlace(), &uu); + paddle::framework::TensorCopySync(*v, ctx.GetPlace(), &vv); CalcMatrixSigmaAndNormWeight( &sigma, &(uu.Resize({h, 1})), &(vv.Resize({w, 1})), &weight_mat, power_iters, eps, ctx); @@ -167,7 +167,8 @@ class SpectralNormKernel : public framework::OpKernel { rank, weight_mat.Resize(framework::make_ddim(real_dims)), out, perm, dev_ctx); } else { - TensorCopySync(weight_mat.Resize(dims), ctx.GetPlace(), out); + paddle::framework::TensorCopySync(weight_mat.Resize(dims), ctx.GetPlace(), + out); } } }; @@ -217,8 +218,9 @@ class SpectralNormGradKernel : public framework::OpKernel { for (int i = 0; i < rank; i++) { real_dims.push_back(i); } - TensorCopySync(*weight, ctx.GetPlace(), &weight_mat); - TensorCopySync(*out_grad, ctx.GetPlace(), &out_grad_mat); + paddle::framework::TensorCopySync(*weight, ctx.GetPlace(), &weight_mat); + paddle::framework::TensorCopySync(*out_grad, ctx.GetPlace(), + &out_grad_mat); } weight_mat = weight_mat.Resize({h, w}); out_grad_mat = out_grad_mat.Resize({h, w}); @@ -226,8 +228,8 @@ class SpectralNormGradKernel : public framework::OpKernel { Tensor sigma; sigma.mutable_data(weight_mat.dims(), ctx.GetPlace()); Tensor uu, vv; - TensorCopySync(*u, ctx.GetPlace(), &uu); - TensorCopySync(*v, ctx.GetPlace(), &vv); + 
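`CalcMatrixSigmaAndNormWeight` itself is not shown in this diff, but the surrounding code (working copies of `u` and `v`, a `power_iters` attribute) matches the usual power-iteration estimate of the largest singular value, which is what spectral normalization divides the weight by. A standalone numeric sketch under that assumption:

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int h = 2, w = 2;
  std::vector<double> W{3, 0, 0, 1};  // singular values 3 and 1
  std::vector<double> u{1, 1}, v(w, 0);
  for (int it = 0; it < 50; ++it) {
    for (int j = 0; j < w; ++j) {  // v = normalize(W^T u)
      v[j] = 0;
      for (int i = 0; i < h; ++i) v[j] += W[i * w + j] * u[i];
    }
    double nv = std::sqrt(v[0] * v[0] + v[1] * v[1]);
    for (double& x : v) x /= nv;
    for (int i = 0; i < h; ++i) {  // u = normalize(W v)
      u[i] = 0;
      for (int j = 0; j < w; ++j) u[i] += W[i * w + j] * v[j];
    }
    double nu = std::sqrt(u[0] * u[0] + u[1] * u[1]);
    for (double& x : u) x /= nu;
  }
  double sigma = 0;  // sigma = u^T W v
  for (int i = 0; i < h; ++i)
    for (int j = 0; j < w; ++j) sigma += u[i] * W[i * w + j] * v[j];
  std::printf("sigma ~= %.4f (expect 3)\n", sigma);
}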
paddle::framework::TensorCopySync(*u, ctx.GetPlace(), &uu); + paddle::framework::TensorCopySync(*v, ctx.GetPlace(), &vv); CalcMatrixSigmaAndNormWeight( &sigma, &(uu.Resize({h, 1})), &(vv.Resize({w, 1})), &weight_mat, power_iters, eps, ctx); @@ -266,7 +268,8 @@ class SpectralNormGradKernel : public framework::OpKernel { rank, weight_grad_mat.Resize(framework::make_ddim(real_dims)), weight_grad, perm, dev_ctx); } else { - TensorCopySync(weight_grad_mat.Resize(dims), ctx.GetPlace(), weight_grad); + paddle::framework::TensorCopySync(weight_grad_mat.Resize(dims), + ctx.GetPlace(), weight_grad); } } }; diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index 0ff622d3299195..9c22fa4797219f 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -15,10 +15,13 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { class InferShapeContext; -class Tensor; class OpDesc; class Scope; } // namespace framework diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc index 1de7ca8c7bdbf4..3f6c43d7af2fe0 100644 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ b/paddle/fluid/operators/squeeze_op_npu_test.cc @@ -50,7 +50,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { init.push_back(static_cast(0.1)); } - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({dim0, dim1, dim2}); ctx.Wait(); @@ -75,7 +75,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { EXPECT_EQ((uint32_t)tensor_out->dims()[1], uint32_t(dim1)); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); for (uint32_t i = 0; i < out_vec.size(); i++) { EXPECT_EQ(out_vec[i], static_cast(0.1)); } diff --git a/paddle/fluid/operators/strided_slice_op.h b/paddle/fluid/operators/strided_slice_op.h index 9eae27cca6840d..eaef9496a92dcf 100644 --- a/paddle/fluid/operators/strided_slice_op.h +++ b/paddle/fluid/operators/strided_slice_op.h @@ -376,7 +376,8 @@ class StridedSliceKernel : public framework::OpKernel { auto* out_tensor = &out_array->at(out_offset); out_tensor->set_lod(in_tensor.lod()); - TensorCopy(in_tensor, context.GetPlace(), out_tensor); + paddle::framework::TensorCopy(in_tensor, context.GetPlace(), + out_tensor); } } else { @@ -608,7 +609,8 @@ class StridedSliceGradKernel : public framework::OpKernel { in_offset)); d_out_tensor->set_lod(in_tensor.lod()); - TensorCopy(in_tensor, context.GetPlace(), d_out_tensor); + paddle::framework::TensorCopy(in_tensor, context.GetPlace(), + d_out_tensor); } else { d_out_tensor->Resize(dim); diff --git a/paddle/fluid/operators/strided_slice_op_npu.cc b/paddle/fluid/operators/strided_slice_op_npu.cc index 6a8726ce3351dc..95c4357f3280b3 100644 --- a/paddle/fluid/operators/strided_slice_op_npu.cc +++ b/paddle/fluid/operators/strided_slice_op_npu.cc @@ -145,12 +145,12 @@ class StridedSliceNPUKernel : public framework::OpKernel { ends_indices_tensor.mutable_data({D}, place); strides_indices_tensor.mutable_data({D}, place); - TensorFromVector(starts_indices_vector, ctx.device_context(), - &starts_indices_tensor); - TensorFromVector(ends_indices_vector, ctx.device_context(), - &ends_indices_tensor); 
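The NPU strided-slice kernel materializes `starts`, `ends`, and `strides` as index tensors; the semantics they encode are the familiar Python-style slice. A 1-D reference for positive strides (negative strides are handled via the `ReverseV2` path shown just below):

#include <cassert>
#include <vector>

std::vector<int> StridedSlice1D(const std::vector<int>& x, int start, int end,
                                int stride) {
  std::vector<int> out;
  for (int i = start; i < end; i += stride) out.push_back(x[i]);
  return out;
}

int main() {
  assert((StridedSlice1D({0, 1, 2, 3, 4, 5, 6}, 1, 6, 2) ==
          std::vector<int>{1, 3, 5}));
}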
- TensorFromVector(strides_indices_vector, ctx.device_context(), - &strides_indices_tensor); + paddle::framework::TensorFromVector( + starts_indices_vector, ctx.device_context(), &starts_indices_tensor); + paddle::framework::TensorFromVector( + ends_indices_vector, ctx.device_context(), &ends_indices_tensor); + paddle::framework::TensorFromVector( + strides_indices_vector, ctx.device_context(), &strides_indices_tensor); auto out_dims_origin = out_dims; if (decrease_axis.size() > 0) { @@ -199,9 +199,9 @@ class StridedSliceNPUKernel : public framework::OpKernel { if (need_reverse) { Tensor out_tmp; out_tmp.mutable_data(out_dims, place); - TensorCopy(*out, place, - ctx.template device_context(), - &out_tmp); + paddle::framework::TensorCopy( + *out, place, ctx.template device_context(), + &out_tmp); Tensor reverse_axis; std::vector reverse_axis_vector; @@ -212,8 +212,8 @@ class StridedSliceNPUKernel : public framework::OpKernel { } reverse_axis.mutable_data( {static_cast(reverse_axis_vector.size())}, place); - TensorFromVector(reverse_axis_vector, ctx.device_context(), - &reverse_axis); + paddle::framework::TensorFromVector(reverse_axis_vector, + ctx.device_context(), &reverse_axis); const auto& runner_reverse = NpuOpRunner("ReverseV2", {out_tmp, reverse_axis}, {*out}); @@ -346,16 +346,20 @@ class StridedSliceGradNPUKernel : public framework::OpKernel { ends_indices_tensor.mutable_data({D}, place); strides_indices_tensor.mutable_data({D}, place); - TensorFromVector(starts_indices_vector, dev_ctx, &starts_indices_tensor); - TensorFromVector(ends_indices_vector, dev_ctx, &ends_indices_tensor); - TensorFromVector(strides_indices_vector, dev_ctx, &strides_indices_tensor); + paddle::framework::TensorFromVector(starts_indices_vector, dev_ctx, + &starts_indices_tensor); + paddle::framework::TensorFromVector(ends_indices_vector, dev_ctx, + &ends_indices_tensor); + paddle::framework::TensorFromVector(strides_indices_vector, dev_ctx, + &strides_indices_tensor); std::vector input_dims_vector; for (int i = 0; i < input_dims.size(); i++) { input_dims_vector.push_back(input_dims[i]); } Tensor input_dims_tensor; - TensorFromVector(input_dims_vector, dev_ctx, &input_dims_tensor); + paddle::framework::TensorFromVector(input_dims_vector, dev_ctx, + &input_dims_tensor); bool need_reverse = false; for (size_t axis = 0; axis < axes.size(); axis++) { @@ -382,7 +386,8 @@ class StridedSliceGradNPUKernel : public framework::OpKernel { } reverse_axis.mutable_data( {static_cast(reverse_axis_vector.size())}, place); - TensorFromVector(reverse_axis_vector, dev_ctx, &reverse_axis); + paddle::framework::TensorFromVector(reverse_axis_vector, dev_ctx, + &reverse_axis); Tensor dout_tmp; dout_tmp.mutable_data(dout->dims(), place); diff --git a/paddle/fluid/operators/sum_op_npu.cc b/paddle/fluid/operators/sum_op_npu.cc index a2e446e1525ad5..ec7ba1d03237bf 100644 --- a/paddle/fluid/operators/sum_op_npu.cc +++ b/paddle/fluid/operators/sum_op_npu.cc @@ -38,7 +38,7 @@ class SumNPUKernel : public framework::OpKernel { int n = static_cast(x.size()); if (n == 1) { - TensorCopy(*x[0], place, out); + paddle::framework::TensorCopy(*x[0], place, out); return; } diff --git a/paddle/fluid/operators/svd_op.cu b/paddle/fluid/operators/svd_op.cu index f17e92e47b7312..e987589e83c19c 100644 --- a/paddle/fluid/operators/svd_op.cu +++ b/paddle/fluid/operators/svd_op.cu @@ -55,7 +55,7 @@ class SvdGPUKernel : public framework::OpKernel { // then view A as n x m and do A^T SVD, we can avoid transpose // Must Copy X once, because the gesvdj will change 
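The "Must Copy X once" comment (continued just below) is about cuSOLVER's `gesvdj`, which, as the comment says, changes the original input matrix; the kernel therefore factorizes a copy so `x` stays intact for later use. The shape of that defensive copy:

#include <cassert>
#include <vector>

void InPlaceFactor(std::vector<double>* a) {  // stands in for gesvdj
  for (double& v : *a) v *= 2;                // overwrites its input
}

int main() {
  std::vector<double> x{1, 2, 3};
  std::vector<double> x_tmp(x);  // "TensorCopy(*x, context.GetPlace(), &x_tmp)"
  InPlaceFactor(&x_tmp);         // x_tmp is consumed, x is preserved
  assert(x[0] == 1 && x_tmp[0] == 2);
}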
the origin input matrix Tensor x_tmp; - TensorCopy(*x, context.GetPlace(), &x_tmp); + paddle::framework::TensorCopy(*x, context.GetPlace(), &x_tmp); auto info = memory::Alloc(dev_ctx, sizeof(int) * batch_count); int* info_ptr = reinterpret_cast(info->ptr()); diff --git a/paddle/fluid/operators/sync_batch_norm_op_npu.cc b/paddle/fluid/operators/sync_batch_norm_op_npu.cc index a89bb0861c9cee..c666d5a11fa312 100644 --- a/paddle/fluid/operators/sync_batch_norm_op_npu.cc +++ b/paddle/fluid/operators/sync_batch_norm_op_npu.cc @@ -49,7 +49,7 @@ void training_or_inference( { common_mean_tile_1.Resize({C}); common_mean_tile_1.mutable_data(place); - TensorCopySync(*common_mean, place, &common_mean_tile_1); + paddle::framework::TensorCopySync(*common_mean, place, &common_mean_tile_1); if (layout == framework::DataLayout::kNCHW) common_mean_tile_1.Resize({1, C, 1, 1}); else if (layout == framework::DataLayout::kNHWC) @@ -70,7 +70,7 @@ void training_or_inference( { common_var_tile_1.Resize({C}); common_var_tile_1.mutable_data(place); - TensorCopySync(*common_var, place, &common_var_tile_1); + paddle::framework::TensorCopySync(*common_var, place, &common_var_tile_1); if (layout == framework::DataLayout::kNCHW) common_var_tile_1.Resize({1, C, 1, 1}); else if (layout == framework::DataLayout::kNHWC) @@ -129,7 +129,7 @@ void training_or_inference( { scale_tile_1.Resize({C}); scale_tile_1.mutable_data(place); - TensorCopySync(*scale, place, &scale_tile_1); + paddle::framework::TensorCopySync(*scale, place, &scale_tile_1); if (layout == framework::DataLayout::kNCHW) scale_tile_1.Resize({1, C, 1, 1}); else if (layout == framework::DataLayout::kNHWC) @@ -159,7 +159,7 @@ void training_or_inference( { bias_tile_1.Resize({C}); bias_tile_1.mutable_data(place); - TensorCopySync(*bias, place, &bias_tile_1); + paddle::framework::TensorCopySync(*bias, place, &bias_tile_1); if (layout == framework::DataLayout::kNCHW) bias_tile_1.Resize({1, C, 1, 1}); else if (layout == framework::DataLayout::kNHWC) @@ -339,11 +339,11 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { if (test_mode) { // inference // cacl saved_mean saved_mean->mutable_data(place); - TensorCopySync(*mean, place, saved_mean); + paddle::framework::TensorCopySync(*mean, place, saved_mean); // cacl saved_variance saved_variance->mutable_data(place); - TensorCopySync(*variance, place, saved_variance); + paddle::framework::TensorCopySync(*variance, place, saved_variance); // cacl y training_or_inference(ctx, stream, place, layout, test_mode, N, C, H, @@ -354,7 +354,8 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { if (ctx.HasInput("MomentumTensor")) { const auto *mom_tensor = ctx.Input("MomentumTensor"); Tensor mom_cpu; - TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); + paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), + &mom_cpu); momentum = mom_cpu.data()[0]; } @@ -417,8 +418,8 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { } std::vector device_count_vec(1); - TensorToVector(device_count_tensor, ctx.device_context(), - &device_count_vec); + paddle::framework::TensorToVector( + device_count_tensor, ctx.device_context(), &device_count_vec); device_counts = device_count_vec[0]; // HcclAllReduce x_sum @@ -560,8 +561,8 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { } std::vector device_count_vec(1); - TensorToVector(device_count_tensor, ctx.device_context(), - &device_count_vec); + paddle::framework::TensorToVector( + device_count_tensor, ctx.device_context(), 
&device_count_vec); device_counts = device_count_vec[0]; PADDLE_ENFORCE_GE(device_counts, 2, platform::errors::PreconditionNotMet( "device_counts should >= 2.")); @@ -626,7 +627,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { { saved_mean_tile_1.Resize({C}); saved_mean_tile_1.mutable_data(place); - TensorCopySync(*saved_mean, place, &saved_mean_tile_1); + paddle::framework::TensorCopySync(*saved_mean, place, &saved_mean_tile_1); if (layout == framework::DataLayout::kNCHW) saved_mean_tile_1.Resize({1, C, 1, 1}); else if (layout == framework::DataLayout::kNHWC) @@ -656,7 +657,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { { var_ref_tile_1.Resize({C}); var_ref_tile_1.mutable_data(place); - TensorCopySync(var_ref, place, &var_ref_tile_1); + paddle::framework::TensorCopySync(var_ref, place, &var_ref_tile_1); if (layout == framework::DataLayout::kNCHW) var_ref_tile_1.Resize({1, C, 1, 1}); else if (layout == framework::DataLayout::kNHWC) @@ -793,7 +794,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { { dy_mean_tile_1.Resize({C}); dy_mean_tile_1.mutable_data(place); - TensorCopySync(dy_mean, place, &dy_mean_tile_1); + paddle::framework::TensorCopySync(dy_mean, place, &dy_mean_tile_1); if (layout == framework::DataLayout::kNCHW) dy_mean_tile_1.Resize({1, C, 1, 1}); else if (layout == framework::DataLayout::kNHWC) @@ -842,8 +843,8 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { { dy_mul_x_sub_mean_mean_tile_1.Resize({C}); dy_mul_x_sub_mean_mean_tile_1.mutable_data(place); - TensorCopySync(dy_mul_x_sub_mean_mean, place, - &dy_mul_x_sub_mean_mean_tile_1); + paddle::framework::TensorCopySync(dy_mul_x_sub_mean_mean, place, + &dy_mul_x_sub_mean_mean_tile_1); if (layout == framework::DataLayout::kNCHW) dy_mul_x_sub_mean_mean_tile_1.Resize({1, C, 1, 1}); else if (layout == framework::DataLayout::kNHWC) @@ -900,7 +901,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { { scale_tile_1.Resize({C}); scale_tile_1.mutable_data(place); - TensorCopySync(*scale, place, &scale_tile_1); + paddle::framework::TensorCopySync(*scale, place, &scale_tile_1); if (layout == framework::DataLayout::kNCHW) scale_tile_1.Resize({1, C, 1, 1}); else if (layout == framework::DataLayout::kNHWC) diff --git a/paddle/fluid/operators/tensor_formatter.cc b/paddle/fluid/operators/tensor_formatter.cc index 558f5f2a3128f4..b65ebee9b2662f 100644 --- a/paddle/fluid/operators/tensor_formatter.cc +++ b/paddle/fluid/operators/tensor_formatter.cc @@ -124,7 +124,7 @@ void TensorFormatter::FormatData(const framework::LoDTensor& print_tensor, data = print_tensor.data(); } else { platform::CPUPlace cpu_place; - TensorCopy(print_tensor, cpu_place, &cpu_tensor); + paddle::framework::TensorCopy(print_tensor, cpu_place, &cpu_tensor); #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(print_tensor.place())) { platform::DeviceContextPool::Instance().Get(print_tensor.place())->Wait(); diff --git a/paddle/fluid/operators/tensor_formatter.h b/paddle/fluid/operators/tensor_formatter.h index 38e3e7a94a5240..5181e65faf50a4 100644 --- a/paddle/fluid/operators/tensor_formatter.h +++ b/paddle/fluid/operators/tensor_formatter.h @@ -18,11 +18,9 @@ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/var_type.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten namespace paddle { namespace operators { diff --git 
a/paddle/fluid/operators/tile_op.h b/paddle/fluid/operators/tile_op.h index 5211d72336124e..7dda2865cd802f 100644 --- a/paddle/fluid/operators/tile_op.h +++ b/paddle/fluid/operators/tile_op.h @@ -35,7 +35,8 @@ inline std::vector get_repeat_times( if (platform::is_gpu_place(repeat_tensor->place()) || platform::is_xpu_place(repeat_tensor->place()) || platform::is_npu_place(repeat_tensor->place())) { - TensorCopySync(*repeat_tensor, platform::CPUPlace(), &cpu_repeat_tensor); + paddle::framework::TensorCopySync(*repeat_tensor, platform::CPUPlace(), + &cpu_repeat_tensor); repeat_data = cpu_repeat_tensor.data(); } auto vec_repeat_times = @@ -54,7 +55,7 @@ inline std::vector get_repeat_times( platform::is_xpu_place(tensor->place()) || platform::is_npu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_repeat_times.push_back(*temp.data()); } else { vec_repeat_times.push_back(*tensor->data()); diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc index 5d7e423590b1cf..843f7620cac44e 100755 --- a/paddle/fluid/operators/top_k_v2_op_npu.cc +++ b/paddle/fluid/operators/top_k_v2_op_npu.cc @@ -41,7 +41,7 @@ class TopkV2NPUKernel : public framework::OpKernel { if (k_tensor != nullptr) { std::vector v_tmp(1); - TensorToVector( + paddle::framework::TensorToVector( *k_tensor, context.template device_context(), &v_tmp); diff --git a/paddle/fluid/operators/transfer_layout_op.h b/paddle/fluid/operators/transfer_layout_op.h index 28135e37ed7bbe..74d086015eeb46 100644 --- a/paddle/fluid/operators/transfer_layout_op.h +++ b/paddle/fluid/operators/transfer_layout_op.h @@ -27,9 +27,12 @@ class DeviceContext; } // namespace platform } // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { -class Tensor; class Variable; } // namespace framework } // namespace paddle @@ -78,7 +81,7 @@ class TransferLayoutFunctor { } else { // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel // Do transform via MKLDNN lib - innerTransDataLayoutFromMKLDNN( + paddle::framework::innerTransDataLayoutFromMKLDNN( in_layout, paddle::platform::MKLDNNDeviceContext::tls() .get_cur_paddle_data_layout(), in_tensor, &out_tensor, dev_ctx_.GetPlace()); diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc index f6712814e1e3b8..91923da819dc5e 100644 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ b/paddle/fluid/operators/transpose_op_npu_test.cc @@ -48,7 +48,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { int dim0 = 2; int dim1 = 3; - TensorFromVector(std::vector({0, 1, 2, 3, 4, 5}), ctx, x_t); + paddle::framework::TensorFromVector(std::vector({0, 1, 2, 3, 4, 5}), ctx, + x_t); ctx.Wait(); x_t->Resize({dim0, dim1}); out_t->Resize({dim0, dim1}); @@ -66,7 +67,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { op->Run(*scope, place); ctx.Wait(); std::vector out_v; - TensorToVector(*out_t, ctx, &out_v); + paddle::framework::TensorToVector(*out_t, ctx, &out_v); ctx.Wait(); EXPECT_EQ(out_t->numel(), dim0 * dim1); @@ -93,7 +94,8 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { int dim1 = 3; auto place = ctx.GetPlace(); - TensorFromVector(std::vector({0, 1, 2, 3, 4, 5}), ctx, out_grad_t); + paddle::framework::TensorFromVector(std::vector({0, 1, 2, 3, 4, 5}), ctx, + out_grad_t); ctx.Wait(); 
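The transpose test below feeds `{0, 1, 2, 3, 4, 5}` as a 2x3 tensor, so the expected output is its 3x2 transpose. A reference implementation of exactly that check:

#include <cassert>
#include <vector>

std::vector<int> Transpose2D(const std::vector<int>& in, int rows, int cols) {
  std::vector<int> out(in.size());
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j) out[j * rows + i] = in[i * cols + j];
  return out;
}

int main() {
  // matches the tensor the NPU test builds: dim0 = 2, dim1 = 3
  assert((Transpose2D({0, 1, 2, 3, 4, 5}, 2, 3) ==
          std::vector<int>{0, 3, 1, 4, 2, 5}));
}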
x_grad_t->Resize({dim0, dim1}); @@ -112,7 +114,7 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { op->Run(*scope, place); ctx.Wait(); std::vector out_v; - TensorToVector(*x_grad_t, ctx, &out_v); + paddle::framework::TensorToVector(*x_grad_t, ctx, &out_v); ctx.Wait(); EXPECT_EQ(x_grad_t->numel(), dim0 * dim1); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc index 8d135e698f204e..f88fefd1c6a78a 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -31,7 +31,8 @@ class TruncatedGaussianRandomNPUKernel : public framework::OpKernel { Tensor shape_tensor(framework::proto::VarType::INT32); shape_tensor.mutable_data({static_cast(shape.size())}, ctx.GetPlace()); - TensorFromVector(shape, ctx.device_context(), &shape_tensor); + paddle::framework::TensorFromVector(shape, ctx.device_context(), + &shape_tensor); float mean = ctx.Attr("mean"); Tensor mean_tensor(framework::proto::VarType::FP32); mean_tensor.mutable_data({1}, ctx.GetPlace()); diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 18a4154be30ac7..f4ae8d82690566 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -29,8 +29,8 @@ inline std::vector GetNewDataFromShapeTensor( auto* new_data = new_data_tensor->data(); framework::Tensor cpu_starts_tensor; if (platform::is_gpu_place(new_data_tensor->place())) { - TensorCopySync(*new_data_tensor, platform::CPUPlace(), - &cpu_starts_tensor); + paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), + &cpu_starts_tensor); new_data = cpu_starts_tensor.data(); } std::vector vec_new_data(new_data, @@ -41,8 +41,8 @@ inline std::vector GetNewDataFromShapeTensor( std::vector vec_new_data; framework::Tensor cpu_starts_tensor; if (platform::is_gpu_place(new_data_tensor->place())) { - TensorCopySync(*new_data_tensor, platform::CPUPlace(), - &cpu_starts_tensor); + paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), + &cpu_starts_tensor); new_data = cpu_starts_tensor.data(); } for (int i = 0; i < new_data_tensor->numel(); ++i) { @@ -73,7 +73,7 @@ inline std::vector GetNewDataFromShapeTensorList( if (tensor->type() == framework::proto::VarType::INT32) { if (platform::is_gpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(static_cast(*temp.data())); } else { vec_new_shape.push_back(static_cast(*tensor->data())); @@ -81,7 +81,7 @@ inline std::vector GetNewDataFromShapeTensorList( } else if (tensor->type() == framework::proto::VarType::INT64) { if (platform::is_gpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(*temp.data()); } else { vec_new_shape.push_back(*tensor->data()); diff --git a/paddle/fluid/operators/unique_consecutive_op.cu b/paddle/fluid/operators/unique_consecutive_op.cu index 1f0023c467c01c..12bd742c9f9b43 100644 --- a/paddle/fluid/operators/unique_consecutive_op.cu +++ b/paddle/fluid/operators/unique_consecutive_op.cu @@ -96,8 +96,10 @@ void IndexSelect(const framework::ExecutionContext& context, std::vector input_vec; std::vector index_vec; - 
TensorToVector(input, context.device_context(), &input_vec); - TensorToVector(index, context.device_context(), &index_vec); + paddle::framework::TensorToVector(input, context.device_context(), + &input_vec); + paddle::framework::TensorToVector(index, context.device_context(), + &index_vec); std::vector out_vec(output->numel()); for (int i = 0; i < index_size; i++) { diff --git a/paddle/fluid/operators/unique_op.cu b/paddle/fluid/operators/unique_op.cu index 87a46e11d9f91b..98cd13a600f205 100644 --- a/paddle/fluid/operators/unique_op.cu +++ b/paddle/fluid/operators/unique_op.cu @@ -119,8 +119,10 @@ void IndexSelect(const framework::ExecutionContext& context, std::vector input_vec; std::vector index_vec; - TensorToVector(input, context.device_context(), &input_vec); - TensorToVector(index, context.device_context(), &index_vec); + paddle::framework::TensorToVector(input, context.device_context(), + &input_vec); + paddle::framework::TensorToVector(index, context.device_context(), + &index_vec); std::vector out_vec(output->numel()); for (int i = 0; i < index_size; i++) { diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc index a145c914a8621b..cf96ef57a4df08 100644 --- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc +++ b/paddle/fluid/operators/unsqueeze_op_npu_test.cc @@ -49,7 +49,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { init.push_back(static_cast(0.1)); } - TensorFromVector(init, ctx, tensor_x); + paddle::framework::TensorFromVector(init, ctx, tensor_x); tensor_x->Resize({dim0, dim1}); ctx.Wait(); @@ -75,7 +75,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { EXPECT_EQ((uint32_t)tensor_out->dims()[2], uint32_t(10)); std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); + paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); for (uint32_t i = 0; i < out_vec.size(); i++) { EXPECT_EQ(out_vec[i], static_cast(0.1)); } diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h index 770369e64f46fd..a413c4a331b65b 100644 --- a/paddle/fluid/operators/utils.h +++ b/paddle/fluid/operators/utils.h @@ -27,7 +27,8 @@ inline std::vector GetDataFromTensor(const framework::Tensor* x) { auto* data = x->data(); framework::Tensor cpu_attr_tensor; if (!platform::is_cpu_place(x->place())) { - TensorCopySync(*x, platform::CPUPlace(), &cpu_attr_tensor); + paddle::framework::TensorCopySync(*x, platform::CPUPlace(), + &cpu_attr_tensor); data = cpu_attr_tensor.data(); } vec_new_data = std::vector(data, data + x->numel()); @@ -35,7 +36,8 @@ inline std::vector GetDataFromTensor(const framework::Tensor* x) { auto* data = x->data(); framework::Tensor cpu_attr_tensor; if (!platform::is_cpu_place(x->place())) { - TensorCopySync(*x, platform::CPUPlace(), &cpu_attr_tensor); + paddle::framework::TensorCopySync(*x, platform::CPUPlace(), + &cpu_attr_tensor); data = cpu_attr_tensor.data(); } // NOTE: Converting int64 to int32 may cause data overflow. 
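That overflow note is worth taking literally: `GetDataFromTensor` narrows `int64_t` attribute data to `int`, which silently wraps for values beyond 2^31 - 1. A checked version of the narrowing, purely illustrative and not part of the patch:

#include <cstdint>
#include <limits>
#include <stdexcept>

int32_t CheckedNarrow(int64_t v) {
  if (v > std::numeric_limits<int32_t>::max() ||
      v < std::numeric_limits<int32_t>::min()) {
    throw std::overflow_error("int64 attribute does not fit in int32");
  }
  return static_cast<int32_t>(v);
}

int main() { return CheckedNarrow(123) == 123 ? 0 : 1; }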
@@ -64,7 +66,7 @@ inline std::vector GetDataFromTensorList( if (tensor->type() == framework::proto::VarType::INT32) { if (!platform::is_cpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_data.push_back(static_cast(*temp.data())); } else { vec_new_data.push_back(static_cast(*tensor->data())); @@ -72,7 +74,7 @@ inline std::vector GetDataFromTensorList( } else if (tensor->type() == framework::proto::VarType::INT64) { if (!platform::is_cpu_place(tensor->place())) { framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); // NOTE: Converting int64 to int32 may cause data overflow. vec_new_data.push_back(static_cast(*temp.data())); } else { diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu index 086ff05b084612..d40d14435a5fd0 100644 --- a/paddle/fluid/operators/viterbi_decode_op.cu +++ b/paddle/fluid/operators/viterbi_decode_op.cu @@ -66,8 +66,9 @@ struct BinaryOperation { const Tensor& rhs, Tensor* output) { std::vector ins{&lhs, &rhs}; std::vector outs{output}; - LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, -1, BinaryFunctor()); + paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, -1, + BinaryFunctor()); } }; @@ -78,8 +79,9 @@ struct GetMask { std::vector ins = {&lhs, &rhs}; std::vector outs = {mask}; auto& dev_ctx = ctx.template device_context(); - LaunchSameDimsElementwiseCudaKernel( - dev_ctx, ins, &outs, CompareFunctor()); + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kBinary, int64_t, T>(dev_ctx, ins, &outs, + CompareFunctor()); } }; diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index 4cce33c3f520f0..56f1d8d97ba618 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -299,7 +299,8 @@ class WarpCTCKernel : public framework::OpKernel { ctx.AllocateTmpTensor(warpctc_logits_dims, dev_ctx); warpctc_logits.ShareDataWith(warpctc_logits_tmp); if (ctx.HasInput("LogitsLength")) { - TensorCopySync(*logits, ctx.GetPlace(), &warpctc_logits); + paddle::framework::TensorCopySync(*logits, ctx.GetPlace(), + &warpctc_logits); } else { LoDTensor cpu_pad_value; T* pad_value_data = @@ -309,7 +310,8 @@ class WarpCTCKernel : public framework::OpKernel { if (platform::is_cpu_place(ctx.GetPlace())) { pad_value = cpu_pad_value; } else { - TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value); + paddle::framework::TensorCopySync(cpu_pad_value, ctx.GetPlace(), + &pad_value); } math::PaddingLoDTensorFunctor()( @@ -361,10 +363,12 @@ class WarpCTCKernel : public framework::OpKernel { ctx.template device_context(), *label, &gpu_label, label->dims()[1] /*pad_seq_len*/, 0 /*lod_level*/, false /*norm_by_times*/, math::kBatchLengthWidth); - TensorCopySync(gpu_label, platform::CPUPlace(), &warpctc_label); + paddle::framework::TensorCopySync(gpu_label, platform::CPUPlace(), + &warpctc_label); } } else { - TensorCopySync(*label, platform::CPUPlace(), &warpctc_label); + paddle::framework::TensorCopySync(*label, platform::CPUPlace(), + &warpctc_label); } const int* warpctc_label_data = warpctc_label.data(); @@ -381,7 +385,8 @@ class WarpCTCKernel : public framework::OpKernel { sequence_width, num_sequences, blank, warpctc_loss_data); // Copy the loss back - TensorCopy(warpctc_loss, ctx.GetPlace(), 
diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu index 086ff05b084612..d40d14435a5fd0 100644 --- a/paddle/fluid/operators/viterbi_decode_op.cu +++ b/paddle/fluid/operators/viterbi_decode_op.cu @@ -66,8 +66,9 @@ struct BinaryOperation { const Tensor& rhs, Tensor* output) { std::vector<const Tensor*> ins{&lhs, &rhs}; std::vector<Tensor*> outs{output}; - LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>( - dev_ctx, ins, &outs, -1, BinaryFunctor<T>()); + paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary, + T, T>(dev_ctx, ins, &outs, -1, + BinaryFunctor<T>()); } }; @@ -78,8 +79,9 @@ struct GetMask { std::vector<const Tensor*> ins = {&lhs, &rhs}; std::vector<Tensor*> outs = {mask}; auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); - LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kBinary, int64_t, T>( - dev_ctx, ins, &outs, CompareFunctor<int64_t, T>()); + paddle::operators::LaunchSameDimsElementwiseCudaKernel< + ElementwiseType::kBinary, int64_t, T>(dev_ctx, ins, &outs, + CompareFunctor<int64_t, T>()); } };
diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index 4cce33c3f520f0..56f1d8d97ba618 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -299,7 +299,8 @@ class WarpCTCKernel : public framework::OpKernel<T> { ctx.AllocateTmpTensor<T, DeviceContext>(warpctc_logits_dims, dev_ctx); warpctc_logits.ShareDataWith(warpctc_logits_tmp); if (ctx.HasInput("LogitsLength")) { - TensorCopySync(*logits, ctx.GetPlace(), &warpctc_logits); + paddle::framework::TensorCopySync(*logits, ctx.GetPlace(), + &warpctc_logits); } else { LoDTensor cpu_pad_value; T* pad_value_data = @@ -309,7 +310,8 @@ class WarpCTCKernel : public framework::OpKernel<T> { if (platform::is_cpu_place(ctx.GetPlace())) { pad_value = cpu_pad_value; } else { - TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value); + paddle::framework::TensorCopySync(cpu_pad_value, ctx.GetPlace(), + &pad_value); } math::PaddingLoDTensorFunctor<DeviceContext, T>()( @@ -361,10 +363,12 @@ class WarpCTCKernel : public framework::OpKernel<T> { ctx.template device_context<DeviceContext>(), *label, &gpu_label, label->dims()[1] /*pad_seq_len*/, 0 /*lod_level*/, false /*norm_by_times*/, math::kBatchLengthWidth); - TensorCopySync(gpu_label, platform::CPUPlace(), &warpctc_label); + paddle::framework::TensorCopySync(gpu_label, platform::CPUPlace(), + &warpctc_label); } } else { - TensorCopySync(*label, platform::CPUPlace(), &warpctc_label); + paddle::framework::TensorCopySync(*label, platform::CPUPlace(), + &warpctc_label); } const int* warpctc_label_data = warpctc_label.data<int>(); @@ -381,7 +385,8 @@ class WarpCTCKernel : public framework::OpKernel<T> { sequence_width, num_sequences, blank, warpctc_loss_data); // Copy the loss back - TensorCopy(warpctc_loss, ctx.GetPlace(), ctx.device_context(), loss); + paddle::framework::TensorCopy(warpctc_loss, ctx.GetPlace(), + ctx.device_context(), loss); } };
diff --git a/paddle/fluid/operators/where_index_op_npu.cc b/paddle/fluid/operators/where_index_op_npu.cc index 9a11f300bcb096..226f1461ed4390 100644 --- a/paddle/fluid/operators/where_index_op_npu.cc +++ b/paddle/fluid/operators/where_index_op_npu.cc @@ -70,7 +70,8 @@ class NPUWhereIndexKernel : public framework::OpKernel<T> { sum_runner.Run(stream); Tensor local_true_num; - TensorCopySync(sumed_true_num, platform::CPUPlace(), &local_true_num); + paddle::framework::TensorCopySync(sumed_true_num, platform::CPUPlace(), + &local_true_num); auto true_num = *local_true_num.data<int64_t>(); out->Resize(framework::make_ddim({true_num, rank}));
diff --git a/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h b/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h index 7bff2c69381e69..a75759e2ae0796 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h @@ -26,11 +26,9 @@ #include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h" #include "paddle/fluid/platform/device_context.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten namespace paddle { namespace platform {
diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h b/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h index d2389ba409e5eb..f4533c859fcd35 100644 --- a/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h @@ -26,11 +26,9 @@ #include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h" #include "paddle/fluid/platform/device_context.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -} // namespace paddle +namespace pten { +class DenseTensor; +} // namespace pten namespace paddle { namespace platform {
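The cudnn_desc.h and miopen_desc.h hunks swap a forward declaration of paddle::framework::Tensor for one of pten::DenseTensor. That is forced by a language rule rather than style: a class can be forward-declared, but a type alias cannot, so once Tensor is merely an alias the headers must name the class that actually exists. A sketch of the rule (Inspect is an invented function):

// fwd_decl_sketch.cc
namespace pten {
class DenseTensor;  // fine: the real class can be forward-declared
}
// There is no way to forward-declare "using Tensor = pten::DenseTensor;",
// so a header that used to write "namespace framework { class Tensor; }"
// has to declare pten::DenseTensor instead.
void Inspect(const pten::DenseTensor*) {}  // a pointer to an incomplete
                                           // type is all a header needs
int main() { Inspect(nullptr); }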
diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.cc b/paddle/fluid/platform/device/npu/npu_op_runner.cc index 78e5cb0ab106e4..b9c92e07612b08 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.cc +++ b/paddle/fluid/platform/device/npu/npu_op_runner.cc @@ -231,7 +231,7 @@ NpuOpRunner &NpuOpRunner::AddInput(std::vector<int32_t> &&dims) { auto *dev_ctx = static_cast<platform::CPUDeviceContext *>(pool.Get(platform::CPUPlace())); Tensor host_tensor; - TensorFromVector(dims, *dev_ctx, &host_tensor); + paddle::framework::TensorFromVector(dims, *dev_ctx, &host_tensor); host_tensors_.emplace_back(host_tensor); // create aclTensorDesc @@ -247,7 +247,7 @@ NpuOpRunner &NpuOpRunner::AddInput(std::vector<int64_t> &&dims) { auto *dev_ctx = static_cast<platform::CPUDeviceContext *>(pool.Get(platform::CPUPlace())); Tensor host_tensor; - TensorFromVector(dims, *dev_ctx, &host_tensor); + paddle::framework::TensorFromVector(dims, *dev_ctx, &host_tensor); host_tensors_.emplace_back(host_tensor); // create aclTensorDesc @@ -263,7 +263,7 @@ NpuOpRunner &NpuOpRunner::AddInput(std::vector<float> &&values) { auto *dev_ctx = static_cast<platform::CPUDeviceContext *>(pool.Get(platform::CPUPlace())); Tensor host_tensor; - TensorFromVector(values, *dev_ctx, &host_tensor); + paddle::framework::TensorFromVector(values, *dev_ctx, &host_tensor); host_tensors_.emplace_back(host_tensor); // create aclTensorDesc @@ -279,7 +279,7 @@ NpuOpRunner &NpuOpRunner::AddInput(std::vector<double> &&values) { auto *dev_ctx = static_cast<platform::CPUDeviceContext *>(pool.Get(platform::CPUPlace())); Tensor host_tensor; - TensorFromVector(values, *dev_ctx, &host_tensor); + paddle::framework::TensorFromVector(values, *dev_ctx, &host_tensor); host_tensors_.emplace_back(host_tensor); // create aclTensorDesc
diff --git a/paddle/fluid/platform/device_code_test.cc b/paddle/fluid/platform/device_code_test.cc index aadfffb59133bf..2307f843838aff 100644 --- a/paddle/fluid/platform/device_code_test.cc +++ b/paddle/fluid/platform/device_code_test.cc @@ -80,8 +80,8 @@ TEST(DeviceCode, cuda) { float* y_data = y.mutable_data<float>(dims, place); float* z_data = z.mutable_data<float>(dims, place); - TensorCopySync(cpu_x, place, &x); - TensorCopySync(cpu_y, place, &y); + paddle::framework::TensorCopySync(cpu_x, place, &x); + paddle::framework::TensorCopySync(cpu_y, place, &y); EXPECT_EQ(code.Compile(), true); @@ -93,7 +93,7 @@ TEST(DeviceCode, cuda) { auto* dev_ctx = paddle::platform::DeviceContextPool::Instance().Get(place); dev_ctx->Wait(); - TensorCopySync(z, paddle::platform::CPUPlace(), &cpu_z); + paddle::framework::TensorCopySync(z, paddle::platform::CPUPlace(), &cpu_z); for (size_t i = 0; i < n; i++) { EXPECT_EQ(cpu_z.data<float>()[i], static_cast<float>(i) * scale + 0.5); }
diff --git a/paddle/fluid/platform/lodtensor_printer.cc b/paddle/fluid/platform/lodtensor_printer.cc index 4a5dfbee15de28..ca5d156802851f 100644 --- a/paddle/fluid/platform/lodtensor_printer.cc +++ b/paddle/fluid/platform/lodtensor_printer.cc @@ -16,9 +16,12 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace paddle { namespace framework { -class Tensor; class Variable; } // namespace framework } // namespace paddle @@ -47,29 +50,29 @@ void PrintVar(framework::Scope* scope, const std::string& var_name, *sstream << print_info; -#define PrintTensorCallback(cpp_type, proto_type) \ - do { \ - if (tensor->type() == proto_type) { \ - *sstream << "["; \ - const cpp_type* data = nullptr; \ - framework::LoDTensor cpu_tensor; \ - if (is_cpu_place(tensor->place())) { \ - data = tensor->data<cpp_type>(); \ - } else { \ - platform::CPUPlace cpu_place; \ - TensorCopy(*tensor, cpu_place, &cpu_tensor); \ - data = cpu_tensor.data<cpp_type>(); \ - } \ - auto element_num = tensor->numel(); \ - *sstream << element_num << "]:["; \ - if (element_num > 0) { \ - *sstream << data[0]; \ - for (int j = 1; j < element_num; ++j) { \ - *sstream << " " << data[j]; \ - } \ - } \ - *sstream << "]"; \ - } \ +#define PrintTensorCallback(cpp_type, proto_type) \ + do { \ + if (tensor->type() == proto_type) { \ + *sstream << "["; \ + const cpp_type* data = nullptr; \ + framework::LoDTensor cpu_tensor; \ + if (is_cpu_place(tensor->place())) { \ + data = tensor->data<cpp_type>(); \ + } else { \ + platform::CPUPlace cpu_place; \ + paddle::framework::TensorCopy(*tensor, cpu_place, &cpu_tensor); \ + data = cpu_tensor.data<cpp_type>(); \ + } \ + auto element_num = tensor->numel(); \ + *sstream << element_num << "]:["; \ + if (element_num > 0) { \ + *sstream << data[0]; \ + for (int j = 1; j < element_num; ++j) { \ + *sstream << " " << data[j]; \ + } \ + } \ + *sstream << "]"; \ + } \ } while (0) _ForEachDataType_(PrintTensorCallback);
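PrintTensorCallback above is an X-macro: _ForEachDataType_ expands the callback once per (cpp_type, proto_type) pair, turning a runtime dtype tag into a statically typed data<cpp_type>() access without a hand-written switch. A compilable miniature of the same dispatch, with the type list and names invented for the example:

// xmacro_sketch.cc
#include <cstdint>
#include <iostream>

enum class ProtoType { FP32, INT64 };

#define FOR_EACH_DATA_TYPE(callback)  \
  callback(float, ProtoType::FP32);   \
  callback(int64_t, ProtoType::INT64)

void PrintAs(ProtoType runtime_type, const void* data, int n) {
#define PRINT_CALLBACK(cpp_type, proto_type)                  \
  do {                                                        \
    if (runtime_type == proto_type) {                         \
      const cpp_type* p = static_cast<const cpp_type*>(data); \
      for (int i = 0; i < n; ++i) std::cout << p[i] << " ";   \
      std::cout << "\n";                                      \
    }                                                         \
  } while (0)
  FOR_EACH_DATA_TYPE(PRINT_CALLBACK);
#undef PRINT_CALLBACK
}

int main() {
  float f[3] = {1.5f, 2.5f, 3.5f};
  PrintAs(ProtoType::FP32, f, 3);  // prints: 1.5 2.5 3.5
}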
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 3650b44ed0a85b..cc3b066de47256 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1253,7 +1253,8 @@ void BindImperative(py::module *m_ptr) { ->GetMutable<framework::LoDTensor>(); auto *dev_ctx = platform::DeviceContextPool::Instance().Get( tracer->ExpectedPlace()); - TensorFromVector(list_select_idxs, *dev_ctx, idx_tensor); + paddle::framework::TensorFromVector(list_select_idxs, *dev_ctx, + idx_tensor); imperative::NameVarBaseMap ins = {{"X", {self}}, {"Index", {select_index}}};
diff --git a/paddle/pten/api/include/tensor.h b/paddle/pten/api/include/tensor.h index a6e2c4d1037696..c26c9ce8394581 100644 --- a/paddle/pten/api/include/tensor.h +++ b/paddle/pten/api/include/tensor.h @@ -36,6 +36,10 @@ using gpuStream_t = hipStream_t; #include "paddle/pten/common/layout.h" #include "paddle/pten/common/place.h" +namespace pten { +class DenseTensor; +} // namespace pten + namespace pten { class TensorBase; } // namespace pten @@ -47,7 +51,6 @@ class DDim; namespace experimental { -class Tensor; class CompatiblePTenTensorUtils; class AbstractAutogradMeta {
diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index f304268bedf45d..93b1957fe14428 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -260,7 +260,8 @@ std::unique_ptr<pten::TensorBase> MakePtenTensorBaseFromVar( const auto& tensor = variable.Get<framework::SelectedRows>(); if (!platform::is_same_place(tensor.value().place(), expected_place)) { framework::Tensor tmp_tensor; - TensorCopySync(tensor.value(), expected_place, &tmp_tensor); + paddle::framework::TensorCopySync( + tensor.value(), expected_place, &tmp_tensor); // TODO(chenweihang): adapt SelectedRows by xiaowei's design return MakePtenDenseTensor(tmp_tensor); } else { @@ -303,7 +304,7 @@ void MovesStorageBase(pten::DenseTensor* src, paddle::framework::Tensor* dst) { dst, platform::errors::InvalidArgument( "The destination Tensor is nullptr when move storage.")); - dst->Resize(src->dims()); + dst->ResizeAndAllocate(src->dims()); dst->set_type(pten::TransToProtoVarType(src->dtype())); auto storage = src->MoveMemoryHolder(); dst->ResetHolderWithType(storage, pten::TransToProtoVarType(src->dtype())); @@ -324,7 +325,7 @@ void SharesStorageBase(pten::DenseTensor* src, paddle::framework::Tensor* dst) { dst, platform::errors::InvalidArgument( "The destination Tensor is nullptr when move allocation.")); - dst->Resize(src->dims()); + dst->ResizeAndAllocate(src->dims()); dst->ResetHolderWithType(src->Holder(), pten::TransToProtoVarType(src->dtype())); dst->set_offset(src->meta().offset); @@ -412,7 +413,8 @@ void ReMakePtenDenseTensorFromVar(const framework::Variable& variable, "argument's definition in kernel.")); if (!platform::is_same_place(tensor.value().place(), expected_place)) { framework::Tensor tmp_tensor; - TensorCopySync(tensor.value(), expected_place, &tmp_tensor); + paddle::framework::TensorCopySync( + tensor.value(), expected_place, &tmp_tensor); // TODO(chenweihang): adapt SelectedRows by xiaowei's design ReMakePtenDenseTensorByArgDef(tmp_tensor, arg_def, dst); } else { @@ -457,7 +459,7 @@ void MakeVariableFromPtenTensor(pten::DenseTensor* src, auto* tensor = variable->GetMutable<framework::LoDTensor>(); auto dtype = pten::TransToProtoVarType(src->dtype()); - tensor->Resize(src->dims()); + tensor->ResizeAndAllocate(src->dims()); SetLoD(tensor->mutable_lod(), src->lod()); if (!tensor->IsInitialized() ||
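The pten changes that follow all revolve around splitting the old Resize in two: ResizeAndAllocate keeps the previous eager behaviour (update dims and, when storage is already attached, allocate immediately), while the new Resize updates metadata only and returns *this so calls can chain, matching fluid's Tensor::Resize. A toy model of the two flavours, using an invented class rather than the real pten::DenseTensor:

// resize_sketch.cc
#include <cstddef>
#include <utility>
#include <vector>

class Buf {
 public:
  // Metadata-only resize; chainable, allocation deferred to mutable_data().
  Buf& Resize(std::vector<int> dims) { dims_ = std::move(dims); return *this; }

  // Resize that also materialises storage, like ResizeAndAllocate.
  void ResizeAndAllocate(std::vector<int> dims) {
    Resize(std::move(dims));
    mutable_data();
  }

  float* mutable_data() {
    std::size_t n = 1;
    for (int d : dims_) n *= static_cast<std::size_t>(d);
    if (data_.size() != n) data_.resize(n);  // (re)allocate lazily
    return data_.data();
  }

 private:
  std::vector<int> dims_;
  std::vector<float> data_;
};

int main() {
  Buf b;
  b.Resize({2, 3}).Resize({6});  // chainable, nothing allocated yet
  b.ResizeAndAllocate({4, 4});   // metadata and storage in one step
}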
diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index fe088a95681468..06531fe8bfd3b8 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -22,6 +22,14 @@ limitations under the License. */ #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/convert_utils.h" +namespace paddle { +namespace framework { +extern void TensorCopy(const pten::DenseTensor& src, + const paddle::platform::Place& dst_place, + pten::DenseTensor* dst); +} +} + namespace pten { DenseTensor::DenseTensor(Allocator* a, const DenseTensorMeta& meta) @@ -198,7 +206,7 @@ void DenseTensor::set_meta(DenseTensorMeta&& meta) { storage_ won't be initialized until the first call to mutable_data(place) */ -void DenseTensor::Resize(const DDim& dims) { +void DenseTensor::ResizeAndAllocate(const DDim& dims) { meta_.dims = dims; if (storage_ != nullptr) { mutable_data(); @@ -519,4 +527,119 @@ size_t DenseTensor::NumElements(size_t level) const { return (meta_.lod)[level].size() - 1; } +DenseTensor& DenseTensor::Resize(const DDim& dims) { + meta_.dims = dims; + return *this; +} + +DenseTensor DenseTensor::Slice(int64_t begin_idx, int64_t end_idx) const { + check_memory_size(); + PADDLE_ENFORCE_GE(begin_idx, + 0, + paddle::platform::errors::OutOfRange( + "The start row index must be greater than or equal to 0. " + "But received the start index is %d.", + begin_idx)); + PADDLE_ENFORCE_LE(end_idx, + meta_.dims[0], + paddle::platform::errors::OutOfRange( + "The end row index is out of bound.")); + PADDLE_ENFORCE_LT( + begin_idx, + end_idx, + paddle::platform::errors::InvalidArgument( + "The start row index must be less than the end row index. " + "But received the start index = %d, the end index = %d.", + begin_idx, + end_idx)); + + if (meta_.dims[0] == 1) { + return *this; + } else { + size_t base = numel() / meta_.dims[0]; + DenseTensor dst; + dst.storage_ = pten::make_intrusive<paddle::experimental::SharedStorage>( + storage_->data_shared()); + dst.meta_.layout = meta_.layout; + dst.meta_.dtype = meta_.dtype; + DDim dst_dims = meta_.dims; + dst_dims[0] = end_idx - begin_idx; + dst.Resize(dst_dims); + dst.meta_.offset = meta_.offset + begin_idx * base * SizeOf(dtype()); + return dst; + } +}
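Slice returns a view, not a copy: the result shares the intrusive storage and differs only in dims[0] and an offset derived from begin_idx (a byte offset via SizeOf(dtype()) in the real code). The scheme in miniature, with element offsets and an invented View type:

// slice_view_sketch.cc
#include <cassert>
#include <cstddef>
#include <memory>
#include <vector>

struct View {
  std::shared_ptr<std::vector<float>> storage;  // shared, refcounted buffer
  int rows, cols;
  std::size_t offset;  // element offset into the buffer

  float* data() { return storage->data() + offset; }

  View Slice(int begin, int end) const {
    assert(0 <= begin && begin < end && end <= rows);
    View v = *this;                                         // same storage
    v.rows = end - begin;                                   // new dims[0]
    v.offset = offset + static_cast<std::size_t>(begin) * cols;
    return v;                                               // no data copied
  }
};

int main() {
  View v{std::make_shared<std::vector<float>>(12, 0.f), 4, 3, 0};
  View s = v.Slice(1, 3);
  s.data()[0] = 42.f;               // writes into the shared buffer
  assert((*v.storage)[3] == 42.f);  // visible through the original view
}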
+std::vector<DenseTensor> DenseTensor::Split(int64_t split_size, + int64_t axis) const { + check_memory_size(); + + PADDLE_ENFORCE_GE(meta_.dims.size(), + 0, + paddle::platform::errors::OutOfRange( + "split expects at least a 1-dimensional tensor")); + + PADDLE_ENFORCE_GE( + split_size, + 0, + paddle::platform::errors::OutOfRange( + "split expects split_size be non-negative, but got split_size is %d", + split_size)); + + int64_t numel_size = meta_.dims[axis]; + + int64_t num_splits = 1; + if (split_size != 0) { + num_splits = + std::max<int64_t>((numel_size + split_size - 1) / split_size, 1); + } + + std::vector<DenseTensor> splits(num_splits); + int64_t last_split_size = split_size - (split_size * num_splits - numel_size); + + for (int64_t i = 0; i < num_splits; ++i) { + int64_t length = i < num_splits - 1 ? split_size : last_split_size; + splits[i] = Slice(i * split_size, i * split_size + length); + } + return splits; +} + +std::vector<DenseTensor> DenseTensor::Chunk(int64_t chunks, + int64_t axis) const { + check_memory_size(); + PADDLE_ENFORCE_GE(meta_.dims.size(), + 0, + paddle::platform::errors::OutOfRange( + "split expects at least a 1-dimensional tensor")); + PADDLE_ENFORCE_GE( + chunks, + 0, + paddle::platform::errors::OutOfRange( + "chunks expects to be greater than 0, but got chunks is %d", chunks)); + + int64_t numel_size = meta_.dims[axis]; + int64_t split_size = (numel_size + chunks - 1) / chunks; + return Split(split_size, axis); +} + +DenseTensor& DenseTensor::ShareDataWith(const DenseTensor& src) { + src.check_memory_size(); + // Preserve LoD + auto lod = meta_.lod; + *this = src; + meta_.lod = lod; + return *this; +} + +DenseTensor& DenseTensor::ShareInplaceVersionCounterWith( + const DenseTensor& src) { + PADDLE_ENFORCE_NOT_NULL( + inplace_version_counter_, + paddle::platform::errors::PreconditionNotMet( + "Tensor does not hold inplace_version_counter_.")); + + inplace_version_counter_ = src.inplace_version_counter_; + return *this; +} + } // namespace pten
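The Split arithmetic rounds the piece count up and shrinks only the last piece: num_splits = ceil(numel / split_size) and last_split_size = split_size - (split_size * num_splits - numel). Chunk then just derives split_size = ceil(numel / chunks) and delegates to Split. A worked check of the formulas:

// split_math_sketch.cc
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  int64_t numel = 10, split_size = 3;
  int64_t num_splits =
      std::max<int64_t>((numel + split_size - 1) / split_size, 1);
  int64_t last = split_size - (split_size * num_splits - numel);
  std::cout << num_splits << " pieces, last of size " << last << "\n";
  // prints: 4 pieces, last of size 1   (piece sizes 3, 3, 3, 1)

  int64_t chunks = 4;  // Chunk() derives a split size from a piece count
  std::cout << "split_size = " << (numel + chunks - 1) / chunks << "\n";  // 3
}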
diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index db8d7a2a39c908..216689c9b64173 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -159,7 +159,9 @@ class DenseTensor : public TensorBase, /// \param dims The new dims of the dense tensor. /// \param lod The new lod of the dense tensor. // void Resize(const DDim& dims); - void Resize(const DDim& dims); + void ResizeAndAllocate(const DDim& dims); + + DenseTensor& Resize(const DDim& dims); /// \brief Change the lod information in the metadata. /// \param lod The new lod of the dense tensor. @@ -309,6 +311,18 @@ class DenseTensor : public TensorBase, return *inplace_version_counter_; } + /*! The internal of two tensors share the same memory block. */ + DenseTensor& ShareDataWith(const DenseTensor& src); + + /*! The internal of two tensors share the same inplace version counter. */ + DenseTensor& ShareInplaceVersionCounterWith(const DenseTensor& src); + + DenseTensor Slice(int64_t begin_idx, int64_t end_idx) const; + + std::vector<DenseTensor> Split(int64_t split_size, int64_t axis) const; + + std::vector<DenseTensor> Chunk(int64_t chunks, int64_t axis) const; + protected: std::shared_ptr<TensorInplaceVersion> inplace_version_counter_;
diff --git a/paddle/pten/kernels/cpu/copy_kernel.cc b/paddle/pten/kernels/cpu/copy_kernel.cc index 28623b539d8475..1889838e253c93 100644 --- a/paddle/pten/kernels/cpu/copy_kernel.cc +++ b/paddle/pten/kernels/cpu/copy_kernel.cc @@ -37,7 +37,7 @@ void Copy(const Context& dev_ctx, VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; - dst->Resize(src.dims()); + dst->ResizeAndAllocate(src.dims()); auto* dst_ptr = dst->mutable_data(); if (src_ptr == dst_ptr && src_place == dst_place) {
diff --git a/paddle/pten/kernels/cpu/reduce.h b/paddle/pten/kernels/cpu/reduce.h index 1e9c1e885f44d6..b38f17aa02a556 100644 --- a/paddle/pten/kernels/cpu/reduce.h +++ b/paddle/pten/kernels/cpu/reduce.h @@ -118,7 +118,7 @@ void GetShuffledInput(const DeviceContext& dev_ctx, std::vector<int> perm_axis(input.dims().size()); GetShuffledDim(input.dims(), &shuffled_dims, dims, &perm_axis); - shuffled_input->Resize(shuffled_dims); + shuffled_input->ResizeAndAllocate(shuffled_dims); shuffled_input->mutable_data<OutT>(); pten::math::TransposeNormal<DeviceContext, OutT> trans; @@ -141,12 +141,12 @@ void HandleLargeDim(const DeviceContext& dev_ctx, // transpose to 2D tensor whose shape is {unreduced, reduced}. const int64_t unreduced = output->numel(); const int64_t reduced = shuffled_input.numel() / unreduced; - shuffled_input.Resize({unreduced, reduced}); + shuffled_input.ResizeAndAllocate({unreduced, reduced}); DDim output_dim = output->dims(); - output->Resize({unreduced}); + output->ResizeAndAllocate({unreduced}); ReduceFunctor<DeviceContext, OutT, 2, 1, Functor>( dev_ctx, shuffled_input, output, {1}, keep_dim); - output->Resize(output_dim); + output->ResizeAndAllocate(output_dim); } ////////////// ReduceKernel
diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc index eb67ed6655f479..c133d7fc791349 100644 --- a/paddle/pten/kernels/empty_kernel.cc +++ b/paddle/pten/kernels/empty_kernel.cc @@ -24,7 +24,7 @@ template <typename T, typename Context> void EmptyKernel(const Context& dev_ctx, const ScalarArray& shape, DenseTensor* out) { - out->Resize(paddle::framework::make_ddim(shape.GetData())); + out->ResizeAndAllocate(paddle::framework::make_ddim(shape.GetData())); } template <typename T, typename Context>
diff --git a/paddle/pten/kernels/flatten_grad_kernel.cc b/paddle/pten/kernels/flatten_grad_kernel.cc index 45f3c6558d9c87..e45ac516e16ed3 100644 --- a/paddle/pten/kernels/flatten_grad_kernel.cc +++ b/paddle/pten/kernels/flatten_grad_kernel.cc @@ -28,7 +28,7 @@ void FlattenGradKernel(const Context& dev_ctx, auto x_dims = paddle::framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); pten::Copy(dev_ctx, out_grad, false, x_grad); - x_grad->Resize(x_dims); + x_grad->ResizeAndAllocate(x_dims); } } // namespace pten
diff --git a/paddle/pten/kernels/flatten_kernel.cc b/paddle/pten/kernels/flatten_kernel.cc index 9201a8df9d166c..b0d05803ac351c 100644 --- a/paddle/pten/kernels/flatten_kernel.cc +++ b/paddle/pten/kernels/flatten_kernel.cc @@ -29,7 +29,7 @@ void FlattenKernel(const Context& dev_ctx, DenseTensor* out) { auto out_dims = out->dims(); pten::Copy(dev_ctx, x, false, out); - out->Resize(out_dims); + out->ResizeAndAllocate(out_dims); } // TODO(yuanrisheng): this kernel is for training and xshape is an Intermediate
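HandleLargeDim in cpu/reduce.h uses a classic trick for high-rank reductions: shuffle the reduced axes to the back, reinterpret the buffer as a 2-D {unreduced, reduced} matrix, reduce along dimension 1, then restore the output dims; that is why the temporaries above are resized twice. The idea in scalar form, as a standalone sketch:

// large_dim_reduce_sketch.cc
#include <iostream>
#include <vector>

int main() {
  // After the transpose, any reduction is a row-sum over a 2-D view.
  std::vector<float> data = {1, 2, 3, 4, 5, 6};  // viewed as 2 x 3
  int unreduced = 2, reduced = 3;
  std::vector<float> out(unreduced, 0.f);
  for (int i = 0; i < unreduced; ++i)
    for (int j = 0; j < reduced; ++j) out[i] += data[i * reduced + j];
  std::cout << out[0] << " " << out[1] << "\n";  // prints: 6 15
}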
diff --git a/paddle/pten/kernels/funcs/common_shape.h b/paddle/pten/kernels/funcs/common_shape.h index 9d16d18d6b6ec3..8693fd2b36c4e7 100644 --- a/paddle/pten/kernels/funcs/common_shape.h +++ b/paddle/pten/kernels/funcs/common_shape.h @@ -26,7 +26,7 @@ inline void SetXShape(const DenseTensor &x, DenseTensor *xshape) { for (int i = 0; i < in_dims.size(); ++i) { xshape_dims[i + 1] = in_dims[i]; } - xshape->Resize(paddle::framework::make_ddim(xshape_dims)); + xshape->ResizeAndAllocate(paddle::framework::make_ddim(xshape_dims)); xshape->ResetLoD(x.meta().lod); }
diff --git a/paddle/pten/kernels/gpu/cast_kernel.cu b/paddle/pten/kernels/gpu/cast_kernel.cu index 687519debcb704..aa61155221bf15 100644 --- a/paddle/pten/kernels/gpu/cast_kernel.cu +++ b/paddle/pten/kernels/gpu/cast_kernel.cu @@ -44,9 +44,9 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx, inputs.emplace_back(&x); outputs.emplace_back(out); out->mutable_data<OutT>(); - funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, InT, OutT>( + pten::funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, + InT, OutT>( dev_ctx, inputs, &outputs, CastFuctor<InT, OutT>()); }
diff --git a/paddle/pten/kernels/gpu/copy_kernel.cu b/paddle/pten/kernels/gpu/copy_kernel.cu index 7eeef85f0f3e61..10b2aa415d45bf 100644 --- a/paddle/pten/kernels/gpu/copy_kernel.cu +++ b/paddle/pten/kernels/gpu/copy_kernel.cu @@ -42,7 +42,7 @@ void Copy(const Context& dev_ctx, VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; - dst->Resize(src.dims()); + dst->ResizeAndAllocate(src.dims()); auto* dst_ptr = dst->mutable_data(); if (src_ptr == dst_ptr && src_place == dst_place) {
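Both Copy kernels guard the degenerate case first: when source and destination already alias the same allocation in the same place, the kernel returns early instead of issuing a copy over identical storage. The guard in miniature:

// self_copy_guard_sketch.cc
#include <cstddef>
#include <cstring>
#include <iostream>

void CopyBuf(const float* src, float* dst, std::size_t n) {
  if (src == dst) {  // same pointer (and, in Paddle, same place): skip
    std::cout << "skip self copy\n";
    return;
  }
  std::memcpy(dst, src, n * sizeof(float));
}

int main() {
  float a[4] = {1, 2, 3, 4}, b[4];
  CopyBuf(a, b, 4);  // real copy
  CopyBuf(a, a, 4);  // guarded, no memcpy issued
}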
diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index 291550a3d6e702..c3ff91e7b15cd6 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -574,7 +574,7 @@ void LaunchElementwiseCudaKernel(const KPDevice &ctx, ? *std::max_element(dims_size.begin(), dims_size.end()) - *std::min_element(dims_size.begin(), dims_size.end()) : axis; - LaunchBroadcastElementwiseCudaKernel<ET, InT, OutT>( + pten::LaunchBroadcastElementwiseCudaKernel<ET, InT, OutT>( ctx, ins, outs, axis, func); } }
diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h index c4e3a0b354d68e..e7d1d2d5f44fc1 100644 --- a/paddle/pten/kernels/gpu/reduce.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -326,7 +326,7 @@ struct ReduceConfig { const paddle::platform::Place& place, pten::DenseTensor* tmp) { if (should_reduce_again) { - tmp->Resize(paddle::framework::make_ddim( + tmp->ResizeAndAllocate(paddle::framework::make_ddim( {static_cast<int64_t>(left_num * grid.z * grid.y * sizeof(Ty))})); output_data = tmp->mutable_data<Ty>(); } else {
diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index b6897efcdd25bd..b49902ff5e300a 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -55,7 +55,9 @@ void ScaleKernel(const Context& dev_ctx, inputs.emplace_back(&x); outputs.emplace_back(out); out->mutable_data<T>(); - funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, T, T>( + pten::funcs::LaunchSameDimsElementwiseCudaKernel<ElementwiseType::kUnary, + T, + T>( dev_ctx, inputs, &outputs,
diff --git a/paddle/pten/kernels/impl/full_kernel_impl.h b/paddle/pten/kernels/impl/full_kernel_impl.h index 79ca63c9b0669b..134a815799de60 100644 --- a/paddle/pten/kernels/impl/full_kernel_impl.h +++ b/paddle/pten/kernels/impl/full_kernel_impl.h @@ -36,7 +36,7 @@ void FullKernel(const Context& dev_ctx, const ScalarArray& shape, const Scalar& val, DenseTensor* out) { - out->Resize(paddle::framework::make_ddim(shape.GetData())); + out->ResizeAndAllocate(paddle::framework::make_ddim(shape.GetData())); FullValue<T>(dev_ctx, out, val.to<T>()); }
diff --git a/paddle/pten/kernels/impl/matmul_kernel_impl.h b/paddle/pten/kernels/impl/matmul_kernel_impl.h index f5f69f327a69f2..5ea9729655ecc8 100644 --- a/paddle/pten/kernels/impl/matmul_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_kernel_impl.h @@ -164,7 +164,7 @@ void MatMulFunction(const Context& dev_ctx, std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); out_dims.back() = y_dims.back(); } - Out->Resize(paddle::framework::make_ddim(out_dims)); + Out->ResizeAndAllocate(paddle::framework::make_ddim(out_dims)); Out->mutable_data<T>(); if (trans_y) { const int M = Y.numel() / N; @@ -242,7 +242,7 @@ void MatMulFunction(const Context& dev_ctx, } else { std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); } - Out->Resize(paddle::framework::make_ddim(out_dims)); + Out->ResizeAndAllocate(paddle::framework::make_ddim(out_dims)); Out->mutable_data<T>(); if (trans_x) { @@ -330,7 +330,7 @@ void MatMulFunction(const Context& dev_ctx, out_broadcast_dims[ndim - 2] = M; out_broadcast_dims[ndim - 1] = N; - Out->Resize(paddle::framework::make_ddim(out_broadcast_dims)); + Out->ResizeAndAllocate(paddle::framework::make_ddim(out_broadcast_dims)); Out->mutable_data<T>(); const int batch_dim = ndim - 2;
diff --git a/paddle/pten/kernels/reshape_kernel.cc b/paddle/pten/kernels/reshape_kernel.cc index d7e2e2707ee1b9..7f58bbbd3732d0 100644 --- a/paddle/pten/kernels/reshape_kernel.cc +++ b/paddle/pten/kernels/reshape_kernel.cc @@ -28,11 +28,11 @@ void ReshapeKernel(const Context& dev_ctx, DenseTensor* out) { auto out_meta = InferMetaFromVecValue(x.meta(), shape.GetData()); if (x.data() == out->data() && x.numel() == out->numel()) { - out->Resize(out_meta.dims); + out->ResizeAndAllocate(out_meta.dims); return; } pten::Copy(dev_ctx, 
x, false, out); - out->Resize(out_meta.dims); + out->ResizeAndAllocate(out_meta.dims); out->ResetLoD(x.lod()); } diff --git a/paddle/pten/kernels/xpu/copy_kernel.cc b/paddle/pten/kernels/xpu/copy_kernel.cc index f464a4926d3b59..3287fa1f7a8572 100644 --- a/paddle/pten/kernels/xpu/copy_kernel.cc +++ b/paddle/pten/kernels/xpu/copy_kernel.cc @@ -43,7 +43,7 @@ void Copy(const Context& dev_ctx, VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; - dst->Resize(src.dims()); + dst->ResizeAndAllocate(src.dims()); CHECK(dst->layout() == src.layout()); auto size = src.numel() * paddle::framework::SizeOfType(TransToProtoVarType(src.dtype())); diff --git a/paddle/pten/tests/core/test_dense_tensor.cc b/paddle/pten/tests/core/test_dense_tensor.cc index 8564969796c7ec..56722d35f325ec 100644 --- a/paddle/pten/tests/core/test_dense_tensor.cc +++ b/paddle/pten/tests/core/test_dense_tensor.cc @@ -110,7 +110,7 @@ TEST(dense_tensor, resize) { DenseTensor tensor_0(alloc, meta); CHECK_EQ(tensor_0.capacity(), 2u); - tensor_0.Resize({1, 2, 3}); + tensor_0.ResizeAndAllocate({1, 2, 3}); CHECK_EQ(tensor_0.capacity(), 6u); tensor_0.mutable_data(); CHECK_EQ(tensor_0.capacity(), 6u);