[NPU] Support npu save load (#31893)

pangyoki · web-flow · commit 7039c06d78ce · 2021-04-09T17:12:49.000+08:00
* support save load for NPU

* add save load npu unittest

* support np.array transform in NPU

* fix errors

* delete dygraph in unittest

* add Wait

* fix unittest

* fix review comment

* fix unittest problem

* fix little problem
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
@@ -822,6 +822,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
 #else
       PADDLE_THROW(platform::errors::Unimplemented(
           "XPUPlace is not supported when not compiled with XPU"));
+#endif
+    } else if (platform::is_npu_place(tensor.place())) {
+#ifdef PADDLE_WITH_ASCEND_CL
+      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
+      std::unique_ptr<char[]> buf(new char[kBufSize]);
+      auto& npu_dev_ctx =
+          static_cast<const platform::NPUDeviceContext&>(dev_ctx);
+      platform::CPUPlace cpu;
+      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
+      while (size != 0) {
+        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
+        memory::Copy(cpu, buf.get(),
+                     BOOST_GET_CONST(platform::NPUPlace, tensor.place()),
+                     reinterpret_cast<const void*>(data), size_to_write,
+                     npu_dev_ctx.stream());
+        npu_dev_ctx.Wait();
+        os.write(buf.get(), size_to_write);
+        data += size_to_write;
+        size -= size_to_write;
+      }
+#else
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "NPUPlace is not supported when not compiled with NPU"));
 #endif
     } else {
       os.write(static_cast<const char*>(data_ptr),
@@ -877,8 +900,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     auto ctx = platform::CPUDeviceContext();
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
-        platform::is_xpu_place(dev_ctx.GetPlace())) {
-#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
+        platform::is_xpu_place(dev_ctx.GetPlace()) ||
+        platform::is_npu_place(dev_ctx.GetPlace())) {
+#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU || \
+    defined PADDLE_WITH_ASCEND_CL
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(shape));
       framework::VisitDataType(
@@ -887,13 +912,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
       is.read(static_cast<char*>(buf), size);
       auto dst_place = dev_ctx.GetPlace();
       framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
+      if (platform::is_npu_place(dev_ctx.GetPlace())) {
+        dev_ctx.Wait();
+      }
 #else
       if (platform::is_gpu_place(dev_ctx.GetPlace())) {
         PADDLE_THROW(platform::errors::Unimplemented(
             "CUDAPlace is not supported when not compiled with CUDA"));
-      } else {
+      } else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
         PADDLE_THROW(platform::errors::Unimplemented(
             "XPUPlace is not supported when not compiled with XPU"));
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "NPUPlace is not supported when not compiled with NPU"));
       }
 #endif
     } else {
@@ -934,8 +965,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     auto ctx = platform::CPUDeviceContext();
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
-        platform::is_xpu_place(dev_ctx.GetPlace())) {
-#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
+        platform::is_xpu_place(dev_ctx.GetPlace()) ||
+        platform::is_npu_place(dev_ctx.GetPlace())) {
+#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU || \
+    defined PADDLE_WITH_ASCEND_CL
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(dims));
       framework::VisitDataType(
@@ -944,13 +977,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
       is.read(static_cast<char*>(buf), size);
       auto dst_place = dev_ctx.GetPlace();
       framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
+      if (platform::is_npu_place(dev_ctx.GetPlace())) {
+        dev_ctx.Wait();
+      }
 #else
       if (platform::is_gpu_place(dev_ctx.GetPlace())) {
         PADDLE_THROW(platform::errors::Unimplemented(
             "CUDAPlace is not supported when not compiled with CUDA"));
-      } else {
+      } else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
         PADDLE_THROW(platform::errors::Unimplemented(
             "XPUPlace is not supported when not compiled with XPU"));
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "NPUPlace is not supported when not compiled with NPU"));
       }
 #endif
     } else {
diff --git a/paddle/fluid/operators/load_combine_op_npu.cc b/paddle/fluid/operators/load_combine_op_npu.cc
@@ -0,0 +1,25 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/load_combine_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_NPU_KERNEL(
+    load_combine,
+    ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, double>,
+    ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, int>,
+    ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, int8_t>,
+    ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/load_op_npu.cc b/paddle/fluid/operators/load_op_npu.cc
@@ -0,0 +1,24 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/load_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_NPU_KERNEL(
+    load, ops::LoadOpKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::LoadOpKernel<paddle::platform::NPUDeviceContext, double>,
+    ops::LoadOpKernel<paddle::platform::NPUDeviceContext, int>,
+    ops::LoadOpKernel<paddle::platform::NPUDeviceContext, int8_t>,
+    ops::LoadOpKernel<paddle::platform::NPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/save_combine_op_npu.cc b/paddle/fluid/operators/save_combine_op_npu.cc
@@ -0,0 +1,24 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/save_combine_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_NPU_KERNEL(
+    save_combine,
+    ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, double>,
+    ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, int>,
+    ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/save_op_npu.cc b/paddle/fluid/operators/save_op_npu.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/save_op.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_NPU_KERNEL(
+    save, ops::SaveOpKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::SaveOpKernel<paddle::platform::NPUDeviceContext, double>,
+    ops::SaveOpKernel<paddle::platform::NPUDeviceContext, int>,
+    ops::SaveOpKernel<paddle::platform::NPUDeviceContext, uint8_t>,
+    ops::SaveOpKernel<paddle::platform::NPUDeviceContext, int8_t>,
+    ops::SaveOpKernel<paddle::platform::NPUDeviceContext, int64_t>,
+    ops::SaveOpKernel<paddle::platform::NPUDeviceContext,
+                      paddle::platform::float16>);
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
@@ -644,6 +644,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
   }
   bool is_gpu_tensor = platform::is_gpu_place(tensor.place());
   bool is_xpu_tensor = platform::is_xpu_place(tensor.place());
+  bool is_npu_tensor = platform::is_npu_place(tensor.place());
   const auto &tensor_dims = tensor.dims();
   auto tensor_dtype = tensor.type();
   size_t sizeof_dtype = framework::SizeOfType(tensor_dtype);
@@ -662,7 +663,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
 
   std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(tensor.type());
 
-  if (!is_gpu_tensor && !is_xpu_tensor) {
+  if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor) {
     if (!need_deep_copy) {
       auto base = py::cast(std::move(tensor));
       return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides,
@@ -729,6 +730,34 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
     PADDLE_THROW(platform::errors::PermissionDenied(
         "Cannot use CUDAPlace in CPU only version, "
         "Please recompile or reinstall Paddle with CUDA support."));
+#endif
+  } else if (is_npu_tensor) {
+#ifdef PADDLE_WITH_ASCEND_CL
+    py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
+    PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
+                      platform::errors::InvalidArgument(
+                          "PyArray is not writable, in which case memory leak "
+                          "or double free would occur"));
+    PADDLE_ENFORCE_EQ(
+        py_arr.owndata(), true,
+        platform::errors::InvalidArgument(
+            "PyArray does not own data, in which case  memory leak "
+            "or double free would occur"));
+
+    size_t copy_bytes = sizeof_dtype * numel;
+    auto p = BOOST_GET_CONST(platform::NPUPlace, tensor.place());
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &ctx = *pool.Get(tensor.place());
+    paddle::memory::Copy(
+        platform::CPUPlace(), py_arr.mutable_data(), p, tensor_buf_ptr,
+        copy_bytes,
+        reinterpret_cast<const platform::NPUDeviceContext &>(ctx).stream());
+    ctx.Wait();
+    return py_arr;
+#else
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Cannot use NPUPlace in CPU/GPU/XPU version, "
+        "Please recompile or reinstall Paddle with NPU support."));
 #endif
   }
   PADDLE_THROW(platform::errors::Unimplemented("Place is not supported"));
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
@@ -1973,6 +1973,10 @@ def set_var(var, ndarray):
             p = paddle.fluid.core.Place()
             p.set_place(t._place())
             place = paddle.fluid.XPUPlace(p.xpu_device_id())
+        elif p.is_npu_place():
+            p = paddle.fluid.core.Place()
+            p.set_place(t._place())
+            place = paddle.fluid.NPUPlace(p.npu_device_id())
         else:
             p = paddle.fluid.core.Place()
             p.set_place(t._place())
@@ -2115,8 +2119,8 @@ def _load_vars_with_try_catch(exe,
                     error_str = "Failed to load model/variables `%s`, please make sure " \
                                 "model/variables file is saved with the following APIs: " \
                                 "save_params, save_persistables, save_vars."
-                    filenames = [var.name for var in vars
-                                 ] if filename is None else filename
+                    filenames = [var.name for var in
+                                 vars] if filename is None else filename
                     if raise_error:
                         raise RuntimeError(error_str % filenames)
                     else:
@@ -2256,6 +2260,10 @@ def set_program_state(program, state_dict):
                 p = paddle.fluid.core.Place()
                 p.set_place(ten_place)
                 py_place = paddle.fluid.XPUPlace(p.xpu_device_id())
+            elif ten_place.is_npu_place():
+                p = paddle.fluid.core.Place()
+                p.set_place(ten_place)
+                py_place = paddle.fluid.NPUPlace(p.npu_device_id())
 
             ten.set(new_para_np, py_place)
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_save_load_npu.py b/python/paddle/fluid/tests/unittests/npu/test_save_load_npu.py
@@ -0,0 +1,108 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import sys
+sys.path.append("..")
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.nn import Embedding
+import paddle.fluid.framework as framework
+from paddle.fluid.optimizer import Adam
+from paddle.fluid.dygraph.base import to_variable
+from test_imperative_base import new_program_scope
+from paddle.fluid.executor import global_scope
+import numpy as np
+import six
+import pickle
+import os
+import errno
+from test_static_save_load import *
+
+paddle.enable_static()
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestNPUSaveLoadBase(TestSaveLoadBase):
+    def set_place(self):
+        return fluid.CPUPlace() if not core.is_compiled_with_npu(
+        ) else paddle.NPUPlace(0)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestNPUSaveLoadPartial(TestSaveLoadPartial):
+    def set_place(self):
+        return fluid.CPUPlace() if not core.is_compiled_with_npu(
+        ) else paddle.NPUPlace(0)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestNPUSaveLoadSetStateDict(TestSaveLoadSetStateDict):
+    def set_place(self):
+        return fluid.CPUPlace() if not core.is_compiled_with_npu(
+        ) else paddle.NPUPlace(0)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestNPUProgramStatePartial(TestProgramStatePartial):
+    def set_place(self):
+        return fluid.CPUPlace() if not core.is_compiled_with_npu(
+        ) else paddle.NPUPlace(0)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestNPULoadFromOldInterface(TestLoadFromOldInterface):
+    def set_place(self):
+        return fluid.CPUPlace() if not core.is_compiled_with_npu(
+        ) else paddle.NPUPlace(0)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestNPULoadFromOldInterfaceSingleFile(TestLoadFromOldInterfaceSingleFile):
+    def set_place(self):
+        return fluid.CPUPlace() if not core.is_compiled_with_npu(
+        ) else paddle.NPUPlace(0)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestNPUProgramStateOldSave(TestProgramStateOldSave):
+    def setUp(self):
+        self.test_dygraph = False
+
+    def set_place(self):
+        return fluid.CPUPlace() if not core.is_compiled_with_npu(
+        ) else paddle.NPUPlace(0)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestNPUProgramStateOldSaveSingleModel(TestProgramStateOldSaveSingleModel):
+    def set_place(self):
+        return fluid.CPUPlace() if not core.is_compiled_with_npu(
+        ) else paddle.NPUPlace(0)
+
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load.py b/python/paddle/fluid/tests/unittests/test_static_save_load.py