add xpu slice op (PaddlePaddle#27349)

Thunderbrook · chen-zhiyu · commit 4860feb868d9 · 2020-10-15T13:49:01.000+08:00
* add xpu slice op
test=xpu

* add slice xpu op
test=xpu

* code style
test=kunlun

* style
test=kunlun

* format
test=kunlun
diff --git a/paddle/fluid/operators/slice_xpu_op.cc b/paddle/fluid/operators/slice_xpu_op.cc
@@ -0,0 +1,203 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/operators/slice_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+// XPU forward kernel of the `slice` operator.
+//
+// The slice bounds are read from the `axes`, `starts` and `ends` attributes
+// only. Every (start, end) pair is normalized against the size of the
+// dimension it refers to, expanded to one pair per input dimension and then
+// passed to `xpu::slice_forward` together with the input shape.
+template <typename DeviceContext, typename T>
+class SliceXPUKernel : public framework::OpKernel<T> {
+ public:
+  // Slices `Input` into `Out`.
+  //
+  // Raises InvalidArgument if the attribute lengths disagree, if an axis is
+  // outside [0, rank) or if a normalized range is empty; raises External if
+  // the XPU slice call does not return XPU_SUCCESS.
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("Input");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    const auto& axes = ctx.Attr<std::vector<int>>("axes");
+    const auto& starts = ctx.Attr<std::vector<int>>("starts");
+    const auto& ends = ctx.Attr<std::vector<int>>("ends");
+    const auto in_dims = in->dims();
+    const int rank = in_dims.size();
+
+    // `starts[i]` and `ends[i]` are indexed for every entry of `axes` below,
+    // so the three attributes must have the same length.
+    PADDLE_ENFORCE_EQ(
+        starts.size(), axes.size(),
+        platform::errors::InvalidArgument(
+            "The size of starts (%d) must equal the size of axes (%d).",
+            starts.size(), axes.size()));
+    PADDLE_ENFORCE_EQ(
+        ends.size(), axes.size(),
+        platform::errors::InvalidArgument(
+            "The size of ends (%d) must equal the size of axes (%d).",
+            ends.size(), axes.size()));
+
+    // The slice XPU kernel requires one (start, end) pair per input
+    // dimension. Dimensions that are not listed in `axes` keep their full
+    // range [0, dim).
+    std::vector<int> shape(static_cast<size_t>(rank), 0);
+    std::vector<int> starts_extension(static_cast<size_t>(rank), 0);
+    std::vector<int> ends_extension(static_cast<size_t>(rank), 0);
+    for (int i = 0; i < rank; ++i) {
+      shape[i] = static_cast<int>(in_dims[i]);
+      ends_extension[i] = shape[i];
+    }
+
+    // If a negative value is passed for any of the start or end indices,
+    // it represents number of elements before the end of that dimension.
+    // An end larger than n (the number of elements in this dimension) is
+    // clamped to n.
+    for (size_t i = 0; i < axes.size(); ++i) {
+      const int axis = axes[i];
+      PADDLE_ENFORCE_GE(axis, 0,
+                        platform::errors::InvalidArgument(
+                            "The axis (%d) must not be negative.", axis));
+      PADDLE_ENFORCE_LT(
+          axis, rank,
+          platform::errors::InvalidArgument(
+              "The axis (%d) must be less than the input rank (%d).", axis,
+              rank));
+
+      const int dim_value = shape[axis];
+      int start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i];
+      int end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i];
+      start = std::max(start, 0);
+      end = std::min(std::max(end, 0), dim_value);
+      PADDLE_ENFORCE_GT(end, start, platform::errors::InvalidArgument(
+                                        "end should greater than start"));
+
+      // Always scatter through `axis`: the bounds must be stored in
+      // dimension order even when `axes` names every dimension in a
+      // different order.
+      starts_extension[axis] = start;
+      ends_extension[axis] = end;
+    }
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto* in_data = in->data<T>();
+    auto* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    const int r = xpu::slice_forward(
+        dev_ctx.x_context(), shape.data(), starts_extension.data(),
+        ends_extension.data(), rank, in_data, out_data);
+    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                      platform::errors::External("XPU slice kernel error!"));
+  }
+};
+
+// XPU backward kernel of the `slice` operator.
+//
+// Normalizes the `axes` / `starts` / `ends` attributes against the shape of
+// the input gradient, copies the per-dimension shape, starts and ends into
+// device memory and calls `xpu::slice_backward` with the gradient of `Out`
+// as source and the gradient of `Input` as destination.
+template <typename DeviceContext, typename T>
+class SliceGradXPUKernel : public framework::OpKernel<T> {
+ public:
+  // Raises InvalidArgument if the attribute lengths disagree, if an axis is
+  // outside [0, rank) or if a normalized range is empty; raises External if
+  // a device allocation or the XPU slice call does not return XPU_SUCCESS.
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* d_in = ctx.Output<framework::Tensor>(framework::GradVarName("Input"));
+    d_in->mutable_data<T>(ctx.GetPlace());
+
+    const auto in_dims = d_in->dims();
+    const int rank = in_dims.size();
+    const auto& axes = ctx.Attr<std::vector<int>>("axes");
+    const auto& starts = ctx.Attr<std::vector<int>>("starts");
+    const auto& ends = ctx.Attr<std::vector<int>>("ends");
+
+    // `starts[i]` and `ends[i]` are indexed for every entry of `axes` below,
+    // so the three attributes must have the same length.
+    PADDLE_ENFORCE_EQ(
+        starts.size(), axes.size(),
+        platform::errors::InvalidArgument(
+            "The size of starts (%d) must equal the size of axes (%d).",
+            starts.size(), axes.size()));
+    PADDLE_ENFORCE_EQ(
+        ends.size(), axes.size(),
+        platform::errors::InvalidArgument(
+            "The size of ends (%d) must equal the size of axes (%d).",
+            ends.size(), axes.size()));
+
+    // The slice XPU kernel requires one (start, end) pair per dimension.
+    // Dimensions that are not listed in `axes` keep their full range
+    // [0, dim).
+    std::vector<int> shape(static_cast<size_t>(rank), 0);
+    std::vector<int> starts_extension(static_cast<size_t>(rank), 0);
+    std::vector<int> ends_extension(static_cast<size_t>(rank), 0);
+    for (int i = 0; i < rank; ++i) {
+      shape[i] = static_cast<int>(in_dims[i]);
+      ends_extension[i] = shape[i];
+    }
+
+    // If a negative value is passed for any of the start or end indices,
+    // it represents number of elements before the end of that dimension.
+    // An end larger than n (the number of elements in this dimension) is
+    // clamped to n.
+    for (size_t i = 0; i < axes.size(); ++i) {
+      const int axis = axes[i];
+      PADDLE_ENFORCE_GE(axis, 0,
+                        platform::errors::InvalidArgument(
+                            "The axis (%d) must not be negative.", axis));
+      PADDLE_ENFORCE_LT(
+          axis, rank,
+          platform::errors::InvalidArgument(
+              "The axis (%d) must be less than the input rank (%d).", axis,
+              rank));
+
+      const int dim_value = shape[axis];
+      int start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i];
+      int end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i];
+      start = std::max(start, 0);
+      end = std::min(std::max(end, 0), dim_value);
+      PADDLE_ENFORCE_GT(end, start, platform::errors::InvalidArgument(
+                                        "end should greater than start"));
+
+      // Always scatter through `axis`: the bounds must be stored in
+      // dimension order even when `axes` names every dimension in a
+      // different order.
+      starts_extension[axis] = start;
+      ends_extension[axis] = end;
+    }
+
+    // Owns one device allocation and releases it with xpu_free, so the
+    // buffer is returned even when a later PADDLE_ENFORCE throws.
+    struct XPUIntDeleter {
+      void operator()(int* ptr) const { xpu_free(ptr); }
+    };
+    using XPUIntBuffer = std::unique_ptr<int, XPUIntDeleter>;
+
+    const auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace());
+    // Allocates device memory for `host` and copies its contents there.
+    auto to_device =
+        [&xpu_place](const std::vector<int>& host) -> XPUIntBuffer {
+      const size_t bytes = host.size() * sizeof(int);
+      void* raw = nullptr;
+      PADDLE_ENFORCE_EQ(
+          xpu_malloc(&raw, bytes), XPU_SUCCESS,
+          platform::errors::External("XPU has no enough memory"));
+      XPUIntBuffer device(static_cast<int*>(raw));
+      memory::Copy(xpu_place, device.get(), platform::CPUPlace(), host.data(),
+                   bytes);
+      return device;
+    };
+    const XPUIntBuffer starts_device = to_device(starts_extension);
+    const XPUIntBuffer ends_device = to_device(ends_extension);
+    const XPUIntBuffer shape_device = to_device(shape);
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    const int r = xpu::slice_backward(
+        dev_ctx.x_context(), shape_device.get(), starts_device.get(),
+        ends_device.get(), rank, d_out->data<T>(), d_in->data<T>(),
+        d_in->numel(), d_out->numel());
+    // Wait before the device buffers can be released (they are freed when
+    // they go out of scope, including while an exception unwinds).
+    // NOTE(review): presumably slice_backward may still be reading the
+    // buffers until the context is idle - confirm against the XPU API.
+    dev_ctx.Wait();
+    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                      platform::errors::External("xpu slice kernel error"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+// Register the forward (`slice`) and backward (`slice_grad`) XPU kernels.
+// Both are instantiated for `float` only.
+REGISTER_OP_XPU_KERNEL(slice,
+                       ops::SliceXPUKernel<plat::XPUDeviceContext, float>);
+REGISTER_OP_XPU_KERNEL(
+    slice_grad, ops::SliceGradXPUKernel<plat::XPUDeviceContext, float>);
+#endif
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
@@ -0,0 +1,171 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+import paddle
+import paddle.fluid.core as core
+from op_test import OpTest
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+
+
+# Situation 1: starts(list, no tensor), ends(list, no tensor)
+# 1.1 without attr(decrease)
+class TestSliceOp(OpTest):
+    def setUp(self):
+        self.op_type = "slice"
+        self.config()
+        self.inputs = {'Input': self.input}
+        self.outputs = {'Out': self.out}
+        self.attrs = {
+            'axes': self.axes,
+            'starts': self.starts,
+            'ends': self.ends,
+            'infer_flags': self.infer_flags,
+            "use_xpu": True
+        }
+
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [1, 0, 2]
+        self.ends = [3, 3, 4]
+        self.axes = [0, 1, 2]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[1:3, 0:3, 2:4, :]
+
+    def test_check_output(self):
+        place = paddle.XPUPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad_normal(self):
+        place = paddle.XPUPlace(0)
+        self.check_grad_with_place(place, ['Input'], 'Out')
+
+
+class TestCase1(TestSliceOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [-3, 0, 2]
+        self.ends = [3, 100, -1]
+        self.axes = [0, 1, 2]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[-3:3, 0:100, 2:-1, :]
+
+
+class TestCase2(TestSliceOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [-3, 0, 2]
+        self.ends = [3, 100, -1]
+        self.axes = [0, 1, 3]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[-3:3, 0:100, :, 2:-1]
+
+
+# 1.2 with attr(decrease)
+class TestSliceOp_decs_dim(OpTest):
+    def setUp(self):
+        self.op_type = "slice"
+        self.config()
+        self.inputs = {'Input': self.input}
+        self.outputs = {'Out': self.out}
+        self.attrs = {
+            'axes': self.axes,
+            'starts': self.starts,
+            'ends': self.ends,
+            'infer_flags': self.infer_flags,
+            'decrease_axis': self.decrease_axis,
+            "use_xpu": True
+        }
+
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [1, 0, 2]
+        self.ends = [2, 3, 4]
+        self.axes = [0, 1, 2]
+        self.decrease_axis = [0]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[1, 0:3, 2:4, :]
+
+    def test_check_output(self):
+        place = paddle.XPUPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad_normal(self):
+        place = paddle.XPUPlace(0)
+        self.check_grad_with_place(place, ['Input'], 'Out')
+
+
+class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [1, 0, 2]
+        self.ends = [2, 1, 4]
+        self.axes = [0, 1, 2]
+        self.decrease_axis = [0, 1]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[1, 0, 2:4, :]
+
+
+class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [-1, 0, 2]
+        self.ends = [1000000, 1, 4]
+        self.axes = [0, 1, 2]
+        self.decrease_axis = [0, 1]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[-1, 0, 2:4, :]
+
+
+class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 7]).astype("float64")
+        self.starts = [0, 1, 2, 3]
+        self.ends = [1, 2, 3, 4]
+        self.axes = [0, 1, 2, 3]
+        self.decrease_axis = [0, 1, 2, 3]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[0, 1, 2, 3:4]
+
+
+class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [-1]
+        self.ends = [1000000]
+        self.axes = [3]
+        self.decrease_axis = [3]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[:, :, :, -1]
+
+
+class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [0, 1, 2, 3]
+        self.ends = [1, 2, 3, 4]
+        self.axes = [0, 1, 2, 3]
+        self.decrease_axis = [0, 1, 2, 3]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[0, 1, 2, 3:4]
+
+
+if __name__ == '__main__':
+    unittest.main()