add_reduce (#8173)

xiaoxiaohehe001 · web-flow · commit 2ffcb77df199 · 2022-01-04T10:07:21.000+08:00
diff --git a/lite/backends/metal/metal_kernel/texture/MaxKernel.metal b/lite/backends/metal/metal_kernel/texture/MaxKernel.metal
@@ -21,30 +21,6 @@ struct ArgParam {
     int orank;
 };
 
-kernel void reduce_max_c(texture2d_array<ftype, access::read> inTexture[[texture(0)]],
-    texture2d_array<ftype, access::write> outTexture[[texture(1)]],
-    uint3 gid[[thread_position_in_grid]]) {
-    if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
-        gid.z >= outTexture.get_array_size())
-        return;
-
-#if LITE_WITH_METAL_FULL
-    float omax = FLT_MIN;
-#else
-    half omax = HALF_MIN;
-#endif
-    uint iAL = inTexture.get_array_size();
-    for (uint i = 0; i < iAL; ++i) {
-        ftype4 in = inTexture.read(uint2(gid.x, gid.y), gid.z);
-        omax = max(omax, in.r);
-        omax = max(omax, in.g);
-        omax = max(omax, in.b);
-        omax = max(omax, in.a);
-    }
-
-    outTexture.write(ftype4(omax, 0.0, 0.0, 0.0), gid.xy, gid.z);
-}
-
 inline int max_index(texture2d_array<ftype, access::read> inTexture[[texture(0)]], uint2 gid) {
     int index = 0;
 #if LITE_WITH_METAL_FULL
diff --git a/lite/backends/metal/metal_kernel/texture/Reduce.metal b/lite/backends/metal/metal_kernel/texture/Reduce.metal
@@ -0,0 +1,123 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include "Common.metal"
+#include <metal_stdlib>
+
+using namespace metal;
+
+// Channel max: for each (x, y), folds all four components of every input slice into one
+// value and writes it to component x of output slice 0 (the other components are 0).
+kernel void reduce_max_c(texture2d_array<ftype, access::read> inTexture[[texture(0)]],
+    texture2d_array<ftype, access::write> outTexture[[texture(1)]],
+    uint3 gid[[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size())
+        return;
+
+    // Seed with the lowest finite value. FLT_MIN / HALF_MIN are the smallest POSITIVE
+    // normal numbers, so seeding with them yields a wrong maximum for all-negative input.
+#if LITE_WITH_METAL_FULL
+    float omax = -FLT_MAX;
+#else
+    half omax = -HALF_MAX;
+#endif
+    for (uint i = 0, iAL = inTexture.get_array_size(); i < iAL; ++i) {
+        ftype4 in = inTexture.read(uint2(gid.x, gid.y), i);
+        omax = max(omax, max(max(in.x, in.y), max(in.z, in.w)));
+    }
+    outTexture.write(ftype4(omax, 0.0, 0.0, 0.0), gid.xy, 0);
+}
+
+// Channel min: for each (x, y), folds the components of every input slice into one value
+// and writes it to component x of output slice 0 (the other components are 0).
+kernel void reduce_min_c(texture2d_array<ftype, access::read> inTexture[[texture(0)]],
+    texture2d_array<ftype, access::write> outTexture[[texture(1)]],
+    uint3 gid[[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size())
+        return;
+
+#if LITE_WITH_METAL_FULL
+    float omin = FLT_MAX;
+#else
+    half omin = HALF_MAX;
+#endif
+    uint iAL = inTexture.get_array_size();
+    for (uint i = 0; i < iAL - 1; ++i) {
+        ftype4 in = inTexture.read(uint2(gid.x, gid.y), i);
+        omin = min(omin, min(min(in.x, in.y), min(in.z, in.w)));
+    }
+    // Last slice: components with |v| <= 1e-6 are skipped, presumably because they are zero
+    // padding (TODO confirm). abs() wraps the value; wrapping the test drops every negative.
+    ftype4 in_ = inTexture.read(uint2(gid.x, gid.y), iAL - 1);
+    omin = abs(in_.x) <= 1e-6 ? omin : min(omin, in_.x);
+    omin = abs(in_.y) <= 1e-6 ? omin : min(omin, in_.y);
+    omin = abs(in_.z) <= 1e-6 ? omin : min(omin, in_.z);
+    omin = abs(in_.w) <= 1e-6 ? omin : min(omin, in_.w);
+    outTexture.write(ftype4(omin, 0.0, 0.0, 0.0), gid.xy, 0);
+}
+
+// Channel mean: for each (x, y), averages the components of every input slice and writes
+// the result to component x of output slice 0 (the other components are 0).
+kernel void reduce_mean_c(texture2d_array<ftype, access::read> inTexture[[texture(0)]],
+    texture2d_array<ftype, access::write> outTexture[[texture(1)]],
+    uint3 gid[[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size())
+        return;
+
+#if LITE_WITH_METAL_FULL
+    float omean = 0;
+#else
+    half omean = 0;
+#endif
+    uint iAL = inTexture.get_array_size();
+    uint count = 4 * (iAL - 1);  // every slice before the last contributes four values
+    for (uint i = 0; i < iAL; ++i) {
+        ftype4 in = inTexture.read(uint2(gid.x, gid.y), i);
+        omean = omean + in.x + in.y + in.z + in.w;
+    }
+    // Last slice: only |v| > 1e-6 is counted, presumably to skip zero padding (TODO confirm).
+    ftype4 in_ = inTexture.read(uint2(gid.x, gid.y), iAL - 1);
+    count = abs(in_.x) <= 1e-6 ? count : count + 1;  // abs() wraps the value, not the test
+    count = abs(in_.y) <= 1e-6 ? count : count + 1;
+    count = abs(in_.z) <= 1e-6 ? count : count + 1;
+    count = abs(in_.w) <= 1e-6 ? count : count + 1;
+    omean = count > 0 ? omean / count : omean;  // nothing counted: avoid dividing by zero
+    outTexture.write(ftype4(omean, 0.0, 0.0, 0.0), gid.xy, 0);
+}
+
+// Channel sum: for each (x, y), adds up all four components of every input slice and
+// writes the total to component x of output slice 0 (the other components are 0).
+kernel void reduce_sum_c(texture2d_array<ftype, access::read> inTexture[[texture(0)]],
+    texture2d_array<ftype, access::write> outTexture[[texture(1)]],
+    uint3 gid[[thread_position_in_grid]]) {
+    const bool inside = gid.x < outTexture.get_width() && gid.y < outTexture.get_height() &&
+                        gid.z < outTexture.get_array_size();
+    if (!inside) return;
+
+#if LITE_WITH_METAL_FULL
+    float total = 0;
+#else
+    half total = 0;
+#endif
+    const uint sliceCount = inTexture.get_array_size();
+    // Accumulate left to right, one slice at a time.
+    for (uint slice = 0; slice < sliceCount; ++slice) {
+        const ftype4 texel = inTexture.read(uint2(gid.x, gid.y), slice);
+        total = total + texel.x + texel.y + texel.z + texel.w;
+    }
+    outTexture.write(ftype4(total, 0.0, 0.0, 0.0), gid.xy, 0);
+}
diff --git a/lite/kernels/metal/image_op/reduce_image_compute.h b/lite/kernels/metal/image_op/reduce_image_compute.h
@@ -52,9 +52,11 @@ class ReduceImageCompute
     void* mps_input_image_{nullptr};
     void* mps_output_image_{nullptr};
 
+    template <typename T>
     void setup_with_mps();
     void setup_without_mps();
 
+    template <typename T>
     void run_with_mps();
     void run_without_mps();
 
diff --git a/lite/kernels/metal/image_op/reduce_image_compute.mm b/lite/kernels/metal/image_op/reduce_image_compute.mm
@@ -42,24 +42,44 @@
 
     // use mps or not
     bool should_use_mps = false;
-    if (@available(iOS 11.3, *)) {
+    auto reduce_type_ = KernelBase::op_type();
+
+    if (@available(iOS 11.3, macOS 10.13.4, macCatalyst 13.0, *)) {
         if (metal_context_->use_mps()) {
             should_use_mps = true;
         }
     }
+    if (input_buffer_->tensor_dim_[1] < 4) should_use_mps = false;
 
     use_mps_ = should_use_mps;
     if (use_mps_) {
-        setup_with_mps();
+        if (reduce_type_ == ("reduce_max")) {
+            setup_with_mps<MPSNNReduceFeatureChannelsMax>();
+        } else if (reduce_type_ == ("reduce_min")) {
+            setup_with_mps<MPSNNReduceFeatureChannelsMin>();
+        } else if (reduce_type_ == ("reduce_mean")) {
+            setup_with_mps<MPSNNReduceFeatureChannelsMean>();
+        } else if (reduce_type_ == ("reduce_sum")) {
+            setup_with_mps<MPSNNReduceFeatureChannelsSum>();
+        }
     } else {
         setup_without_mps();
     }
 }
 
 void ReduceImageCompute::Run() {
     @autoreleasepool {
+        auto reduce_type_ = KernelBase::op_type();
         if (use_mps_) {
-            run_with_mps();
+            if (reduce_type_ == ("reduce_max")) {
+                run_with_mps<MPSNNReduceFeatureChannelsMax>();
+            } else if (reduce_type_ == ("reduce_min")) {
+                run_with_mps<MPSNNReduceFeatureChannelsMin>();
+            } else if (reduce_type_ == ("reduce_mean")) {
+                run_with_mps<MPSNNReduceFeatureChannelsMean>();
+            } else if (reduce_type_ == ("reduce_sum")) {
+                run_with_mps<MPSNNReduceFeatureChannelsSum>();
+            }
         } else {
             run_without_mps();
         }
@@ -83,30 +103,39 @@
 void ReduceImageCompute::setup_without_mps() {
     const auto& param = this->Param<param_t>();
     auto irank = input_buffer_->tensor_dim_.size();
+    auto reduce_type_ = KernelBase::op_type();  // decides which kernel name is used below
 
     // only support reduce_max by channel
     if (param.dim.size() == 1 && param.dim[0] == 1 && param.keep_dim == true && irank == 4) {
     } else {
         LOG(FATAL) << "reduce: only support max by channel";
     }
 
-    function_name_ = "reduce_max_c";
+    if (reduce_type_ == ("reduce_max")) {  // the guard above runs for every op, not only max
+        function_name_ = "reduce_max_c";
+    } else if (reduce_type_ == ("reduce_min")) {
+        function_name_ = "reduce_min_c";
+    } else if (reduce_type_ == ("reduce_mean")) {
+        function_name_ = "reduce_mean_c";
+    } else if (reduce_type_ == ("reduce_sum")) {
+        function_name_ = "reduce_sum_c";
+    }  // any other op type leaves function_name_ as it was before this call
     // pipline
     auto backend = (__bridge MetalContextImp*)metal_context_->backend();
     pipline_ = [backend pipline:function_name_];
 }
 
 #pragma mark - MPS
 
+template <typename T>
 void ReduceImageCompute::run_with_mps() {
     auto backend = (__bridge MetalContextImp*)metal_context_->backend();
     auto cmdbuf = [backend commandBuffer];
     if (mps_op_) {
-        if (@available(iOS 11.3, *)) {
-            [((__bridge MPSNNReduceFeatureChannelsMax*)mps_op_)
-                encodeToCommandBuffer:cmdbuf
-                          sourceImage:(__bridge MPSImage*)mps_input_image_
-                     destinationImage:(__bridge MPSImage*)mps_output_image_];
+        if (@available(iOS 11.3, macOS 10.13.4, macCatalyst 13.0, *)) {
+            [((__bridge T*)mps_op_) encodeToCommandBuffer:cmdbuf
+                                              sourceImage:(__bridge MPSImage*)mps_input_image_
+                                         destinationImage:(__bridge MPSImage*)mps_output_image_];
         }
     }
     [backend commit:cmdbuf];
@@ -127,7 +156,9 @@
     }
 }
 
+template <typename T>
 void ReduceImageCompute::setup_with_mps() {
+    auto reduce_type_ = KernelBase::op_type();
     const auto& param = this->Param<param_t>();
     auto irank = input_buffer_->tensor_dim_.size();
     auto orank = output_buffer_->tensor_dim_.size();
@@ -139,9 +170,8 @@
     }
 
     auto backend = (__bridge MetalContextImp*)metal_context_->backend();
-    if (@available(iOS 11.3, *)) {
-        mps_op_ = (__bridge_retained void*)[[MPSNNReduceFeatureChannelsMax alloc]
-            initWithDevice:backend.device];
+    if (@available(iOS 11.3, macOS 10.13.4, macCatalyst 13.0, *)) {
+        mps_op_ = (__bridge_retained void*)[[T alloc] initWithDevice:backend.device];
         // MPS input and output
         auto input_c = MAX(4, static_cast<int>(input_buffer_->tensor_dim_[1]));
         mps_input_image_ =
@@ -218,3 +248,51 @@
             PRECISION(kFloat),
             DATALAYOUT(kMetalTexture2DArray))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(reduce_min,  // op type this kernel is registered for
+    kMetal,
+    kFloat,
+    kMetalTexture2DArray,
+    paddle::lite::kernels::metal::ReduceImageCompute,  // same class as reduce_sum/reduce_mean
+    def)
+    .BindInput("X",  // input "X": Metal target, float precision, texture-2D-array layout
+        {LiteType::GetTensorTy(TARGET(kMetal),
+            PRECISION(kFloat),
+            DATALAYOUT(kMetalTexture2DArray))})
+    .BindOutput("Out",  // output "Out": same target, precision and layout as the input
+        {LiteType::GetTensorTy(TARGET(kMetal),
+            PRECISION(kFloat),
+            DATALAYOUT(kMetalTexture2DArray))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(reduce_sum,  // op type this kernel is registered for
+    kMetal,
+    kFloat,
+    kMetalTexture2DArray,
+    paddle::lite::kernels::metal::ReduceImageCompute,  // same class as reduce_min/reduce_mean
+    def)
+    .BindInput("X",  // input "X": Metal target, float precision, texture-2D-array layout
+        {LiteType::GetTensorTy(TARGET(kMetal),
+            PRECISION(kFloat),
+            DATALAYOUT(kMetalTexture2DArray))})
+    .BindOutput("Out",  // output "Out": same target, precision and layout as the input
+        {LiteType::GetTensorTy(TARGET(kMetal),
+            PRECISION(kFloat),
+            DATALAYOUT(kMetalTexture2DArray))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(reduce_mean,  // op type this kernel is registered for
+    kMetal,
+    kFloat,
+    kMetalTexture2DArray,
+    paddle::lite::kernels::metal::ReduceImageCompute,  // same class as reduce_min/reduce_sum
+    def)
+    .BindInput("X",  // input "X": Metal target, float precision, texture-2D-array layout
+        {LiteType::GetTensorTy(TARGET(kMetal),
+            PRECISION(kFloat),
+            DATALAYOUT(kMetalTexture2DArray))})
+    .BindOutput("Out",  // output "Out": same target, precision and layout as the input
+        {LiteType::GetTensorTy(TARGET(kMetal),
+            PRECISION(kFloat),
+            DATALAYOUT(kMetalTexture2DArray))})
+    .Finalize();