Skip to content

Commit 2ffcb77

Browse files
add_reduce (#8173)
1 parent 9054d00 commit 2ffcb77

4 files changed

Lines changed: 215 additions & 36 deletions

File tree

lite/backends/metal/metal_kernel/texture/MaxKernel.metal

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -21,30 +21,6 @@ struct ArgParam {
2121
int orank;
2222
};
2323

24-
kernel void reduce_max_c(texture2d_array<ftype, access::read> inTexture[[texture(0)]],
25-
texture2d_array<ftype, access::write> outTexture[[texture(1)]],
26-
uint3 gid[[thread_position_in_grid]]) {
27-
if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
28-
gid.z >= outTexture.get_array_size())
29-
return;
30-
31-
#if LITE_WITH_METAL_FULL
32-
float omax = FLT_MIN;
33-
#else
34-
half omax = HALF_MIN;
35-
#endif
36-
uint iAL = inTexture.get_array_size();
37-
for (uint i = 0; i < iAL; ++i) {
38-
ftype4 in = inTexture.read(uint2(gid.x, gid.y), gid.z);
39-
omax = max(omax, in.r);
40-
omax = max(omax, in.g);
41-
omax = max(omax, in.b);
42-
omax = max(omax, in.a);
43-
}
44-
45-
outTexture.write(ftype4(omax, 0.0, 0.0, 0.0), gid.xy, gid.z);
46-
}
47-
4824
inline int max_index(texture2d_array<ftype, access::read> inTexture[[texture(0)]], uint2 gid) {
4925
int index = 0;
5026
#if LITE_WITH_METAL_FULL
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License. */
14+
15+
#include "Common.metal"
16+
#include <metal_stdlib>
17+
18+
using namespace metal;
19+
20+
kernel void reduce_max_c(texture2d_array<ftype, access::read> inTexture[[texture(0)]],
21+
texture2d_array<ftype, access::write> outTexture[[texture(1)]],
22+
uint3 gid[[thread_position_in_grid]]) {
23+
if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
24+
gid.z >= outTexture.get_array_size())
25+
return;
26+
27+
#if LITE_WITH_METAL_FULL
28+
float omax = FLT_MIN;
29+
#else
30+
half omax = HALF_MIN;
31+
#endif
32+
uint iAL = inTexture.get_array_size();
33+
for (uint i = 0; i < iAL; ++i) {
34+
ftype4 in = inTexture.read(uint2(gid.x, gid.y), i);
35+
omax = max(omax, in.x);
36+
omax = max(omax, in.y);
37+
omax = max(omax, in.z);
38+
omax = max(omax, in.w);
39+
}
40+
outTexture.write(ftype4(omax, 0.0, 0.0, 0.0), gid.xy, 0);
41+
}
42+
43+
kernel void reduce_min_c(texture2d_array<ftype, access::read> inTexture[[texture(0)]],
44+
texture2d_array<ftype, access::write> outTexture[[texture(1)]],
45+
uint3 gid[[thread_position_in_grid]]) {
46+
if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
47+
gid.z >= outTexture.get_array_size())
48+
return;
49+
50+
#if LITE_WITH_METAL_FULL
51+
float omin = FLT_MAX;
52+
#else
53+
half omin = HALF_MAX;
54+
#endif
55+
56+
uint iAL = inTexture.get_array_size();
57+
for (uint i = 0; i < iAL - 1; ++i) {
58+
ftype4 in = inTexture.read(uint2(gid.x, gid.y), i);
59+
omin = min(omin, in.x);
60+
omin = min(omin, in.y);
61+
omin = min(omin, in.z);
62+
omin = min(omin, in.w);
63+
}
64+
ftype4 in_ = inTexture.read(uint2(gid.x, gid.y), iAL - 1);
65+
omin = abs(in_.x <= 1e-6) ? omin : min(omin, in_.x);
66+
omin = abs(in_.y <= 1e-6) ? omin : min(omin, in_.y);
67+
omin = abs(in_.z <= 1e-6) ? omin : min(omin, in_.z);
68+
omin = abs(in_.w <= 1e-6) ? omin : min(omin, in_.w);
69+
outTexture.write(ftype4(omin, 0.0, 0.0, 0.0), gid.xy, 0);
70+
}
71+
72+
kernel void reduce_mean_c(texture2d_array<ftype, access::read> inTexture[[texture(0)]],
73+
texture2d_array<ftype, access::write> outTexture[[texture(1)]],
74+
uint3 gid[[thread_position_in_grid]]) {
75+
if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
76+
gid.z >= outTexture.get_array_size())
77+
return;
78+
79+
#if LITE_WITH_METAL_FULL
80+
float omean = 0;
81+
#else
82+
half omean = 0;
83+
#endif
84+
uint iAL = inTexture.get_array_size();
85+
uint count = 4 * (iAL - 1);
86+
for (uint i = 0; i < iAL; ++i) {
87+
ftype4 in = inTexture.read(uint2(gid.x, gid.y), i);
88+
omean += in.x;
89+
omean += in.y;
90+
omean += in.z;
91+
omean += in.w;
92+
}
93+
ftype4 in_ = inTexture.read(uint2(gid.x, gid.y), iAL - 1);
94+
count = abs(in_.x <= 1e-6) ? count : count + 1;
95+
count = abs(in_.y <= 1e-6) ? count : count + 1;
96+
count = abs(in_.z <= 1e-6) ? count : count + 1;
97+
count = abs(in_.w <= 1e-6) ? count : count + 1;
98+
omean = omean / count;
99+
outTexture.write(ftype4(omean, 0.0, 0.0, 0.0), gid.xy, 0);
100+
}
101+
102+
kernel void reduce_sum_c(texture2d_array<ftype, access::read> inTexture[[texture(0)]],
103+
texture2d_array<ftype, access::write> outTexture[[texture(1)]],
104+
uint3 gid[[thread_position_in_grid]]) {
105+
if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
106+
gid.z >= outTexture.get_array_size())
107+
return;
108+
109+
#if LITE_WITH_METAL_FULL
110+
float osum = 0;
111+
#else
112+
half osum = 0;
113+
#endif
114+
uint iAL = inTexture.get_array_size();
115+
for (uint i = 0; i < iAL; ++i) {
116+
ftype4 in = inTexture.read(uint2(gid.x, gid.y), i);
117+
osum += in.x;
118+
osum += in.y;
119+
osum += in.z;
120+
osum += in.w;
121+
}
122+
outTexture.write(ftype4(osum, 0.0, 0.0, 0.0), gid.xy, 0);
123+
}

lite/kernels/metal/image_op/reduce_image_compute.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,11 @@ class ReduceImageCompute
5252
void* mps_input_image_{nullptr};
5353
void* mps_output_image_{nullptr};
5454

55+
template <typename T>
5556
void setup_with_mps();
5657
void setup_without_mps();
5758

59+
template <typename T>
5860
void run_with_mps();
5961
void run_without_mps();
6062

lite/kernels/metal/image_op/reduce_image_compute.mm

Lines changed: 90 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -42,24 +42,44 @@
4242

4343
// use mps or not
4444
bool should_use_mps = false;
45-
if (@available(iOS 11.3, *)) {
45+
auto reduce_type_ = KernelBase::op_type();
46+
47+
if (@available(iOS 11.3, macOS 10.13.4, macCatalyst 13.0, *)) {
4648
if (metal_context_->use_mps()) {
4749
should_use_mps = true;
4850
}
4951
}
52+
if (input_buffer_->tensor_dim_[1] < 4) should_use_mps = false;
5053

5154
use_mps_ = should_use_mps;
5255
if (use_mps_) {
53-
setup_with_mps();
56+
if (reduce_type_ == ("reduce_max")) {
57+
setup_with_mps<MPSNNReduceFeatureChannelsMax>();
58+
} else if (reduce_type_ == ("reduce_min")) {
59+
setup_with_mps<MPSNNReduceFeatureChannelsMin>();
60+
} else if (reduce_type_ == ("reduce_mean")) {
61+
setup_with_mps<MPSNNReduceFeatureChannelsMean>();
62+
} else if (reduce_type_ == ("reduce_sum")) {
63+
setup_with_mps<MPSNNReduceFeatureChannelsSum>();
64+
}
5465
} else {
5566
setup_without_mps();
5667
}
5768
}
5869

5970
void ReduceImageCompute::Run() {
6071
@autoreleasepool {
72+
auto reduce_type_ = KernelBase::op_type();
6173
if (use_mps_) {
62-
run_with_mps();
74+
if (reduce_type_ == ("reduce_max")) {
75+
run_with_mps<MPSNNReduceFeatureChannelsMax>();
76+
} else if (reduce_type_ == ("reduce_min")) {
77+
run_with_mps<MPSNNReduceFeatureChannelsMin>();
78+
} else if (reduce_type_ == ("reduce_mean")) {
79+
run_with_mps<MPSNNReduceFeatureChannelsMean>();
80+
} else if (reduce_type_ == ("reduce_sum")) {
81+
run_with_mps<MPSNNReduceFeatureChannelsSum>();
82+
}
6383
} else {
6484
run_without_mps();
6585
}
@@ -83,30 +103,39 @@
83103
void ReduceImageCompute::setup_without_mps() {
84104
const auto& param = this->Param<param_t>();
85105
auto irank = input_buffer_->tensor_dim_.size();
106+
auto reduce_type_ = KernelBase::op_type();
86107

87108
// only support reduce_max by channel
88109
if (param.dim.size() == 1 && param.dim[0] == 1 && param.keep_dim == true && irank == 4) {
89110
} else {
90111
LOG(FATAL) << "reduce: only support max by channel";
91112
}
92113

93-
function_name_ = "reduce_max_c";
114+
if (reduce_type_ == ("reduce_max")) {
115+
function_name_ = "reduce_max_c";
116+
} else if (reduce_type_ == ("reduce_min")) {
117+
function_name_ = "reduce_min_c";
118+
} else if (reduce_type_ == ("reduce_mean")) {
119+
function_name_ = "reduce_mean_c";
120+
} else if (reduce_type_ == ("reduce_sum")) {
121+
function_name_ = "reduce_sum_c";
122+
}
94123
// pipline
95124
auto backend = (__bridge MetalContextImp*)metal_context_->backend();
96125
pipline_ = [backend pipline:function_name_];
97126
}
98127

99128
#pragma mark - MPS
100129

130+
template <typename T>
101131
void ReduceImageCompute::run_with_mps() {
102132
auto backend = (__bridge MetalContextImp*)metal_context_->backend();
103133
auto cmdbuf = [backend commandBuffer];
104134
if (mps_op_) {
105-
if (@available(iOS 11.3, *)) {
106-
[((__bridge MPSNNReduceFeatureChannelsMax*)mps_op_)
107-
encodeToCommandBuffer:cmdbuf
108-
sourceImage:(__bridge MPSImage*)mps_input_image_
109-
destinationImage:(__bridge MPSImage*)mps_output_image_];
135+
if (@available(iOS 11.3, macOS 10.13.4, macCatalyst 13.0, *)) {
136+
[((__bridge T*)mps_op_) encodeToCommandBuffer:cmdbuf
137+
sourceImage:(__bridge MPSImage*)mps_input_image_
138+
destinationImage:(__bridge MPSImage*)mps_output_image_];
110139
}
111140
}
112141
[backend commit:cmdbuf];
@@ -127,7 +156,9 @@
127156
}
128157
}
129158

159+
template <typename T>
130160
void ReduceImageCompute::setup_with_mps() {
161+
auto reduce_type_ = KernelBase::op_type();
131162
const auto& param = this->Param<param_t>();
132163
auto irank = input_buffer_->tensor_dim_.size();
133164
auto orank = output_buffer_->tensor_dim_.size();
@@ -139,9 +170,8 @@
139170
}
140171

141172
auto backend = (__bridge MetalContextImp*)metal_context_->backend();
142-
if (@available(iOS 11.3, *)) {
143-
mps_op_ = (__bridge_retained void*)[[MPSNNReduceFeatureChannelsMax alloc]
144-
initWithDevice:backend.device];
173+
if (@available(iOS 11.3, macOS 10.13.4, macCatalyst 13.0, *)) {
174+
mps_op_ = (__bridge_retained void*)[[T alloc] initWithDevice:backend.device];
145175
// MPS input and output
146176
auto input_c = MAX(4, static_cast<int>(input_buffer_->tensor_dim_[1]));
147177
mps_input_image_ =
@@ -218,3 +248,51 @@
218248
PRECISION(kFloat),
219249
DATALAYOUT(kMetalTexture2DArray))})
220250
.Finalize();
251+
252+
REGISTER_LITE_KERNEL(reduce_min,
253+
kMetal,
254+
kFloat,
255+
kMetalTexture2DArray,
256+
paddle::lite::kernels::metal::ReduceImageCompute,
257+
def)
258+
.BindInput("X",
259+
{LiteType::GetTensorTy(TARGET(kMetal),
260+
PRECISION(kFloat),
261+
DATALAYOUT(kMetalTexture2DArray))})
262+
.BindOutput("Out",
263+
{LiteType::GetTensorTy(TARGET(kMetal),
264+
PRECISION(kFloat),
265+
DATALAYOUT(kMetalTexture2DArray))})
266+
.Finalize();
267+
268+
REGISTER_LITE_KERNEL(reduce_sum,
269+
kMetal,
270+
kFloat,
271+
kMetalTexture2DArray,
272+
paddle::lite::kernels::metal::ReduceImageCompute,
273+
def)
274+
.BindInput("X",
275+
{LiteType::GetTensorTy(TARGET(kMetal),
276+
PRECISION(kFloat),
277+
DATALAYOUT(kMetalTexture2DArray))})
278+
.BindOutput("Out",
279+
{LiteType::GetTensorTy(TARGET(kMetal),
280+
PRECISION(kFloat),
281+
DATALAYOUT(kMetalTexture2DArray))})
282+
.Finalize();
283+
284+
REGISTER_LITE_KERNEL(reduce_mean,
285+
kMetal,
286+
kFloat,
287+
kMetalTexture2DArray,
288+
paddle::lite::kernels::metal::ReduceImageCompute,
289+
def)
290+
.BindInput("X",
291+
{LiteType::GetTensorTy(TARGET(kMetal),
292+
PRECISION(kFloat),
293+
DATALAYOUT(kMetalTexture2DArray))})
294+
.BindOutput("Out",
295+
{LiteType::GetTensorTy(TARGET(kMetal),
296+
PRECISION(kFloat),
297+
DATALAYOUT(kMetalTexture2DArray))})
298+
.Finalize();

0 commit comments

Comments
 (0)