Skip to content

Commit e72fea1

Browse files
committed
Change code according to PR suggestions about the transpose file
1 parent 8cf2c83 commit e72fea1

File tree

9 files changed

+227
-288
lines changed

9 files changed

+227
-288
lines changed

paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
#include "paddle/fluid/framework/op_registry.h"
1717
#include "paddle/fluid/memory/malloc.h"
1818
#include "paddle/fluid/platform/mkldnn_reuse.h"
19-
#include "paddle/phi/kernels/funcs/transpose_functor.h"
2019

2120
namespace paddle {
2221
namespace operators {

paddle/fluid/operators/transpose_op.cc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ limitations under the License. */
2020
#include "paddle/fluid/platform/mkldnn_helper.h"
2121
#endif
2222
#include "paddle/fluid/framework/op_registry.h"
23-
#include "paddle/phi/kernels/funcs/transpose_functor.h"
2423

2524
namespace paddle {
2625
namespace operators {

paddle/fluid/operators/transpose_op_mlu.cc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
1313
limitations under the License. */
1414

1515
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
16-
#include "paddle/phi/kernels/funcs/transpose_functor.h"
1716

1817
namespace paddle {
1918
namespace operators {

paddle/fluid/operators/unique_op.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ limitations under the License. */
2424
#include "paddle/fluid/framework/op_registry.h"
2525
#include "paddle/fluid/operators/math/concat_and_split.h"
2626
#include "paddle/phi/kernels/funcs/math_function.h"
27-
#include "paddle/phi/kernels/funcs/transpose_functor.h"
2827

2928
namespace paddle {
3029
namespace operators {

paddle/phi/kernels/autotune/auto_tune_base.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ class AutoTuneBase {
123123
float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) {
124124
// Regard 1st run as warmup, judge the compare result by the time cost
125125
// of rest cycles.
126-
constexpr int repeats = 4;
126+
constexpr int repeats = 6;
127127
phi::GpuTimer timer;
128128
float time_cost = 0;
129129
const auto& stream = ctx.stream();

paddle/phi/kernels/funcs/broadcast_function.h

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ struct LoaderTypeClassifier {
8383
};
8484

8585
#ifndef PADDLE_WITH_XPU_KP
86+
// Common broadcast/elementwise Loader.
8687
template <typename T, int VecSize, int Arity, bool IsBoundary, int LoadType>
8788
struct BroadcastDataLoader {
8889
__device__ __forceinline__ void operator()(
@@ -107,6 +108,7 @@ struct BroadcastDataLoader {
107108
}
108109
};
109110

111+
// Scalar elementwise Loader with consideration of IsBoundary.
110112
template <typename T, int VecSize, int Arity>
111113
struct BroadcastDataLoader<T, VecSize, Arity, true, kElementwise> {
112114
__device__ __forceinline__ void operator()(
@@ -117,17 +119,12 @@ struct BroadcastDataLoader<T, VecSize, Arity, true, kElementwise> {
117119
const int block_offset,
118120
const int num,
119121
const uint32_t numel) {
120-
#pragma unroll
121-
for (int i = 0; i < Arity; ++i) {
122-
#pragma unroll
123-
kps::Init<T, VecSize>(args[i], static_cast<T>(1));
124-
}
125-
126122
int thread_offset = threadIdx.x * VecSize + block_offset;
127123
#pragma unroll
128124
for (int i = 0; i < Arity; ++i) {
129125
#pragma unroll
130126
for (int idx = 0; idx < VecSize; ++idx) {
127+
args[i][idx] = static_cast<T>(1);
131128
int index = thread_offset + idx;
132129
if (index < numel) {
133130
args[i][idx] = ins[i][index];
@@ -137,6 +134,7 @@ struct BroadcastDataLoader<T, VecSize, Arity, true, kElementwise> {
137134
}
138135
};
139136

137+
// Vectorized elementwise Loader without consideration of IsBoundary.
140138
template <typename T, int VecSize, int Arity>
141139
struct BroadcastDataLoader<T, VecSize, Arity, false, kElementwise> {
142140
__device__ __forceinline__ void operator()(
@@ -164,6 +162,7 @@ struct BroadcastDataLoader<T, VecSize, Arity, false, kElementwise> {
164162
}
165163
};
166164

165+
// Common broadcast data loader.
167166
template <typename T, int VecSize, int Arity, bool IsBoundary>
168167
struct BroadcastDataLoader<T, VecSize, Arity, IsBoundary, kBroadcast> {
169168
__device__ __forceinline__ void operator()(
@@ -405,11 +404,10 @@ void LaunchBroadcastKernel(
405404
auto gpu_config =
406405
phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize);
407406
auto stream = ctx.stream();
408-
auto threads = gpu_config.thread_per_block;
407+
auto threads = gpu_config.GetBlockSize();
409408
auto blocks = gpu_config.block_per_grid;
410-
int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize *
411-
gpu_config.GetBlockSize();
412-
int tail_tid = numel % (VecSize * gpu_config.GetBlockSize());
409+
int main_offset = (numel / (VecSize * threads)) * VecSize * threads;
410+
int tail_tid = numel % (VecSize * threads);
413411

414412
if (loader_classifier.all_elementwise) {
415413
VectorizedBroadcastKernel<Func,

paddle/phi/kernels/funcs/dims_simplifier.h

Lines changed: 10 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -34,18 +34,6 @@ struct BroadcastDimsSimplifier {
3434
BroadcastDimsSimplifier(const std::vector<const DenseTensor *> &ins,
3535
const phi::DDim &dims,
3636
int axis) {
37-
if (!NeedBroadcast(ins, dims)) {
38-
int64_t numel = phi::product(dims);
39-
rank = 1;
40-
N = ins.size();
41-
out_dims = DimVector{numel};
42-
in_dims.resize(N);
43-
for (int64_t i = 0; i < N; ++i) {
44-
in_dims[i] = DimVector{numel};
45-
}
46-
return;
47-
}
48-
4937
N = std::max(static_cast<int>(ins.size()), 2);
5038
in_dims.resize(N);
5139
rank = dims.size();
@@ -273,18 +261,18 @@ struct BroadcastDimsSimplifier {
273261
};
274262

275263
// Simplify the input dims and permute dims if possible.
276-
struct DimsSimplifier {
264+
struct PermuteDimsSimplifier {
277265
public:
278-
explicit DimsSimplifier(const int rank,
279-
const int64_t numel,
280-
const std::vector<int32_t> &perm,
281-
const std::vector<int64_t> &dims)
266+
PermuteDimsSimplifier(const int rank,
267+
const int64_t numel,
268+
const std::vector<int32_t> &perm,
269+
const std::vector<int64_t> &dims)
282270
: perm_(rank), src_dims_(rank), count_(numel) {
283271
SimplifyPermAndDims(rank, dims, perm);
284272
perm_.resize(rank_);
285273
src_dims_.resize(rank_);
286274
dst_dims_.resize(rank_);
287-
if (!is_seq_perm_) {
275+
if (!is_sequence_perm_) {
288276
for (auto i = 0; i < rank_; ++i) {
289277
dst_dims_[i] = src_dims_[perm_[i]];
290278
}
@@ -294,7 +282,7 @@ struct DimsSimplifier {
294282
}
295283
}
296284

297-
~DimsSimplifier() = default;
285+
~PermuteDimsSimplifier() = default;
298286

299287
const int &GetRank() const { return rank_; }
300288
const int64_t &GetCount() const { return count_; }
@@ -305,8 +293,8 @@ struct DimsSimplifier {
305293
private:
306294
int rank_{1};
307295
int64_t count_{0};
308-
bool is_seq_perm_{true};
309296
std::vector<int> perm_;
297+
bool is_sequence_perm_{true};
310298
std::vector<int64_t> src_dims_;
311299
std::vector<int64_t> dst_dims_;
312300

@@ -365,11 +353,11 @@ struct DimsSimplifier {
365353
const int mapped = valid_map[perm[i]];
366354
if (mapped >= 0) {
367355
perm_[perm_idx] = mapped;
368-
is_seq_perm_ &= (mapped == perm_idx);
356+
is_sequence_perm_ &= (mapped == perm_idx);
369357
perm_idx += 1;
370358
}
371359
}
372-
rank_ = is_seq_perm_ ? 1 : valid_dim_idx;
360+
rank_ = is_sequence_perm_ ? 1 : valid_dim_idx;
373361
}
374362
};
375363

0 commit comments

Comments
 (0)