Skip to content

Commit 527779a

Browse files
committed
remove useless while loop and optimize variable name, test=develop
1 parent ad79dff commit 527779a

2 files changed

Lines changed: 68 additions & 56 deletions

File tree

paddle/fluid/operators/amp/check_finite_and_unscale_op.cu

Lines changed: 30 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -39,21 +39,24 @@ __global__ void CheckFiniteAndUnscale(const T** xs, const MT* scale,
3939
__syncthreads();
4040

4141
const int64_t num = s_starts[size];
42-
int pre_xs_index = 0;
42+
int xs_index = 0;
4343
bool t_found_inf = false;
4444
const MT t_scale = *scale;
4545
for (int64_t idx = tid; idx < num; idx += gridDim.x * blockDim.x) {
46-
// get the xs's index of thread
47-
int xs_index = pre_xs_index;
48-
while (idx < s_starts[xs_index]) xs_index++;
49-
// avoid some tensor's numel is zero
50-
while (idx >= s_starts[xs_index]) xs_index++;
51-
pre_xs_index = xs_index - 1;
46+
// get the "xs" index of "idx"
47+
// For example:
48+
// idx = 15, starts = [0, 10, 10, 20, 30]
49+
// because 10 <= idx < 20 ==>
50+
// the idx element is located in the 3rd tensor (note that the 2nd tensor size is
51+
// 0)
52+
int next_xs_index = xs_index;
53+
while (idx >= s_starts[next_xs_index]) next_xs_index++;
54+
xs_index = next_xs_index - 1;
5255

5356
// get in data and out data
54-
const T* in = xs[pre_xs_index];
55-
T* out = outs[pre_xs_index];
56-
int64_t in_idx = idx - s_starts[pre_xs_index];
57+
const T* in = xs[xs_index];
58+
T* out = outs[xs_index];
59+
int64_t in_idx = idx - s_starts[xs_index];
5760

5861
// Unscale
5962
MT val = static_cast<MT>(in[in_idx]) * t_scale;
@@ -94,28 +97,30 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
9497
scale_data, inverse_scale_v, found_inf_data);
9598

9699
size_t xs_size = xs.size();
100+
const auto& cpu_place = platform::CPUPlace();
97101
// calculate each tensor's start index and copy to device
98102
auto h_starts_tensor =
99-
memory::Alloc(platform::CPUPlace(), (xs_size + 1) * sizeof(int64_t));
103+
memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
100104
int64_t* h_starts = reinterpret_cast<int64_t*>(h_starts_tensor->ptr());
101105

102106
auto d_starts_tensor =
103107
memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t));
104108
int64_t* d_starts = reinterpret_cast<int64_t*>(d_starts_tensor->ptr());
105109

110+
// the start index value of each tensor is
111+
// the sum of the sizes of all previous tensors. For example:
112+
// xs = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30]
106113
h_starts[0] = 0;
107114
for (int i = 1; i <= xs_size; i++) {
108-
// the start index value of each tensor is
109-
// the sum of previous tensor's size
110115
h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel();
111116
}
112117
int64_t total_num = h_starts[xs_size];
113118
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
114-
d_starts, platform::CPUPlace(), h_starts,
115-
(xs_size + 1) * sizeof(int64_t), dev_ctx.stream());
119+
d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t),
120+
dev_ctx.stream());
116121

117122
// copy each tensor's data address to device
118-
auto h_mem = memory::Alloc(platform::CPUPlace(), 2 * xs_size * sizeof(T*));
123+
auto h_mem = memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*));
119124
const T** h_xs = reinterpret_cast<const T**>(h_mem->ptr());
120125
T** h_outs = reinterpret_cast<T**>(h_mem->ptr()) + xs_size;
121126

@@ -128,16 +133,18 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
128133
h_outs[i] = outs[i]->mutable_data<T>(dev_ctx.GetPlace());
129134
}
130135
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), d_xs,
131-
platform::CPUPlace(), h_xs, 2 * xs_size * sizeof(T*),
132-
dev_ctx.stream());
136+
cpu_place, h_xs, 2 * xs_size * sizeof(T*), dev_ctx.stream());
133137

134138
// Launch Kernel
135-
int block = 1024;
136-
int block_num = block * 20; // each thread deal with 20 number
137-
int grid = (total_num + block_num - 1) / block_num;
139+
int threads_per_block = std::min(static_cast<int64_t>(1024), total_num);
140+
int elements_per_block =
141+
threads_per_block * 20; // each thread deal with 20 number
142+
int blocks_per_grid =
143+
(total_num + elements_per_block - 1) / elements_per_block;
138144
VLOG(3) << "launch kernel";
139-
CheckFiniteAndUnscale<T, MPDType><<<
140-
grid, block, (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()>>>(
145+
CheckFiniteAndUnscale<
146+
T, MPDType><<<blocks_per_grid, threads_per_block,
147+
(xs_size + 1) * sizeof(int64_t), dev_ctx.stream()>>>(
141148
d_xs, inverse_scale_v, xs_size, d_starts, found_inf_data, d_outs);
142149
VLOG(3) << "finish kernel";
143150
}

paddle/fluid/operators/amp/update_loss_scaling_op.cu

Lines changed: 38 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -42,26 +42,28 @@ __global__ void FusedFillIf(T** outs, const size_t xs_size,
4242
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
4343

4444
// copy starts array from global memory to shared memory
45-
extern __shared__ int64_t starts_s[];
45+
extern __shared__ int64_t s_starts[];
4646
for (int i = threadIdx.x; i <= xs_size; i += blockDim.x) {
47-
starts_s[i] = starts[i];
47+
s_starts[i] = starts[i];
4848
}
4949
__syncthreads();
5050

51-
const int64_t total_num = starts_s[xs_size];
51+
const int64_t total_num = s_starts[xs_size];
5252
int out_index = 0;
5353

5454
for (int64_t id = tid; id < total_num; id += blockDim.x * gridDim.x) {
5555
// get the "out" index of "id"
56+
// For example:
57+
// id = 15, starts = [0, 10, 10, 20, 30]
58+
// because 10 <= id < 20 ==>
59+
// the id element is located in the 3rd tensor (note that the 2nd tensor size is 0)
5660
int next_out_index = out_index;
57-
while (id < starts_s[next_out_index]) next_out_index++;
58-
// avoid some tensor's numel is zero
59-
while (id >= starts_s[next_out_index]) next_out_index++;
61+
while (id >= s_starts[next_out_index]) next_out_index++;
6062
out_index = next_out_index - 1;
6163

6264
// get data pointer and index
6365
T* out_data = outs[out_index];
64-
int64_t idx = id - starts_s[out_index];
66+
int64_t idx = id - s_starts[out_index];
6567

6668
// set value
6769
out_data[idx] = value;
@@ -93,48 +95,51 @@ class LazyZeros<platform::CUDADeviceContext, T> {
9395
const std::vector<const framework::Tensor*>& xs,
9496
const std::vector<framework::Tensor*>& outs) const {
9597
size_t xs_size = xs.size();
98+
const auto& cpu_place = platform::CPUPlace();
9699
// alloc each tensor's start index and copy to device
97-
auto starts_h_tensor =
98-
memory::Alloc(platform::CPUPlace(), (xs_size + 1) * sizeof(int64_t));
99-
int64_t* starts_h = reinterpret_cast<int64_t*>(starts_h_tensor->ptr());
100+
auto h_in_starts_mem =
101+
memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
102+
int64_t* h_starts = reinterpret_cast<int64_t*>(h_in_starts_mem->ptr());
100103

101-
auto starts_d_tensor =
104+
auto d_in_starts_mem =
102105
memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t));
103-
int64_t* starts_d = reinterpret_cast<int64_t*>(starts_d_tensor->ptr());
106+
int64_t* d_starts = reinterpret_cast<int64_t*>(d_in_starts_mem->ptr());
104107

105-
starts_h[0] = 0;
108+
// the start index value of each tensor is
109+
// the sum of the sizes of all previous tensors. For example:
110+
// outs = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30]
111+
h_starts[0] = 0;
106112
for (int i = 0; i < xs_size; i++) {
107-
// the start index value of each tensor is
108-
// the sum of previous tensor's size
109-
starts_h[i + 1] = starts_h[i] + outs[i]->numel();
113+
h_starts[i + 1] = h_starts[i] + outs[i]->numel();
110114
}
111115
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
112-
starts_d, platform::CPUPlace(), starts_h,
113-
(xs_size + 1) * sizeof(int64_t), dev_ctx.stream());
116+
d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t),
117+
dev_ctx.stream());
114118

115119
// copy the data address of each tensor in "outs" to device
116-
auto outs_addr_h_tensor =
117-
memory::Alloc(platform::CPUPlace(), xs_size * sizeof(T*));
118-
T** outs_addr_h = reinterpret_cast<T**>(outs_addr_h_tensor->ptr());
120+
auto h_out_addrs_tensor = memory::Alloc(cpu_place, xs_size * sizeof(T*));
121+
T** h_out_addrs = reinterpret_cast<T**>(h_out_addrs_tensor->ptr());
119122

120-
auto outs_addr_d_tensor = memory::Alloc(dev_ctx, xs_size * sizeof(T*));
121-
T** outs_addr_d = reinterpret_cast<T**>(outs_addr_d_tensor->ptr());
123+
auto d_out_addrs_tensor = memory::Alloc(dev_ctx, xs_size * sizeof(T*));
124+
T** d_out_addrs = reinterpret_cast<T**>(d_out_addrs_tensor->ptr());
122125

123126
for (size_t i = 0; i < xs_size; ++i) {
124-
outs_addr_h[i] = outs[i]->mutable_data<T>(dev_ctx.GetPlace());
127+
h_out_addrs[i] = outs[i]->mutable_data<T>(dev_ctx.GetPlace());
125128
}
126129
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
127-
outs_addr_d, platform::CPUPlace(), outs_addr_h,
128-
xs_size * sizeof(T*), dev_ctx.stream());
130+
d_out_addrs, cpu_place, h_out_addrs, xs_size * sizeof(T*),
131+
dev_ctx.stream());
129132

130133
// launch cuda kernel
131-
int64_t total_num = starts_h[xs_size];
132-
int64_t block = std::min(static_cast<int64_t>(1024), total_num);
133-
int64_t block_num = block * 50; // each thread deal with 50 data
134-
int64_t grid = (total_num + block_num - 1) / block_num;
135-
FusedFillIf<
136-
T><<<grid, block, (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()>>>(
137-
outs_addr_d, xs_size, starts_d, static_cast<T>(0), found_inf_data);
134+
int64_t total_num = h_starts[xs_size];
135+
int64_t threads_per_block = std::min(static_cast<int64_t>(1024), total_num);
136+
int64_t elements_per_block =
137+
threads_per_block * 50; // each thread deal with 50 data
138+
int64_t blocks_per_grid =
139+
(total_num + elements_per_block - 1) / elements_per_block;
140+
FusedFillIf<T><<<blocks_per_grid, threads_per_block,
141+
(xs_size + 1) * sizeof(int64_t), dev_ctx.stream()>>>(
142+
d_out_addrs, xs_size, d_starts, static_cast<T>(0), found_inf_data);
138143
}
139144
};
140145

0 commit comments

Comments
 (0)