
Commit c66eec7

committed
support num_distribution different multinomial distributions
1 parent 8e14302 commit c66eec7

File tree

3 files changed: +150 -55 lines

paddle/fluid/operators/multinomial_op.cc

Lines changed: 8 additions & 1 deletion
@@ -30,6 +30,7 @@ class MultinomialOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "A tensor contains probabilities of categories");
     AddOutput("Out", "The output tensor of multinomial op");
+    // AddOutput("yokiOut", "yoki");
     AddAttr<int>("num_samples", "number of the generated samples")
         .SetDefault(1);
     AddAttr<bool>("replacement", "can a category be sampled more than once")
@@ -49,7 +50,7 @@ class MultinomialOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext *ctx) const override {
     OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Multinomial");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Multinomial");
+    // OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Multinomial");
 
     auto x_dim = ctx->GetInputDim("X");
     int64_t x_rank = x_dim.size();
@@ -62,6 +63,7 @@ class MultinomialOp : public framework::OperatorWithKernel {
     out_dims[x_rank - 1] = num_samples;
 
     ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
+    // ctx->SetOutputDim("yokiOut", x_dim);
   }
 };
 
@@ -72,11 +74,16 @@ class MultinomialOpKernel<platform::CPUDeviceContext, T>
   void Compute(const framework::ExecutionContext &ctx) const override {
     const auto x = ctx.Input<framework::Tensor>("X");
     auto out = ctx.Output<framework::Tensor>("Out");
+    // auto yokiout = ctx.Output<framework::Tensor>("yokiOut");
    const int64_t num_samples = ctx.Attr<int>("num_samples");
     const bool replacement = ctx.Attr<bool>("replacement");
 
     auto *in_data = x->data<T>();
     auto *out_data = out->mutable_data<T>(ctx.GetPlace());
+    /*auto *yokiout_data = yokiout->mutable_data<T>(ctx.GetPlace());
+    for (int i = 0; i < x->numel(); i++) {
+      yokiout_data[i] = in_data[i];
+    }*/
 
     auto in_dims = x->dims();
     int64_t in_rank = in_dims.size();
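The CPU-side edits here are commented-out debug scaffolding ("yokiOut"); the shape contract of the op is unchanged: leading dimensions of X are kept and the trailing categories dimension becomes num_samples. Below is a minimal standalone sketch of that rule (C++; MultinomialOutShape is a hypothetical helper, not part of the commit):

#include <cstdint>
#include <vector>

// Mirrors `out_dims[x_rank - 1] = num_samples` from InferShape above, e.g.
// X: [num_distributions, num_categories] -> Out: [num_distributions, num_samples]
std::vector<int64_t> MultinomialOutShape(const std::vector<int64_t> &x_dims,
                                         int64_t num_samples) {
  std::vector<int64_t> out_dims(x_dims);
  out_dims[out_dims.size() - 1] = num_samples;
  return out_dims;
}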

paddle/fluid/operators/multinomial_op.cu

Lines changed: 134 additions & 51 deletions
@@ -70,8 +70,31 @@ template <typename T>
 __global__ void NormalizeProbability(T* norm_probs, const T* in_data,
                                      T* sum_rows) {
   // int id = blockIdx.x * blockDim.x + threadIdx.x;
-  int id = threadIdx.x;
-  norm_probs[id] = in_data[id] / sum_rows[0];
+  // int id = threadIdx.x;
+  int id = threadIdx.x + blockIdx.x * blockDim.x +
+           blockIdx.y * gridDim.x * blockDim.x;
+  norm_probs[id] = in_data[id] / sum_rows[blockIdx.y];
+}
+
+template <typename T>
+__global__ void yokiFunc(const T* in_data, T* out) {
+  // int id = blockIdx.x * blockDim.x + threadIdx.x;
+  // int id = threadIdx.x;
+  int id = threadIdx.x + blockIdx.x * blockDim.x +
+           blockIdx.y * gridDim.x * blockDim.x;
+  out[id] = in_data[id];
+}
+
+template <typename T>
+__global__ void Cumsum(T* norm_probs_data, int64_t num_distributions,
+                       int64_t num_categories, T* cumulative_probs) {
+  // int id = blockIdx.x;
+  for (int id = blockIdx.x; id < num_distributions; id += gridDim.x) {
+    thrust::inclusive_scan(thrust::device,
+                           norm_probs_data + id * num_categories,
+                           norm_probs_data + (id + 1) * num_categories,
+                           cumulative_probs + id * num_categories);
+  }
 }
 
 template <typename T>
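The new kernels switch from a single-distribution launch to a 2D grid: blockIdx.y selects the distribution (row) while blockIdx.x and threadIdx.x walk that row's categories, so every probability is divided by its own row sum and Cumsum gives each row its own inclusive prefix sum. The following is a self-contained CUDA sketch of that thread-to-element mapping under simplified assumptions (row-major [num_distributions, num_categories] layout with an explicit bounds check; it is not the commit's exact index arithmetic):

#include <cstdio>
#include <cuda_runtime.h>

// blockIdx.y picks the row; each row is normalized by its own sum.
__global__ void NormalizeRows(float *out, const float *in,
                              const float *row_sums, int num_categories) {
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y;
  if (col < num_categories) {  // guard the tail block
    out[row * num_categories + col] =
        in[row * num_categories + col] / row_sums[row];
  }
}

int main() {
  const int rows = 2, cols = 3;
  float h_in[rows * cols] = {1, 2, 3, 2, 2, 4};
  float h_sums[rows] = {6, 8};
  float *d_in, *d_out, *d_sums;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(h_in));
  cudaMalloc(&d_sums, sizeof(h_sums));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  cudaMemcpy(d_sums, h_sums, sizeof(h_sums), cudaMemcpyHostToDevice);

  dim3 block(cols < 512 ? cols : 512);
  dim3 grid((cols - 1) / block.x + 1, rows);  // same launch shape as above
  NormalizeRows<<<grid, block>>>(d_out, d_in, d_sums, cols);

  float h_out[rows * cols];
  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  for (int i = 0; i < rows * cols; ++i) printf("%.3f\n", h_out[i]);
  cudaFree(d_in);
  cudaFree(d_out);
  cudaFree(d_sums);
  return 0;
}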
@@ -141,21 +164,29 @@ __global__ void sampleMultinomialWithReplacement(
   // global index formula for 2D grid of 1D blocks
   // int idx = blockIdx.y * gridDim.x * blockDim.x + blockIdx.x * blockDim.x +
   // threadIdx.x;
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
 
-  for (int sample = blockIdx.x * blockDim.x + threadIdx.x;
-       sample < totalSamples; sample += blockDim.x * gridDim.x) {
-    // we are losing 3 out of 4 generated numbers but it's ok
-    // this kernel is not very efficient anyway
+  // int idx = blockIdx.x * blockDim.x + threadIdx.x;
 
-    // T uniform_random = dist(rng);
-    T uniform_random = rng[sample];
+  int idx = threadIdx.x + blockIdx.x * blockDim.x +
+            blockIdx.y * gridDim.x * blockDim.x;
 
-    // Find the bucket that a uniform sample lies in
-    int choice = binarySearchForMultinomial<T>(normDistPrefixSum, normDist,
-                                               categories, uniform_random);
+  for (int curDist = blockIdx.y; curDist < distributions;
+       curDist += gridDim.y) {
+    for (int sample = blockIdx.x * blockDim.x + threadIdx.x;
+         sample < totalSamples; sample += blockDim.x * gridDim.x) {
+      // we are losing 3 out of 4 generated numbers but it's ok
+      // this kernel is not very efficient anyway
 
-    dest[sample] = choice;
+      // T uniform_random = dist(rng);
+      T uniform_random = rng[sample + curDist * totalSamples];
+
+      // Find the bucket that a uniform sample lies in
+      int choice = binarySearchForMultinomial<T>(
+          normDistPrefixSum + curDist * categories,
+          normDist + curDist * categories, categories, uniform_random);
+
+      dest[sample + curDist * totalSamples] = choice;
+    }
   }
 }
 
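The rewritten with-replacement kernel keeps the original inverse-CDF scheme: each (distribution, sample) pair draws one uniform number and binary-searches that distribution's inclusive prefix sum for the first bucket whose cumulative mass reaches it; the new curDist = blockIdx.y loop is what adds the batch dimension, offsetting rng, the prefix sums, and dest by the distribution index. A host-side sketch of the sampling step for a single distribution (plain C++, with std::lower_bound standing in for binarySearchForMultinomial):

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <random>
#include <vector>

// Inverse-CDF sampling: draw u ~ U(0,1) and return the first index whose
// cumulative probability is >= u.
int SampleOnce(const std::vector<double> &cumulative, std::mt19937 &rng) {
  std::uniform_real_distribution<double> dist(0.0, 1.0);
  double u = dist(rng);
  return std::lower_bound(cumulative.begin(), cumulative.end(), u) -
         cumulative.begin();
}

int main() {
  std::vector<double> probs = {0.1, 0.2, 0.3, 0.4};  // already normalized
  std::vector<double> cumulative(probs.size());
  std::partial_sum(probs.begin(), probs.end(), cumulative.begin());

  std::mt19937 rng(0);
  std::vector<int> counts(probs.size(), 0);
  for (int s = 0; s < 100000; ++s) ++counts[SampleOnce(cumulative, rng)];
  for (size_t c = 0; c < counts.size(); ++c)
    printf("category %zu: %d\n", c, counts[c]);  // roughly 10k/20k/30k/40k
  return 0;
}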
@@ -167,17 +198,48 @@ class MultinomialOpKernel<platform::CUDADeviceContext, T>
     const auto x = ctx.Input<framework::Tensor>("X");
     auto out = ctx.Output<framework::Tensor>("Out");
 
+    // auto yokiout = ctx.Output<framework::Tensor>("yokiOut");
+
     const int64_t num_samples = ctx.Attr<int>("num_samples");
     const bool replacement = ctx.Attr<bool>("replacement");
 
     auto* in_data = x->data<T>();
     auto* out_data = out->mutable_data<T>(ctx.GetPlace());
+    // auto* yokiout_data = yokiout->mutable_data<T>(ctx.GetPlace());
 
     auto in_dims = x->dims();
     int64_t in_rank = in_dims.size();
     const int64_t num_categories = in_dims[in_rank - 1];
     const int64_t num_distributions = in_rank > 1 ? in_dims[in_rank - 2] : 1;
 
+    if (!replacement) {
+      int in_data_numel = x->numel();
+      int out_data_numel = out->numel();
+      // std::vector<T> cpu_in_data(in_data_numel);
+      // std::vector<T> cpu_out_data(out_data_numel);
+      // T cpu_in_data[in_data_numel];
+      // T cpu_out_data[out_data_numel];
+
+      T* cpu_in_data = new T[in_data_numel];
+      T* cpu_out_data = new T[out_data_numel];
+
+      cudaMemcpy(cpu_in_data, in_data, in_data_numel * sizeof(T),
+                 cudaMemcpyDeviceToHost);
+
+      VLOG(3) << "Print cpu_in_data " << cpu_in_data[0] << "\n";
+      VLOG(3) << "Print in_data_numel " << in_data_numel << "\n";
+      VLOG(3) << "Print out_data_numel " << out_data_numel << "\n";
+
+      MultinomialFunctor<T>(cpu_out_data, cpu_in_data, num_samples, replacement,
+                            num_categories, num_distributions);
+      cudaMemcpy(out_data, cpu_out_data, out_data_numel * sizeof(T),
+                 cudaMemcpyHostToDevice);
+
+      delete[] cpu_in_data;
+      delete[] cpu_out_data;
+      return;
+    }
+
     // std::vector<T> sum_rows(num_distributions);
     // SumArrayCUDAKernel<T>(in_data, sum_rows,)
 
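For replacement=false the CUDA kernel above does not sample on the device at all: it copies the probabilities to the host, reuses the CPU MultinomialFunctor, and copies the drawn samples back. A minimal sketch of that device-to-host round trip (the compute_on_host callback is a hypothetical stand-in for MultinomialFunctor):

#include <cuda_runtime.h>
#include <functional>

// Copy inputs to the host, run a CPU implementation, copy results back.
template <typename T>
void RunOnHostFallback(T *device_out, const T *device_in, int in_numel,
                       int out_numel,
                       const std::function<void(T *, const T *)> &compute_on_host) {
  T *host_in = new T[in_numel];
  T *host_out = new T[out_numel];
  cudaMemcpy(host_in, device_in, in_numel * sizeof(T), cudaMemcpyDeviceToHost);
  compute_on_host(host_out, host_in);  // e.g. the CPU sampling path
  cudaMemcpy(device_out, host_out, out_numel * sizeof(T),
             cudaMemcpyHostToDevice);
  delete[] host_in;
  delete[] host_out;
}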
@@ -188,74 +250,91 @@ class MultinomialOpKernel<platform::CUDADeviceContext, T>
     VLOG(3) << "Print in_rank " << in_rank << "\n";
 
     framework::Tensor sum_rows_t;
-    auto* sum_rows_data = sum_rows_t.mutable_data<T>({1}, ctx.GetPlace());
+    auto* sum_rows_data =
+        sum_rows_t.mutable_data<T>({num_distributions}, ctx.GetPlace());
     // auto* sum_rows_data =
-    // sum_rows_t->mutable_data<T>(framework::make_ddim({1}), ctx.GetPlace());
+    // sum_rows_t->mutable_data<T>(framework::make_ddim({num_distributions}),
+    // ctx.GetPlace());
 
     auto& place = *ctx.template device_context<platform::CUDADeviceContext>()
                        .eigen_device();
 
-    auto eigen_input = framework::EigenVector<T>::Flatten(*x);
-    // auto eigen_sum_rows = framework::EigenVector<T>::From(sum_rows_t);
-    auto eigen_sum_rows = framework::EigenScalar<T>::From(sum_rows_t);
-    eigen_sum_rows.device(place) =
-        eigen_input.sum(Eigen::DSizes<int, 1>(0))
-            .eval()
-            .reshape(Eigen::DSizes<int, 1>(sum_rows_t.dims()[0]));
-    // eigen_sum_rows.device(place) =
-    // eigen_input.sum().eval().reshape(Eigen::DSizes<int, 1>(1));
-
-    dim3 grid(num_distributions);
-    dim3 block(num_categories);
+    if (num_distributions == 1) {
+      auto eigen_input = framework::EigenVector<T>::Flatten(*x);
+      auto eigen_sum_rows = framework::EigenVector<T>::From(sum_rows_t);
+      // auto eigen_sum_rows = framework::EigenScalar<T>::From(sum_rows_t);
+      eigen_sum_rows.device(place) =
+          eigen_input.sum(Eigen::DSizes<int, 1>(1))
+              .eval()
+              .reshape(Eigen::DSizes<int, 1>(sum_rows_t.dims()[0]));
+    } else {
+      auto eigen_input = framework::EigenMatrix<T>::From(*x);
+      // auto eigen_sum_rows = framework::EigenVector<T>::From(sum_rows_t);
+      auto eigen_sum_rows = framework::EigenVector<T>::From(sum_rows_t);
+      eigen_sum_rows.device(place) = eigen_input.sum(Eigen::DSizes<int, 1>(1));
+      // .eval()
+      // .reshape(Eigen::DSizes<int, 1>(sum_rows_t.dims()[0]));
+      // eigen_sum_rows.device(place) =
+      // eigen_input.sum().eval().reshape(Eigen::DSizes<int, 1>(1));
+    }
 
     // std::vector<T> in_data_norm(num_categories);
     framework::Tensor norm_probs_t;
-    auto* norm_probs_data =
-        norm_probs_t.mutable_data<T>({num_categories}, ctx.GetPlace());
+    auto* norm_probs_data = norm_probs_t.mutable_data<T>(
+        {num_distributions, num_categories}, ctx.GetPlace());
+
+    // dim3 grid(num_distributions);
+    // dim3 block(num_categories);
+
+    dim3 block(num_categories < 512 ? num_categories : 512);
+    dim3 grid((num_categories - 1) / block.x + 1, num_distributions);
     NormalizeProbability<
         T><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
         norm_probs_data, in_data, sum_rows_data);
 
     // num_distributions can only be 1.
     // std::vector<T> cumulative_probs(num_categories);
     framework::Tensor cumulative_probs_t;
-    auto* cumulative_probs =
-        cumulative_probs_t.mutable_data<T>({num_categories}, ctx.GetPlace());
+    auto* cumulative_probs = cumulative_probs_t.mutable_data<T>(
+        {num_distributions, num_categories}, ctx.GetPlace());
     // T cumulative_probs[num_categories];
-    int64_t size = num_categories;
-    thrust::inclusive_scan(thrust::device, norm_probs_data,
-                           norm_probs_data + num_categories, cumulative_probs);
+    dim3 block1(1);
+    dim3 grid1(num_distributions);
+    Cumsum<T><<<grid1, block1, 0, ctx.cuda_device_context().stream()>>>(
+        norm_probs_data, num_distributions, num_categories, cumulative_probs);
+
+    /*
+    dim3 block2(num_categories < 512 ? num_categories : 512);
+    dim3 grid2((num_categories-1)/block2.x+1, num_distributions);
+    yokiFunc<T><<<grid2, block2, 0, ctx.cuda_device_context().stream()>>>(
+        cumulative_probs, yokiout_data);*/
+
+    // int64_t size = num_categories;
+    // thrust::inclusive_scan(thrust::device, norm_probs_data,
+    //                        norm_probs_data + num_categories,
+    //                        cumulative_probs);
+
+    VLOG(3) << "Print cumsum " << cumulative_probs << "\n";
 
     if (replacement) {
       dim3 block(128);
       // int grid_y = 1;
-      dim3 grid((num_samples - 1) / block.x + 1);
-
-      /*
-      // std::vector<T> rng(num_samples);
-      T rng[num_samples];
-      std::uniform_real_distribution<T> dist(0, 1);
-      auto gen_ptr = framework::DefaultCPUGenerator();
-      auto engine = gen_ptr->GetCPUEngine();
-
-      for (int s = 0; s < num_samples; s++) {
-        rng[s] = dist(*engine);
-      }
-      */
+      dim3 grid((num_samples - 1) / block.x + 1, num_distributions);
 
       std::random_device rd;
       auto seed = rd();
 
       framework::Tensor rng_data_t;
-      auto* rng_data =
-          rng_data_t.mutable_data<T>({num_samples}, ctx.GetPlace());
+      auto* rng_data = rng_data_t.mutable_data<T>(
+          {num_distributions, num_samples}, ctx.GetPlace());
 
       thrust::counting_iterator<unsigned int> index_sequence_begin(0);
       platform::Transform<platform::CUDADeviceContext> trans;
       auto* context = static_cast<const platform::CUDADeviceContext*>(
           &ctx.device_context());
-      trans(*context, index_sequence_begin, index_sequence_begin + num_samples,
-            rng_data, RandomGeneratorCudaFunctor<T>(seed));
+      trans(*context, index_sequence_begin,
+            index_sequence_begin + num_distributions * num_samples, rng_data,
+            RandomGeneratorCudaFunctor<T>(seed));
 
       VLOG(3) << "Print enter\n";
       // VLOG(3) << "Print size in_data " <<
@@ -267,8 +346,12 @@ class MultinomialOpKernel<platform::CUDADeviceContext, T>
           T><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
           rng_data, num_samples, out_data, num_distributions, num_categories,
           cumulative_probs, norm_probs_data);
+
+      VLOG(3) << "Print end\n" << out_data;
     }
 
+    VLOG(3) << "Print final end\n";
+
     // MultinomialCudaFunctor<T>(out_data, in_data, num_samples, replacement,
     // num_categories, num_distributions);
   }

python/paddle/fluid/tests/unittests/test_multinomial_op.py

Lines changed: 8 additions & 3 deletions
@@ -38,6 +38,7 @@ def init_data(self):
         # input probability is a vector, and replacement is True
         self.input_np = np.random.rand(4)
         self.outputs = {"Out": np.zeros(100000).astype("int64")}
+        # self.outputs = {"yokiOut": np.zeros(4).astype("int64")}
         self.attrs = {"num_samples": 100000, "replacement": True}
 
     def test_check_output(self):
@@ -53,19 +54,21 @@ def verify_output(self, outs):
         # normalize the input to get the probability
         prob = self.input_np / self.input_np.sum(axis=-1, keepdims=True)
         sample_prob = self.sample_output(np.array(outs[0]))
-        print("sample_prob: " + str(sample_prob) + "\nprob: " + str(prob))
+        # sample_prob = np.array(outs[0])
+        # print("input", self.input_np)
+        # print("sample_prob: " + str(sample_prob) + "\nprob: " + str(prob))
         self.assertTrue(
             np.allclose(
                 sample_prob, prob, rtol=0, atol=0.01),
             "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob))
 
 
-"""
 class TestMultinomialOp2(TestMultinomialOp):
     def init_data(self):
         # input probability is a matrix
         self.input_np = np.random.rand(3, 4)
         self.outputs = {"Out": np.zeros((3, 100000)).astype("int64")}
+        # self.outputs = {"yokiOut": np.zeros((3, 4)).astype("int64")}
         self.attrs = {"num_samples": 100000, "replacement": True}
 
     def sample_output(self, out):
@@ -88,11 +91,13 @@ def init_data(self):
 
     def verify_output(self, outs):
         out = np.array(outs[0])
+        # print("op3out", out)
         unique_out = np.unique(out)
         self.assertEqual(
             len(unique_out), 100,
             "replacement is False. categories can't be sampled repeatedly")
-"""
+
+
 """
 class TestReplacementError(unittest.TestCase):
     def init_data(self):
