@@ -70,8 +70,31 @@ template <typename T>
7070__global__ void NormalizeProbability (T* norm_probs, const T* in_data,
7171 T* sum_rows) {
7272 // int id = blockIdx.x * blockDim.x + threadIdx.x;
73- int id = threadIdx .x ;
74- norm_probs[id] = in_data[id] / sum_rows[0 ];
73+ // int id = threadIdx.x;
74+ int id = threadIdx .x + blockIdx .x * blockDim .x +
75+ blockIdx .y * gridDim .x * blockDim .x ;
76+ norm_probs[id] = in_data[id] / sum_rows[blockIdx .y ];
77+ }
78+
79+ template <typename T>
80+ __global__ void yokiFunc (const T* in_data, T* out) {
81+ // int id = blockIdx.x * blockDim.x + threadIdx.x;
82+ // int id = threadIdx.x;
83+ int id = threadIdx .x + blockIdx .x * blockDim .x +
84+ blockIdx .y * gridDim .x * blockDim .x ;
85+ out[id] = in_data[id];
86+ }
87+
88+ template <typename T>
89+ __global__ void Cumsum (T* norm_probs_data, int64_t num_distributions,
90+ int64_t num_categories, T* cumulative_probs) {
91+ // int id = blockIdx.x;
92+ for (int id = blockIdx .x ; id < num_distributions; id += gridDim .x ) {
93+ thrust::inclusive_scan (thrust::device,
94+ norm_probs_data + id * num_categories,
95+ norm_probs_data + (id + 1 ) * num_categories,
96+ cumulative_probs + id * num_categories);
97+ }
7598}
7699
77100template <typename T>
@@ -141,21 +164,29 @@ __global__ void sampleMultinomialWithReplacement(
141164 // global index formula for 2D grid of 1D blocks
142165 // int idx = blockIdx.y * gridDim.x * blockDim.x + blockIdx.x * blockDim.x +
143166 // threadIdx.x;
144- int idx = blockIdx .x * blockDim .x + threadIdx .x ;
145167
146- for (int sample = blockIdx .x * blockDim .x + threadIdx .x ;
147- sample < totalSamples; sample += blockDim .x * gridDim .x ) {
148- // we are losing 3 out of 4 generated numbers but it's ok
149- // this kernel is not very efficient anyway
168+ // int idx = blockIdx.x * blockDim.x + threadIdx.x;
150169
151- // T uniform_random = dist(rng);
152- T uniform_random = rng[sample] ;
170+ int idx = threadIdx . x + blockIdx . x * blockDim . x +
171+ blockIdx . y * gridDim . x * blockDim . x ;
153172
154- // Find the bucket that a uniform sample lies in
155- int choice = binarySearchForMultinomial<T>(normDistPrefixSum, normDist,
156- categories, uniform_random);
173+ for (int curDist = blockIdx .y ; curDist < distributions;
174+ curDist += gridDim .y ) {
175+ for (int sample = blockIdx .x * blockDim .x + threadIdx .x ;
176+ sample < totalSamples; sample += blockDim .x * gridDim .x ) {
177+ // we are losing 3 out of 4 generated numbers but it's ok
178+ // this kernel is not very efficient anyway
157179
158- dest[sample] = choice;
180+ // T uniform_random = dist(rng);
181+ T uniform_random = rng[sample + curDist * totalSamples];
182+
183+ // Find the bucket that a uniform sample lies in
184+ int choice = binarySearchForMultinomial<T>(
185+ normDistPrefixSum + curDist * categories,
186+ normDist + curDist * categories, categories, uniform_random);
187+
188+ dest[sample + curDist * totalSamples] = choice;
189+ }
159190 }
160191}
161192
@@ -167,17 +198,48 @@ class MultinomialOpKernel<platform::CUDADeviceContext, T>
167198 const auto x = ctx.Input <framework::Tensor>(" X" );
168199 auto out = ctx.Output <framework::Tensor>(" Out" );
169200
201+ // auto yokiout = ctx.Output<framework::Tensor>("yokiOut");
202+
170203 const int64_t num_samples = ctx.Attr <int >(" num_samples" );
171204 const bool replacement = ctx.Attr <bool >(" replacement" );
172205
173206 auto * in_data = x->data <T>();
174207 auto * out_data = out->mutable_data <T>(ctx.GetPlace ());
208+ // auto* yokiout_data = yokiout->mutable_data<T>(ctx.GetPlace());
175209
176210 auto in_dims = x->dims ();
177211 int64_t in_rank = in_dims.size ();
178212 const int64_t num_categories = in_dims[in_rank - 1 ];
179213 const int64_t num_distributions = in_rank > 1 ? in_dims[in_rank - 2 ] : 1 ;
180214
215+ if (!replacement) {
216+ int in_data_numel = x->numel ();
217+ int out_data_numel = out->numel ();
218+ // std::vector<T> cpu_in_data(in_data_numel);
219+ // std::vector<T> cpu_out_data(out_data_numel);
220+ // T cpu_in_data[in_data_numel];
221+ // T cpu_out_data[out_data_numel];
222+
223+ T* cpu_in_data = new T[in_data_numel];
224+ T* cpu_out_data = new T[out_data_numel];
225+
226+ cudaMemcpy (cpu_in_data, in_data, in_data_numel * sizeof (T),
227+ cudaMemcpyDeviceToHost);
228+
229+ VLOG (3 ) << " Print cpu_in_data " << cpu_in_data[0 ] << " \n " ;
230+ VLOG (3 ) << " Print in_data_numel " << in_data_numel << " \n " ;
231+ VLOG (3 ) << " Print out_data_numel " << out_data_numel << " \n " ;
232+
233+ MultinomialFunctor<T>(cpu_out_data, cpu_in_data, num_samples, replacement,
234+ num_categories, num_distributions);
235+ cudaMemcpy (out_data, cpu_out_data, out_data_numel * sizeof (T),
236+ cudaMemcpyHostToDevice);
237+
238+ delete[] cpu_in_data;
239+ delete[] cpu_out_data;
240+ return ;
241+ }
242+
181243 // std::vector<T> sum_rows(num_distributions);
182244 // SumArrayCUDAKernel<T>(in_data, sum_rows,)
183245
@@ -188,74 +250,91 @@ class MultinomialOpKernel<platform::CUDADeviceContext, T>
188250 VLOG (3 ) << " Print in_rank " << in_rank << " \n " ;
189251
190252 framework::Tensor sum_rows_t ;
191- auto * sum_rows_data = sum_rows_t .mutable_data <T>({1 }, ctx.GetPlace ());
253+ auto * sum_rows_data =
254+ sum_rows_t .mutable_data <T>({num_distributions}, ctx.GetPlace ());
192255 // auto* sum_rows_data =
193- // sum_rows_t->mutable_data<T>(framework::make_ddim({1}), ctx.GetPlace());
256+ // sum_rows_t->mutable_data<T>(framework::make_ddim({num_distributions}),
257+ // ctx.GetPlace());
194258
195259 auto & place = *ctx.template device_context <platform::CUDADeviceContext>()
196260 .eigen_device ();
197261
198- auto eigen_input = framework::EigenVector<T>::Flatten (*x);
199- // auto eigen_sum_rows = framework::EigenVector<T>::From(sum_rows_t);
200- auto eigen_sum_rows = framework::EigenScalar<T>::From (sum_rows_t );
201- eigen_sum_rows.device (place) =
202- eigen_input.sum (Eigen::DSizes<int , 1 >(0 ))
203- .eval ()
204- .reshape (Eigen::DSizes<int , 1 >(sum_rows_t .dims ()[0 ]));
205- // eigen_sum_rows.device(place) =
206- // eigen_input.sum().eval().reshape(Eigen::DSizes<int, 1>(1));
207-
208- dim3 grid (num_distributions);
209- dim3 block (num_categories);
262+ if (num_distributions == 1 ) {
263+ auto eigen_input = framework::EigenVector<T>::Flatten (*x);
264+ auto eigen_sum_rows = framework::EigenVector<T>::From (sum_rows_t );
265+ // auto eigen_sum_rows = framework::EigenScalar<T>::From(sum_rows_t);
266+ eigen_sum_rows.device (place) =
267+ eigen_input.sum (Eigen::DSizes<int , 1 >(1 ))
268+ .eval ()
269+ .reshape (Eigen::DSizes<int , 1 >(sum_rows_t .dims ()[0 ]));
270+ } else {
271+ auto eigen_input = framework::EigenMatrix<T>::From (*x);
272+ // auto eigen_sum_rows = framework::EigenVector<T>::From(sum_rows_t);
273+ auto eigen_sum_rows = framework::EigenVector<T>::From (sum_rows_t );
274+ eigen_sum_rows.device (place) = eigen_input.sum (Eigen::DSizes<int , 1 >(1 ));
275+ // .eval()
276+ // .reshape(Eigen::DSizes<int, 1>(sum_rows_t.dims()[0]));
277+ // eigen_sum_rows.device(place) =
278+ // eigen_input.sum().eval().reshape(Eigen::DSizes<int, 1>(1));
279+ }
210280
211281 // std::vector<T> in_data_norm(num_categories);
212282 framework::Tensor norm_probs_t ;
213- auto * norm_probs_data =
214- norm_probs_t .mutable_data <T>({num_categories}, ctx.GetPlace ());
283+ auto * norm_probs_data = norm_probs_t .mutable_data <T>(
284+ {num_distributions, num_categories}, ctx.GetPlace ());
285+
286+ // dim3 grid(num_distributions);
287+ // dim3 block(num_categories);
288+
289+ dim3 block (num_categories < 512 ? num_categories : 512 );
290+ dim3 grid ((num_categories - 1 ) / block.x + 1 , num_distributions);
215291 NormalizeProbability<
216292 T><<<grid, block, 0 , ctx.cuda_device_context().stream()>>> (
217293 norm_probs_data, in_data, sum_rows_data);
218294
219295 // num_distributions can only be 1.
220296 // std::vector<T> cumulative_probs(num_categories);
221297 framework::Tensor cumulative_probs_t ;
222- auto * cumulative_probs =
223- cumulative_probs_t . mutable_data <T>({ num_categories}, ctx.GetPlace ());
298+ auto * cumulative_probs = cumulative_probs_t . mutable_data <T>(
299+ {num_distributions, num_categories}, ctx.GetPlace ());
224300 // T cumulative_probs[num_categories];
225- int64_t size = num_categories;
226- thrust::inclusive_scan (thrust::device, norm_probs_data,
227- norm_probs_data + num_categories, cumulative_probs);
301+ dim3 block1 (1 );
302+ dim3 grid1 (num_distributions);
303+ Cumsum<T><<<grid1, block1, 0 , ctx.cuda_device_context().stream()>>> (
304+ norm_probs_data, num_distributions, num_categories, cumulative_probs);
305+
306+ /*
307+ dim3 block2(num_categories < 512 ? num_categories : 512);
308+ dim3 grid2((num_categories-1)/block2.x+1, num_distributions);
309+ yokiFunc<T><<<grid2, block2, 0, ctx.cuda_device_context().stream()>>>(
310+ cumulative_probs, yokiout_data);*/
311+
312+ // int64_t size = num_categories;
313+ // thrust::inclusive_scan(thrust::device, norm_probs_data,
314+ // norm_probs_data + num_categories,
315+ // cumulative_probs);
316+
317+ VLOG (3 ) << " Print cumsum " << cumulative_probs << " \n " ;
228318
229319 if (replacement) {
230320 dim3 block (128 );
231321 // int grid_y = 1;
232- dim3 grid ((num_samples - 1 ) / block.x + 1 );
233-
234- /*
235- // std::vector<T> rng(num_samples);
236- T rng[num_samples];
237- std::uniform_real_distribution<T> dist(0, 1);
238- auto gen_ptr = framework::DefaultCPUGenerator();
239- auto engine = gen_ptr->GetCPUEngine();
240-
241- for (int s = 0; s < num_samples; s++) {
242- rng[s] = dist(*engine);
243- }
244- */
322+ dim3 grid ((num_samples - 1 ) / block.x + 1 , num_distributions);
245323
246324 std::random_device rd;
247325 auto seed = rd ();
248326
249327 framework::Tensor rng_data_t ;
250- auto * rng_data =
251- rng_data_t . mutable_data <T>({ num_samples}, ctx.GetPlace ());
328+ auto * rng_data = rng_data_t . mutable_data <T>(
329+ {num_distributions, num_samples}, ctx.GetPlace ());
252330
253331 thrust::counting_iterator<unsigned int > index_sequence_begin (0 );
254332 platform::Transform<platform::CUDADeviceContext> trans;
255333 auto * context = static_cast <const platform::CUDADeviceContext*>(
256334 &ctx.device_context ());
257- trans (*context, index_sequence_begin, index_sequence_begin + num_samples,
258- rng_data, RandomGeneratorCudaFunctor<T>(seed));
335+ trans (*context, index_sequence_begin,
336+ index_sequence_begin + num_distributions * num_samples, rng_data,
337+ RandomGeneratorCudaFunctor<T>(seed));
259338
260339 VLOG (3 ) << " Print enter\n " ;
261340 // VLOG(3) << "Print size in_data " <<
@@ -267,8 +346,12 @@ class MultinomialOpKernel<platform::CUDADeviceContext, T>
267346 T><<<grid, block, 0 , ctx.cuda_device_context().stream()>>> (
268347 rng_data, num_samples, out_data, num_distributions, num_categories,
269348 cumulative_probs, norm_probs_data);
349+
350+ VLOG (3 ) << " Print end\n " << out_data;
270351 }
271352
353+ VLOG (3 ) << " Print final end\n " ;
354+
272355 // MultinomialCudaFunctor<T>(out_data, in_data, num_samples, replacement,
273356 // num_categories, num_distributions);
274357 }
0 commit comments