/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/gumbel_softmax_op.h"

#if defined(__NVCC__) || defined(__HIPCC__)
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include <thrust/transform.h>
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/memory/memcpy.h"

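// CUDA/HIP helpers for the gumbel_softmax operator: Thrust samples the uniform
// noise that is turned into Gumbel(0, 1) noise, and a cub block reduction
// computes the argmax needed for the "hard" (straight-through one-hot) output.
// The softmax over the perturbed logits itself lives in the shared kernel from
// gumbel_softmax_op.h; this header only supplies the device-specific
// GumbleNoiseGenerator and OneHotGenerator specializations it dispatches to.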
namespace paddle {
namespace operators {

template <typename K, typename V>
using KeyValuePair = cub::KeyValuePair<K, V>;

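// Counter-based Thrust functor mapping an index n to a uniform sample in
// [min_, max_). Each call seeds its own minstd_rand and discards n + offset_
// states, so the sequence is reproducible and independent of thread order.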
template <typename T>
struct UniformCUDAGenerator {
  T min_, max_;
  unsigned int seed_;
  unsigned int offset_ = 0;
  HOSTDEVICE UniformCUDAGenerator(T min, T max, unsigned int seed)
      : min_(min), max_(max), seed_(seed) {}
  HOSTDEVICE UniformCUDAGenerator(T min, T max, unsigned int seed,
                                  unsigned int offset)
      : min_(min), max_(max), seed_(seed), offset_(offset) {}

  HOSTDEVICE T operator()(const unsigned int n) const {
    thrust::minstd_rand rng;
    rng.seed(seed_);
    thrust::uniform_real_distribution<T> dist(min_, max_);
    rng.discard(n + offset_);
    return dist(rng);
  }
};

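// Writes a one-hot encoding of the argmax along the reduced axis. The input is
// viewed as [size_to_axis, width, size_out_axis]; each block handles one
// (outer, inner) pair out of `height` = size_to_axis * size_out_axis rows and
// reduces that row's `width` entries to an argmax with cub::BlockReduce.
// `out` is expected to be zero-filled before launch; only the winning index
// is set to 1.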
template <typename T, size_t BlockDim>
__global__ void OneHotCUDAKernel(const int64_t height, const int64_t width,
                                 const int64_t size_out_axis, const T init,
                                 const T* in, T* out) {
  typedef cub::BlockReduce<KeyValuePair<int, T>, BlockDim> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  for (int64_t idx = blockIdx.x; idx < height; idx += gridDim.x) {
    KeyValuePair<int, T> kv_pair = {-1, init};
    int h = idx / size_out_axis;
    int w = idx % size_out_axis;
    cub::ArgMax reducer;
    for (int k = threadIdx.x; k < width; k += blockDim.x) {
      kv_pair = reducer(
          {k, in[h * width * size_out_axis + k * size_out_axis + w]}, kv_pair);
    }
    kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer);
    if (threadIdx.x == 0) {
      int index = static_cast<int>(kv_pair.key);
      out[h * width * size_out_axis + index * size_out_axis + w] = 1;
    }
    __syncthreads();
  }
}

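// CUDA specialization used for the "hard" (straight-through) variant: the soft
// softmax result currently stored in Out is copied aside, Out is zeroed, and
// OneHotCUDAKernel scatters a 1 at the argmax position of every row along
// `axis`.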
template <typename T>
struct OneHotGenerator<platform::CUDADeviceContext, T> {
  static void Transform(const platform::CUDADeviceContext& context,
                        const Tensor& X, Tensor* Out, int axis) {
    const int size_to_axis = SizeToAxis(axis, X.dims());
    const int size_from_axis = SizeFromAxis(axis, X.dims());
    const int size_out_axis = SizeOutAxis(axis, X.dims());
    constexpr int thread_size = 512;
    int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize().x;
    int64_t height = size_to_axis * size_out_axis;
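    // Despite its name, block_size below is the grid x-dimension (number of
    // blocks). It is clamped to the device limit; the kernel's grid-stride
    // loop covers any rows beyond it.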
    int block_size = height < max_grid_dimx ? height : max_grid_dimx;

    Tensor input_tensor;
    input_tensor.mutable_data<T>(Out->dims(), platform::CUDAPlace());
    TensorCopy(*Out, context.GetPlace(), &input_tensor);
    math::set_constant(context, Out, 0.0);
    OneHotCUDAKernel<
        T, thread_size><<<block_size, thread_size, 0, context.stream()>>>(
        height, size_from_axis / size_out_axis, size_out_axis,
        std::numeric_limits<T>::lowest(), input_tensor.data<T>(),
        Out->data<T>());
  }
};

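// Grid-stride kernel: converts a uniform sample u in (0, 1] to a Gumbel(0, 1)
// sample via -log(-log(u)), adds it to the corresponding logit, and scales the
// result by 1 / temperature.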
template <typename T>
__global__ void AddGumbelNoiseCUDAKernel(const T* input_data, T* output_data,
                                         T* noise, const float temperature,
                                         int64_t n) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int step = blockDim.x * gridDim.x;
  for (int64_t i = index; i < n; i += step) {
    T gumbel_noise = -log(-log(noise[i]));
    output_data[i] = (gumbel_noise + input_data[i]) / temperature;
  }
}

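// CUDA specialization of the noise generator (the "Gumble" spelling matches
// the primary template this specializes). Uniform noise is drawn with Thrust
// using a lower bound of 1e-5 so log() never sees 0, then
// AddGumbelNoiseCUDAKernel perturbs the logits on the operator's stream.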
template <typename T>
struct GumbleNoiseGenerator<platform::CUDADeviceContext, T> {
  static void Transform(const platform::CUDADeviceContext& context,
                        const T* input_data, T* output_data, int size_to_axis,
                        int size_from_axis, const float temperature) {
    Tensor random_tensor;
    int64_t size = size_to_axis * size_from_axis;
    T* random_data =
        random_tensor.mutable_data<T>({size}, platform::CUDAPlace());
    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
    const unsigned int seed = std::random_device()();

    // generate gumbel noise
    int device_id =
        BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId();
    auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
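    // If the per-device generator was seeded from the Python side, consume an
    // offset from it so repeated calls yield distinct but reproducible noise;
    // otherwise fall back to a fresh std::random_device seed.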
    if (gen_cuda->GetIsInitPy()) {
      auto seed_offset = gen_cuda->IncrementOffset(1);
      int gen_offset = size * seed_offset.second;
      thrust::transform(
          index_sequence_begin, index_sequence_begin + size,
          thrust::device_ptr<T>(random_data),
          UniformCUDAGenerator<T>(0.00001, 1, seed_offset.first, gen_offset));
    } else {
      thrust::transform(index_sequence_begin, index_sequence_begin + size,
                        thrust::device_ptr<T>(random_data),
                        UniformCUDAGenerator<T>(0.00001, 1, seed));
    }

    // add gumbel noise to X
    const int thread_size = 512;
    int64_t block_size = (size + thread_size) / thread_size;
    AddGumbelNoiseCUDAKernel<
        T><<<block_size, thread_size, 0, context.stream()>>>(
        input_data, output_data, random_data, temperature, size);
  }
};

#endif
}  // namespace operators
}  // namespace paddle

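// Register the forward and backward CUDA kernels; only float and double are
// covered on the GPU path.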
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
    gumbel_softmax, ops::GumbelSoftmaxKernel<plat::CUDADeviceContext, float>,
    ops::GumbelSoftmaxKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
    gumbel_softmax_grad,
    ops::GumbelSoftmaxGradKernel<plat::CUDADeviceContext, float>,
    ops::GumbelSoftmaxGradKernel<plat::CUDADeviceContext, double>);