-
Notifications
You must be signed in to change notification settings - Fork 5.9k
add a fusion op: fused_residual_dropout_bias #34963
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
bf318b8
507117a
462caa1
93e0638
e2808ff
036b430
4d33b98
bd44d04
d2beab7
5d2bbc8
934fcac
44610ea
3133d33
1a83adb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,70 @@ | ||
| /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. | ||
|
||
| Licensed under the Apache License, Version 2.0 (the "License"); | ||
| you may not use this file except in compliance with the License. | ||
| You may obtain a copy of the License at | ||
| http://www.apache.org/licenses/LICENSE-2.0 | ||
| Unless required by applicable law or agreed to in writing, software | ||
| distributed under the License is distributed on an "AS IS" BASIS, | ||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| See the License for the specific language governing permissions and | ||
| limitations under the License. */ | ||
|
|
||
| #pragma once | ||
|
|
||
#include <cooperative_groups.h>
#include <cuda.h>
#include <curand_kernel.h>

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <memory>
#include <utility>

#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
|
|
||
| namespace paddle { | ||
| namespace operators { | ||
|
|
||
| /** | ||
| * get 1D threads and blocks | ||
| */ | ||
| template <int VecSize = 4> | ||
| inline std::pair<uint32_t, uint32_t> Get1DThreadsAndBlocks( | ||
| const platform::CUDADeviceContext &ctx, const uint64_t n) { | ||
| const uint64_t tmp_n = n / VecSize; | ||
| int threads = std::max( | ||
| (uint64_t)32, std::min(tmp_n, (uint64_t)ctx.GetMaxThreadsPerBlock())); | ||
| int blocks = std::max((uint64_t)1, (tmp_n + threads - 1) / threads); | ||
| return std::pair<uint32_t, uint32_t>{threads, blocks}; | ||
| } | ||
|
|
||
| /** | ||
| * get the threads for fused_residual_dropout_bias: | ||
| * 1D blocks: blockDim.x = cols | ||
| * 2D grids: gridDim.y = rows | ||
| */ | ||
| template <int VecSize = 4> | ||
| inline std::pair<dim3, dim3> Get1DBlocksAnd2DGrids( | ||
| const platform::CUDADeviceContext &ctx, const uint32_t rows, | ||
| const uint32_t cols) { | ||
| const uint32_t tmp_cols = cols / VecSize; | ||
| int threads = std::max( | ||
| (uint32_t)32, std::min(tmp_cols, (uint32_t)ctx.GetMaxThreadsPerBlock())); | ||
| int blocks_x = std::max((uint32_t)1, (tmp_cols + threads - 1) / threads); | ||
| int blocks_y = std::max((uint32_t)1, rows); | ||
| dim3 block_dim(threads, 1, 1); | ||
| dim3 grid_dim(blocks_x, blocks_y, 1); | ||
| return std::pair<dim3, dim3>{block_dim, grid_dim}; | ||
| } | ||
|
|
||
// Aligned vector: forces VecSize consecutive elements of T to be stored as
// one unit aligned to sizeof(T) * VecSize, so the CUDA compiler can emit
// vectorized load/store instructions (e.g. a 16-byte ld.v4 for
// T = float, VecSize = 4) instead of VecSize scalar accesses.
template <typename T, int VecSize>
struct alignas(sizeof(T) * VecSize) AlignedVector {
  // alignas requires a power-of-two alignment; an odd factor in VecSize
  // makes sizeof(T) * VecSize non-power-of-two and the program ill-formed.
  // Fail with a clear message instead of a cryptic compiler diagnostic.
  static_assert(VecSize > 0 && (VecSize & (VecSize - 1)) == 0,
                "VecSize must be a positive power of two");
  T val[VecSize];
};
|
|
||
| } // namespace operators | ||
| } // namespace paddle | ||
Uh oh!
There was an error while loading. Please reload this page.