
Commit ed2ed4d (2 parents: 2332df4 + 4db423c)

Merge pull request PaddlePaddle#33 from NVIDIA/cutlass_1.2

CUTLASS 1.2

1,162 files changed: 23,921 additions, 4,575 deletions


CHANGELOG.md

Lines changed: 7 additions & 3 deletions
@@ -1,7 +1,11 @@
 # NVIDIA CUTLASS Changelog
 
+## [1.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v1.2.0) (2018-10-26)
+* Parallelized reductions across threadblocks ("Split-K")
+* Improved IGEMM performance
+* Batched strided WMMA GEMMs
 
-## 1.1.0 (2018-09-19)
+## [1.1.0](https://github.com/NVIDIA/cutlass/releases/tag/v1.1.0) (2018-09-19)
 * Turing Features
 * WMMA GEMM targeting TensorCores - INT8, INT4, 1-bit
 * Batched Strided GEMM
@@ -16,13 +20,13 @@
 * Examples
 * Basic GEMM, tensor views, CUTLASS utilities, batched GEMM, WMMA GEMM
 
-## 1.0.1 (2018-06-11)
+## [1.0.1](https://github.com/NVIDIA/cutlass/releases/tag/v1.0.1) (2018-06-11)
 
 * Intra-threadblock reduction added for small threadblock tile sizes
 * sgemm_64x128x16, sgemm_128x128x16, sgemm_128x64x16, sgemm_128x32x16, sgemm_64x64x16, sgemm_64x32x16
 * igemm_32x32x128
 * GEMM _K_ residue handled during prologue prior to mainloop
-* Replaced Google Test copy with submodule. Use `git submodule init`
+* Replaced Google Test copy with submodule. Use `git submodule init --recursive --update`
 
 ## [1.0.0](https://github.com/NVIDIA/cutlass/commit/2028ebe120aab22bfd0b2baf8902d4c9627eb33f) (2018-05-16)

CMakeLists.txt

Lines changed: 9 additions & 0 deletions
@@ -141,6 +141,10 @@ else()
   string(APPEND NVCC_FLAGS " -lineinfo")
 endif()
 
+if (UNIX)
+  string(APPEND NVCC_FLAGS " -Xcompiler -Wconversion")
+endif()
+
 string(APPEND NVCC_FLAGS_DEBUG " -g")
 string(APPEND NVCC_FLAGS_RELWITHDEBINFO " -O3")
 string(APPEND NVCC_FLAGS_RELEASE " -O3")
@@ -169,6 +173,8 @@ file(GLOB CUTLASS_GEMM RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/gemm/*.h)
 file(GLOB CUTLASS_UTIL RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/util/*.h)
 file(GLOB CUTLASS_DEVICE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/device/*.h)
 file(GLOB CUTLASS_CORE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/*.h)
+file(GLOB CUTLASS_REDUCTION RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/reduction/*.h )
+
 ###################################################################################################
 #
 # Define build targets
@@ -178,6 +184,7 @@ file(GLOB CUTLASS_CORE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/*.h)
 source_group("cutlass\\gemm" FILES ${CUTLASS_GEMM})
 source_group("cutlass\\util" FILES ${CUTLASS_UTIL})
 source_group("cutlass\\device" FILES ${CUTLASS_DEVICE})
+source_group("cutlass\\reduction" FILES ${CUTLASS_REDUCTION})
 source_group("cutlass" FILES ${CUTLASS_CORE})
 
 add_library(CUTLASS INTERFACE)
@@ -187,6 +194,7 @@ target_sources(CUTLASS INTERFACE
   ${CUTLASS_UTIL}
   ${CUTLASS_DEVICE}
   ${CUTLASS_CORE}
+  ${CUTLASS_REDUCTION}
 )
 
 target_include_directories(CUTLASS INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
@@ -197,6 +205,7 @@ add_custom_target(cutlass_ide SOURCES
   ${CUTLASS_UTIL}
   ${CUTLASS_DEVICE}
   ${CUTLASS_CORE}
+  ${CUTLASS_REDUCTION}
 )
 # Doxygen is available. Generate documentation
 if (DOXYGEN_FOUND)

CUTLASS.md

Lines changed: 52 additions & 12 deletions
@@ -9,6 +9,7 @@ CUTLASS core components, and to identify their role in implementing GEMM computa
 2. [General Matrix Multiply](#S-general-matrix-multiply)
 3. [Core Components](#S-core-components)
 4. [Utilities](#S-utilities)
+5. [Optimization Strategies](#S-optimization-strategies)
 
 # <a name="S-design-patterns"></a> 1. Design Patterns

@@ -26,7 +27,7 @@ objectives. This section is intended to provide more detail.
 
 ## <a name="S-patterns-sequencing-nesting"></a> Sequencing and Nesting of Collective Primitives
 
-CUTLASS embodies a design paradigm exemplified by the [CUB library](https://nvlabs.github.io/cub/) for expressing collective operations. Objects expose an interface for a problem that is then decomposed into concurrent subtasks executed by cooperating threadblocks, warps, and threads. For example, a grid-level object may be constructed with base pointers to the start of a GEMM operation, add a threadblock-dependent offset to partition the problem, and then compute a per-threadblock GEMM. This in turn performs some operations as a collection of cooperating threads, while it may partition other parts of the task into warp-level subtasks.
+CUTLASS embodies a design paradigm exemplified by the [CUB library](https://nvlabs.github.io/cub/) for expressing collective operations. Objects expose an interface for a problem that is then decomposed into concurrent subtasks executed by cooperating threadblocks, warps, and threads. For example, a grid-level object may be constructed with base pointers to the start of a GEMM operation, add a threadblock-dependent offset to partition the problem, and then compute a per-threadblock GEMM. This in turn performs some operations as a collection of cooperating threads, while it may partition other parts of the task into warp-level subtasks.
 
 ## <a name="S-patterns-tiles-iterators"></a> Tiles and Iterators
 
@@ -48,7 +49,7 @@ CUTLASS can take advantage of this CUDA grid-invariant property by constructing
 
 The design pattern in CUTLASS is for classes with nontrivial constructors to define `struct Params` as an inner class which contains grid-invariant state. These should define a constructor and an `initialize()` method. The `Params` structure should also include a data member corresponding to each data member in the parent class, so these too can be properly constructed in host code. The parent class should define a constructor which accepts `Params const &` as its first argument.
 
-For example, `cutlass::gemm::Gemm<>` should define `struct cutlass::gemm::Gemm::Params`. The latter should define data members for each data member in `cutlass::gemm::Gemm<>`.
+For example, `cutlass::gemm::Gemm<>` should define `struct cutlass::gemm::Gemm::Params`. The latter should define data members for each data member in `cutlass::gemm::Gemm<>`.
 
 ## <a name="S-patterns-composable-shared-memory"></a> Composable shared memory allocation
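
As an aside to the hunk above: a minimal sketch of the `Params` pattern it describes might look like the following. `MyKernel` and its members are hypothetical stand-ins compiled with nvcc, not the actual `cutlass::gemm::Gemm<>` definition; only the inner-`Params`/`initialize()`/`Params const &` convention comes from the text.

```cpp
// Hypothetical kernel-scope class following the Params design pattern
// described above; the names are illustrative, not CUTLASS source.
struct MyKernel {

  /// Grid-invariant state, constructed and initialized in host code.
  struct Params {
    float const *ptr_A;  // mirrors MyKernel::ptr_A
    int lda;             // mirrors MyKernel::lda

    Params() : ptr_A(nullptr), lda(0) {}

    /// Host-side initialization from the problem description.
    int initialize(float const *ptr_A_, int lda_) {
      ptr_A = ptr_A_;
      lda = lda_;
      return 0;
    }
  };

  float const *ptr_A;
  int lda;

  /// Device-side construction from grid-invariant parameters, per the
  /// convention that Params const & is the first constructor argument.
  __device__ MyKernel(Params const &params)
      : ptr_A(params.ptr_A), lda(params.lda) {}
};
```

Because `Params` holds only grid-invariant state, the host constructs and initializes it once and passes it by value at kernel launch; each threadblock then builds its own `MyKernel` instance from it.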
@@ -94,7 +95,7 @@ multiply operation performed by each iteration of the mainloop is referred to as
 
 The threadblock loads a sequence of tiles from global memory and stores this data to shared memory. The iterative
 access and traversal of tiles in global memory are performed by a _TileLoadIterator_, and storing to a circular
-buffer in shared memory is performed by a _GlobalLoadIterator_.
+buffer in shared memory is performed by a _GlobalLoadIterator_.
 
 **[Global Load Stream](cutlass/gemm/gemm_global_stream.h)** manages loading of the threadblock-scope multiplicands to the GEMM kernel. It owns an iterator into global memory for loading tiles of data, a TensorAllocation in shared memory to hold the resulting tile, and an iterator for writing the tile into this allocation. A transformer exists to optionally transform the data as it is loaded, which may be of use to perform type conversion or, in the case of int8 GEMM, transpose 4x4 tiles held in registers.
 
@@ -109,24 +110,24 @@ The Global Load Stream template contains members defined by the following templa
 The threadblock's _OutputTile_ is partitioned among the warps, and each computes a warp-level matrix product.
 Data is loaded from shared memory into registers, and math instructions are dispatched to CUDA Cores or Tensor Cores.
 
-[**Shared Load Stream**](cutlass/gemm/gemm_shared_stream.h) manages loading of warp-level multiplicands from shared memory into registers. This owns an iterator for fetching data and the destination fragments for holding the results.
+[**Shared Load Stream**](cutlass/gemm/gemm_shared_stream.h) manages loading of warp-level multiplicands from shared memory into registers. This owns an iterator for fetching data and the destination fragments for holding the results.
 
 * [GemmSharedLoadTile{A,B}](cutlass/gemm/gemm_shared_tile.h)
 
-**Matrix Multiply** computes a matrix product operation on data held in registers. Specializations exist for thread-level instructions such as single-precision fused multiply-add as well as warp-level matrix operations targeting TensorCores.
+**Matrix Multiply** computes a matrix product operation on data held in registers. Specializations exist for thread-level instructions such as single-precision fused multiply-add as well as warp-level matrix operations targeting TensorCores.
 
 * [WMMA Multiply Add](cutlass/gemm/wmma_gemm_multiply_add.h)
 
 ## Thread-level GEMM
 
 SGEMM, IGEMM, HGEMM, and DGEMM are computed by SIMT math instructions issued by thread-level matrix multiply
-procedures.
+procedures.
 
 * [ThreadMultiplyAdd](cutlass/gemm/thread_multiply_add.h)
 * [IGEMM specialization](cutlass/gemm/igemm_multiply_add.h)
 * [HGEMM specialization](cutlass/gemm/hgemm_multiply_add.h)
 
-## Epilogue
+## Epilogue
 
 The [**epilogue**](cutlass/gemm/gemm_epilogue.h) iteratively selects a subset of accumulator elements held by a warp, writes them to shared memory, and loads them by different threads such that a threadblock-scoped tile store operation will make contiguous, striped accesses to global memory. Thus, the flow of data utilizes the following components:
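
To make the structure described in the two hunks above concrete, here is a deliberately plain CUDA sketch of the same decomposition: a mainloop staging tiles of A and B through shared memory, a thread-level multiply-accumulate, and an epilogue writing accumulators to global memory. This is an illustration only, not CUTLASS source; CUTLASS replaces each stage with the iterator, stream, and epilogue components named above.

```cpp
// Schematic row-major SGEMM: C = A * B. Launch with block(16, 16) and
// grid((N + 15) / 16, (M + 15) / 16). Illustrates the decomposition only.
constexpr int kTile = 16;

__global__ void sgemm_schematic(int M, int N, int K,
                                float const *A,   // M-by-K
                                float const *B,   // K-by-N
                                float *C) {       // M-by-N
  // Threadblock-scope tiles, analogous to the shared memory allocations
  // owned by the global load streams.
  __shared__ float tile_A[kTile][kTile];
  __shared__ float tile_B[kTile][kTile];

  int row = blockIdx.y * kTile + threadIdx.y;
  int col = blockIdx.x * kTile + threadIdx.x;
  float accum = 0.0f;

  // Mainloop: each iteration advances one tile along the GEMM K dimension.
  for (int k0 = 0; k0 < K; k0 += kTile) {
    // "Global load stream": guarded loads from global to shared memory.
    tile_A[threadIdx.y][threadIdx.x] =
        (row < M && k0 + threadIdx.x < K) ? A[row * K + k0 + threadIdx.x] : 0.0f;
    tile_B[threadIdx.y][threadIdx.x] =
        (k0 + threadIdx.y < K && col < N) ? B[(k0 + threadIdx.y) * N + col] : 0.0f;
    __syncthreads();

    // "Shared load stream" plus thread-level multiply-add.
    for (int kk = 0; kk < kTile; ++kk) {
      accum += tile_A[threadIdx.y][kk] * tile_B[kk][threadIdx.x];
    }
    __syncthreads();
  }

  // "Epilogue": store accumulators to global memory.
  if (row < M && col < N) {
    C[row * N + col] = accum;
  }
}
```

Every hard-coded choice here (tile size, one accumulator per thread, the trivial store) is what CUTLASS generalizes into template parameters and the components listed above.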

@@ -227,7 +228,7 @@ must specify compile-time constant tile sizes.
 ## <a name="S-core-tile-structure"></a> Tile Structure
 
 Tiled structures express an arrangement of data in memory as well as a logical mapping of concurrent CUDA
-threads to the problem space. For example, the CUTLASS GEMM
+threads to the problem space. For example, the CUTLASS GEMM
 
 Tiled structures can be defined using the `cutlass::TileTraits<>` concept which defines the following
 members. Collectively, these members offer a flexible way to define a 4-D subpartition of an integer
@@ -286,7 +287,7 @@ the next item in sequence.
 <img src="/media/images/cutlass-tile-iteration.png" alt="CUTLASS tile access and traversal" width="50%" />
 
 To offer a generic solution that spans numerous data types and layouts, CUTLASS defines the _TileIterator_ concept.
-This concept provides access to a sequence of _tiles_ embedded in a tensor in addressable memory.
+This concept provides access to a sequence of _tiles_ embedded in a tensor in addressable memory.
 
 The canonical CUTLASS tile iterator template is defined in [cutlass/tile_iterator.h](cutlass/tile_iterator.h).
 
@@ -296,9 +297,9 @@ A fragment is analogous to `std::array<>` in that it is a constant-sized array o
 
 ## <a name="S-core-predicate-vector"></a> Predicate Vector
 
-SIMT architectures utilize predicated execution in place of control flow when conditional code sequences are fairly short, on the order of a few machine instructions. While CUDA C++ does not include constructs at the language level for predication, PTX makes this explicit, and compilation to SASS is assumed to aggressively utilize predication. Typical applications are to initialize a sequence of bits used to mask memory operations and use these bits as predicates guarding memory load and store instructions.
+SIMT architectures utilize predicated execution in place of control flow when conditional code sequences are fairly short, on the order of a few machine instructions. While CUDA C++ does not include constructs at the language level for predication, PTX makes this explicit, and compilation to SASS is assumed to aggressively utilize predication. Typical applications are to initialize a sequence of bits used to mask memory operations and use these bits as predicates guarding memory load and store instructions.
 
-CUTLASS provides `PredicateVector` defined in [cutlass/predicate_vector.h](cutlass/predicate_vector.h) to manage a statically-sized bit vector, store them into general purpose registers, and efficiently access them in sequence. By storing four predicates per byte in hardware registers, the CUDA compiler is able to issue specialized instructions to achieve very efficient unpacking.
+CUTLASS provides `PredicateVector` defined in [cutlass/predicate_vector.h](cutlass/predicate_vector.h) to manage a statically-sized bit vector, store them into general purpose registers, and efficiently access them in sequence. By storing four predicates per byte in hardware registers, the CUDA compiler is able to issue specialized instructions to achieve very efficient unpacking.
 
 
 # <a name="S-utilities"></a> 4. Utilities
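
The predication idea above can be sketched in plain CUDA without the `PredicateVector` API itself; the real class's packing scheme and interface live in cutlass/predicate_vector.h, and the snippet below only mirrors the concept of computing guard bits once and reusing them across an unrolled access sequence.

```cpp
// Conceptual sketch only: guard bits are computed up front, packed into a
// register-resident word, and tested before each load so the compiler can
// lower the tests to predicated instructions rather than branches.
__device__ void guarded_load(float const *ptr, int valid_count, float frag[4]) {
  unsigned guard_bits = 0;

  // Initialize all guard bits once (one bit per access).
  #pragma unroll
  for (int i = 0; i < 4; ++i) {
    if (i < valid_count) {
      guard_bits |= (1u << i);
    }
  }

  // Reuse the guard bits for each memory access in the sequence.
  #pragma unroll
  for (int i = 0; i < 4; ++i) {
    frag[i] = (guard_bits & (1u << i)) ? ptr[i] : 0.0f;
  }
}
```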
@@ -310,6 +311,46 @@ framework offering features such as:
 * Components for allocating and initializing [host-side and device-side tensors](tools/util/host_tensor.h) usable by CUTLASS
 * Reference implementations of [GEMM](tools/util/reference/host/gemm.h) and [element-wise operations](tools/util/reference/host/tensor_elementwise.h)
 
+
+# <a name="S-optimization-strategies"></a> 5. Optimization Strategies
+
+This section describes several strategies taken to increase performance beyond what is achievable with
+a basic implementation of the hierarchical GEMM structure.
+
+
+## Threadblock Rasterization
+
+To maximize reuse of data held in the last level cache, CUTLASS defines several functions to
+affect the mapping of threadblocks to logical partitions of the GEMM problem. These map
+consecutively launched threadblocks to packed two-dimensional regions of the partitioned GEMM
+problem to increase the probability that these will access the same tiles of global memory at
+approximately the same time.
+
+Several functions are defined in [cutlass/gemm/threadblock_swizzle.h](cutlass/gemm/threadblock_swizzle.h).
+
+
+## Parallel Reductions across GEMM _K_
+
+Matrix product computations expose parallelism among _O(MN)_ independent inner product
+computations. For sufficiently large problem sizes, a GEMM kernel in CUTLASS may approach
+the theoretical maximum computational throughput. For small problems, however, there are
+too few threadblocks to efficiently occupy the entire GPU.
+
+As a recourse, parallelizing the reduction performed during the inner product computation
+enables more threadblocks to execute concurrently while still taking advantage of the throughput
+benefits of large threadblock-level GEMM tiles.
+
+CUTLASS implements parallel reductions across threadblocks by partitioning the GEMM _K_ dimension
+and launching an additional set of threadblocks for each partition. Consequently, we refer to
+this strategy within CUTLASS as "parallel reduction splitK." It requires the execution of two kernels: the first is called partitionedK GEMM, and the second is called batched reduction.
+
+The partitionedK GEMM is very similar to one flavor of batched strided GEMM. Instead of requiring users to specify the problem size of each batch, partitionedK GEMM asks for the overall problem size and the number of partitions that will be applied along the K dimension of operands A and B. For example, parameters of m=128, n=128, k=4096 and partition=16 will result in 16 batched strided GEMMs with each batch of m=128, n=128, k=256. PartitionedK also allows the scenario where k is not divisible by the partition count. For example, parameters of m=128, n=128, k=4096 and partition=20 will result in 20 batched strided GEMMs, with the first 19 batches of m=128, n=128, k=4096/20=204 and the last batch of m=128, n=128, k=220.
+
+The batched reduction kernel then performs the reduction along the K dimension. Its input is the output (C) of the partitionedK GEMM; a workspace memory, managed by the user, stores this intermediate result.
+
+An example of splitK usage can be found [here](examples/06_splitK_gemm/splitK_gemm.cu).
+
+
 # Copyright
 
 Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
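
The partition arithmetic in the new text above can be checked with a few lines of host code. The helper below is illustrative, not part of CUTLASS; it just reproduces the sizing rule that the first partitions take k / partitions (rounded down) and the last one absorbs the remainder.

```cpp
#include <cstdio>

int main() {
  int k = 4096;
  int partitions = 20;

  int k_per_batch = k / partitions;                 // 4096 / 20 = 204
  int k_last = k - (partitions - 1) * k_per_batch;  // 4096 - 19 * 204 = 220

  std::printf("%d batches with k=%d, plus a final batch with k=%d\n",
              partitions - 1, k_per_batch, k_last);
  return 0;
}
```

For partition=16, the same rule reproduces the uniform k=256 batches quoted above, since 4096 divides evenly.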
@@ -335,4 +376,3 @@ Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```
-

README.md

Lines changed: 11 additions & 4 deletions
@@ -1,10 +1,10 @@
 ![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")
 
-# CUTLASS 1.1
+# CUTLASS 1.2
 
-_CUTLASS 1.1.0 - September 2018_
+_CUTLASS 1.2.0 - October 2018_
 
-CUTLASS 1.1 is a collection of CUDA C++ template abstractions for implementing
+CUTLASS is a collection of CUDA C++ template abstractions for implementing
 high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA.
 It incorporates strategies for hierarchical decomposition and data movement similar
 to those used to implement cuBLAS. CUTLASS decomposes these "moving parts" into
@@ -22,12 +22,19 @@ point (FP64) types. Furthermore, CUTLASS demonstrates CUDA's WMMA API for targe
 the programmable, high-throughput _Tensor Cores_ provided by NVIDIA's Volta architecture
 and beyond.
 
-CUTLASS 1.1 is described in the [CUTLASS Documentation](CUTLASS.md) and the accompanying
+CUTLASS 1.2 is described in the [CUTLASS Documentation](CUTLASS.md) and the accompanying
 [Doxygen documentation](https://nvidia.github.io/cutlass).
 We describe the structure of an efficient GEMM in our talk at the
 [GPU Technology Conference 2018](http://on-demand.gputechconf.com/gtc/2018/presentation/s8854-cutlass-software-primitives-for-dense-linear-algebra-at-all-levels-and-scales-within-cuda.pdf).
 
+# What's New in CUTLASS 1.2
+_October 2018_
+* [Parallelized Reductions](CUTLASS.md#parallel-reductions-across-gemm-k)
+* Batched strided WMMA GEMM
+
+
 # What's New in CUTLASS 1.1
+_September 2018_
 
 * [CUTLASS Documentation](CUTLASS.md)
 * [Examples](examples/)

cutlass/coord.h

Lines changed: 50 additions & 0 deletions
@@ -313,6 +313,56 @@ struct Coord {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+/// Scalar multiplication
+template <typename T, int Rank, typename Index>
+CUTLASS_HOST_DEVICE
+Coord<Rank, Index> operator*(T s, Coord<Rank, Index> coord) {
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = 0; i < Rank; ++i) {
+    coord[i] *= s;
+  }
+  return coord;
+}
+
+/// Scalar multiplication
+template <typename T, int Rank, typename Index>
+CUTLASS_HOST_DEVICE
+Coord<Rank, Index> operator*(Coord<Rank, Index> coord, T s) {
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = 0; i < Rank; ++i) {
+    coord[i] *= s;
+  }
+  return coord;
+}
+
+/// Scalar division
+template <typename T, int Rank, typename Index>
+CUTLASS_HOST_DEVICE
+Coord<Rank, Index> operator/(T s, Coord<Rank, Index> coord) {
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = 0; i < Rank; ++i) {
+    coord[i] = s / coord[i];
+  }
+  return coord;
+}
+
+/// Scalar division
+template <typename T, int Rank, typename Index>
+CUTLASS_HOST_DEVICE
+Coord<Rank, Index> operator/(Coord<Rank, Index> coord, T s) {
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = 0; i < Rank; ++i) {
+    coord[i] /= s;
+  }
+  return coord;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Integer-valued make_Coord
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 /// Helper to make a 2-element coordinate
 CUTLASS_HOST_DEVICE
 Coord<1> make_Coord(int _0) {
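
A quick host-side illustration of the operators added above; this is a sketch assuming the integer-valued `make_Coord` helpers that follow in this file (including a hypothetical three-argument overload alongside the one-argument version shown), compiled with nvcc. Expected values are in the comments.

```cpp
#include "cutlass/coord.h"

int main() {
  cutlass::Coord<3> extent = cutlass::make_Coord(2, 4, 8);

  cutlass::Coord<3> doubled = 2 * extent;  // elementwise: (4, 8, 16)
  cutlass::Coord<3> halved  = extent / 2;  // elementwise: (1, 2, 4)
  cutlass::Coord<3> ratio   = 8 / extent;  // scalar-first division maps each
                                           // element i to s / coord[i]: (4, 2, 1)
  (void)doubled;
  (void)halved;
  (void)ratio;
  return 0;
}
```

Note the asymmetry visible in the diff: `coord / s` divides each element by the scalar, while `s / coord` divides the scalar by each element.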

cutlass/cutlass.h

Lines changed: 2 additions & 16 deletions
@@ -32,7 +32,7 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 #define CUTLASS_MAJOR 1
-#define CUTLASS_MINOR 1
+#define CUTLASS_MINOR 2
 #define CUTLASS_PATCH 0
 #define CUTLASS_VERSION ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH)
 
@@ -49,21 +49,7 @@
 
 #define CUTLASS_ASSERT(x) assert(x)
 
-// CUTLASS_PRAGMA_(UNROLL|NO_UNROLL) optimization directives for the CUDA compiler.
-#if defined(__CUDA_ARCH__)
-#if defined(_MSC_VER)
-#define CUTLASS_PRAGMA_UNROLL __pragma("unroll")
-#define CUTLASS_PRAGMA_NO_UNROLL __pragma("unroll 1")
-#else
-#define CUTLASS_PRAGMA_UNROLL _Pragma("unroll")
-#define CUTLASS_PRAGMA_NO_UNROLL _Pragma("unroll 1")
-#endif
-#else
-#define CUTLASS_PRAGMA_UNROLL
-#define CUTLASS_PRAGMA_NO_UNROLL
-#endif
-
-#define CUTLASS_GEMM_LOOP CUTLASS_PRAGMA_NO_UNROLL
+#include "cutlass/util/performance_tuning.h"
 
 // A small helper class to dump a type at compile time
 // Usage:: DumpType<Class>::Class
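
Given the macros in the first hunk, the packed version value for this release works out to 1*100 + 2*10 + 0 = 120. A downstream project could pin it with a compile-time check such as the following sketch, which assumes only the macros shown above:

```cpp
#include "cutlass/cutlass.h"

// CUTLASS_VERSION packs major.minor.patch as major*100 + minor*10 + patch,
// so CUTLASS 1.2.0 yields 120.
static_assert(CUTLASS_VERSION == 120,
              "These kernels were written against CUTLASS 1.2.0");
```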
