Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
c5866aa
bump up cutlass upstream to 2.7
thakkarV Oct 20, 2021
6df6200
fixes for build against cutlass 2.7 version bump
thakkarV Oct 23, 2021
e968316
nits
thakkarV Oct 23, 2021
4ee508e
set and require C++14 for CUDA
thakkarV Oct 28, 2021
55b372c
generalize test generator and rename test files in prep for SM80
thakkarV Oct 30, 2021
6b7fa23
generalize bench generator and rename bench files in prep for SM80
thakkarV Oct 31, 2021
a40f7bc
add benchmark generator, generated code, cmake file for default configs
thakkarV Oct 31, 2021
2c6fa5e
rework DefualtSemiRingConfiguration struct API for clarity
thakkarV Nov 1, 2021
4de2c56
Add initial Ampere multi-stage templates
thakkarV Nov 3, 2021
7964328
add default config benchmarks for ampere
thakkarV Nov 3, 2021
bf1a4e2
add default srgemm configs for SM80 and bind to multistage kernel
thakkarV Nov 3, 2021
a7ba0cc
change CMake default CUDA arch to SM80
thakkarV Nov 3, 2021
62ae551
add default test generator, SM50 defualt tests
thakkarV Nov 11, 2021
e6c8962
remove cuASR CUDA Archs and use CMAKE_CUDA_ARCHITECTURES
thakkarV Dec 23, 2021
7d67137
save point for operator struct migration
thakkarV Apr 25, 2022
3757ee7
WiP: working basic min-plus srgemm example
thakkarV Jul 28, 2022
fc5b3a0
cuASR 2.0: Initial all test passing
thakkarV Aug 8, 2022
91f0934
cuASR 2.0: add CUASR_CUDA_ARCHS cmake flag
thakkarV Aug 8, 2022
4dde9d8
update readme for 2.0
thakkarV Aug 8, 2022
4090562
update cutlass dep to 2.9.1
thakkarV Aug 8, 2022
38f5956
Update license headers
thakkarV Aug 20, 2022
357c186
nits
thakkarV Aug 21, 2022
91ca723
fix fragment init for multistage mainloop
thakkarV Aug 21, 2022
18afb74
add SM80 multistage default config tests and benchmark
thakkarV Aug 21, 2022
fd18cec
naming nits
thakkarV Aug 21, 2022
10db4e6
add int32_t tests and benchmarks
thakkarV Aug 21, 2022
a597767
better user definined srgemm example
thakkarV Aug 21, 2022
113872c
add license file
thakkarV Aug 22, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
17 changes: 12 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.13)
cmake_minimum_required(VERSION 3.18)
project(cuASR CUDA CXX)

# RELEASE config by default if none is provided:
Expand All @@ -22,13 +22,15 @@ option(CUASR_TEST "Build cuASR test suite. Use with CUASR_TEST_LEVEL={0|1|2}.
option(CUASR_BENCH "Build cuASR benchmark suite." ON)
option(CUASR_EXAMPLE "Build cuASR examples." ON)

# By default, build fat binaries. TODO add sm_80 here
option(CUASR_CUDA_ARCHS "List of CUDA architectures to compile for." "60 61 70 72 75")

# CUDA native compiler (nvcc) only supports up to C++14 for now
find_package(CUDA REQUIRED)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)

# C++ compiler flags for target compile options
set(cuASR_CXX_FLAGS -Wall -Wextra -Wno-unused-parameter -Wno-uninitialized -Wno-strict-aliasing)
Expand All @@ -41,7 +43,11 @@ set(cuASR_CUDA_FLAGS --expt-relaxed-constexpr)
set(cuASR_CUDA_FLAGS_DEBUG -G ${cuASR_CUDA_FLAGS})
set(cuASR_CUDA_FLAGS_RELEASE -O3 ${cuASR_CUDA_FLAGS})
set(cuASR_CUDA_FLAGS_RELWITHDEBINFO -G ${cuASR_CUDA_FLAGS})
set(CMAKE_CUDA_ARCHITECTURES ${CUASR_CUDA_ARCHS})
if(NOT DEFINED CUASR_CUDA_ARCHS)
set(CMAKE_CUDA_ARCHITECTURES 80)
else()
set(CMAKE_CUDA_ARCHITECTURES ${CUASR_CUDA_ARCHS})
endif()

# the sub-modules update themselves with git, so find git
find_package(Git QUIET)
Expand Down Expand Up @@ -94,6 +100,7 @@ message(STATUS " C++ Compiler : ${CMAKE_CXX_COMPILER}")
message(STATUS " C++ Compiler version : ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS " CUDA Compiler : ${CMAKE_CUDA_COMPILER}")
message(STATUS " CUDA Compiler version: ${CMAKE_CUDA_COMPILER_VERSION}")
message(STATUS " CUDA Arch support : ${CMAKE_CUDA_ARCHITECTURES}")
message(STATUS " Build tests : ${CUASR_TEST}")
message(STATUS " Test level : ${CUASR_TEST_LEVEL}")
message(STATUS " Build benchmarks : ${CUASR_BENCH}")
Expand Down
28 changes: 28 additions & 0 deletions LICENSE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Copyright (c) 2020 - 2022 Vijay Thakkar.
Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: BSD-3-Clause

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
89 changes: 44 additions & 45 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ Normally, BLAS libraries are defined as operations over real numbers,`+` and `x`
cuASR is a template library and therefore header only, but includes an exhaustive list of tests and benchmarks. The build system is based on `CMake`. Basic checkout and build instructions are as follows:

```sh
$ git clone --recurse-submodules https://github.com/hpcgarage/semiring-gemm /path/to/repo
$ git clone --recurse-submodules https://github.com/hpcgarage/cuASR /path/to/repo
$ cd /path/to/repo
$ mkdir build && cd build
$ cmake .. -G Ninja -DCUASR_CUDA_ARCHS="70 75"
$ cmake .. -G Ninja -DCMAKE_CUDA_ARCHITECTURES="70 75 80"
$ ninja
```

Expand All @@ -34,7 +34,7 @@ Notable build flags:

| Build Flag | Usage Description |
|-|-|
| `CUASR_CUDA_ARCHS` | lists the CUDA SM architectures the fat binaries should be built to target. `CUASR_CUDA_ARCHS="60 61 70 72 75"` (all Pascal and Volta GPUs) will be used if no value is specified, but this can really hurt compile times for tests and benchmarks; Limit CUDA architectures to the smallest subset you foresee running the tests and benchmarks on.
| `CMAKE_CUDA_ARCHITECTURES` | lists the CUDA SM architectures the fat binaries should be built to target. `CMAKE_CUDA_ARCHITECTURES="80"` (Ampere) will be used if no value is specified, but this can really hurt compile times for tests and benchmarks; Limit CUDA architectures to the smallest subset you foresee running the tests and benchmarks on.
| `CUASR_TEST` | Set to `ON` by default and controls whether tests will be built or not. Set to `OFF` to disable building all tests. |
| `CUASR_BENCH` | Set to `ON` by default and controls whether benchmarks will be built or not. Set to `OFF` to disable building all benchmarks. |
| `CUASR_EXAMPLES` | Set to `ON` by default and controls whether examples will be built or not. Set to `OFF` to disable building all examples. |
Expand Down Expand Up @@ -92,33 +92,30 @@ auto cuasr_minplus_srsgemm_nt(
bool do_epilogue_min,
cudaStream_t stream = nullptr) -> int {
// compile time configuration of this srgemm kernel
using OperatorClass = cutlass::arch::OpClassSimt;
using SmArch = cutlass::arch::Sm50;
using AdditionOp = cuasr::minimum<float>;
using MultiplicationOp = cuasr::plus<float>;
using OperatorClass = cutlass::arch::OpClassSimt;
using SmArch = cutlass::arch::Sm50;
using RingOp = cuasr::min_plus<float>;

using TropicalConfig = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
float, float, float, float, OperatorClass, //
AdditionOp, MultiplicationOp, SmArch>;
RingOp, SmArch>;

using ColumnMajor = cutlass::layout::ColumnMajor;
using RowMajor = cutlass::layout::RowMajor;

using cuASR_MinPlus_SGEMM = cuasr::gemm::device::Srgemm<
AdditionOp, // Thread level semiring add operator
MultiplicationOp, // Thread level semiRing multiply operator
RingOp, // Thread level SemiRing operator
float, // element type of A
ColumnMajor, // layout of A
float, // element type of B
RowMajor, // layout of B
float, // element type of C
RowMajor, // layout of C
float // element type of D
>;
>;

float alpha = MultiplicationOp::Identity;
float beta
= do_epilogue_min ? MultiplicationOp::Identity : MultiplicationOp::Annihilator;
int alpha = RingOp::MultIdentity;
int beta = do_epilogue_min ? RingOp::MultIdentity : RingOp::MultAnnihilator;

// construct kernel arguments struct
cuASR_MinPlus_SGEMM::Arguments args(
Expand Down Expand Up @@ -161,28 +158,26 @@ After the operator struct is defined, the rest is some simple boilerplate for in
The code excerpt below is taken from [`examples/01_userdefined_semiring`](examples/01_userdefined_semiring/userdefined_semiring.cu).

```cpp
template <typename T, int N = 1>
struct binary_xor {
static T constexpr Identity = static_cast<T>(false);
// scalar operator
template <class T>
struct xor_and {
static T constexpr AddIdentity = static_cast<T>(false);
static T constexpr MultIdentity = static_cast<T>(true);
static T constexpr MultAnnihilator = static_cast<T>(false);

__host__ __device__
T operator()(T lhs, T const &rhs) const {
lhs ^= rhs;
return lhs;
void fma(T& dst, T const lhs, T const rhs, T const src) const {
dst = add(src, mult(lhs, rhs));
}

__host__ __device__
cutlass::Array<T, N>
operator()(cutlass::Array<T, N> const &lhs, cutlass::Array<T, N> const &rhs) const {
cutlass::Array<T, N> result;
#pragma unroll
for (int i = 0; i < N; ++i) {
result[i] = this->operator()(lhs[i], rhs[i]);
}
return result;
T add(T const lhs, T const rhs) const {
return lhs ^ rhs;
}

// ... other overloads for cutlass::Array<T, N> here ...
__host__ __device__
T mult(T const lhs, T const rhs) const {
return lhs && rhs;
}
};

// GF(2) xor-and SRGEMM
Expand All @@ -203,10 +198,9 @@ auto cuasr_gf_srgemm_nnn(
using OperatorClass = cutlass::arch::OpClassSimt;
using SmArch = cutlass::arch::Sm50;

using AdditionOp = binary_xor<int>;
using MultiplicationOp = cuasr::binary_and<int>;
using RingOp = xor_and<int>;
using EpilogueOutputOp = cuasr::epilogue::thread::SemiringLinearCombination<
AdditionOp, MultiplicationOp, int, 1>;
RingOp, int, 1>;

static int constexpr AlignmentA = 1;
static int constexpr AlignmentB = 1;
Expand All @@ -220,8 +214,7 @@ auto cuasr_gf_srgemm_nnn(
using RowMajor = cutlass::layout::RowMajor;

using cuASRGaloisFieldSrgemm = cuasr::gemm::device::Srgemm<
AdditionOp, // Thread level SemiRing operator
MultiplicationOp, // Thread level SemiRing operator
RingOp, // Thread level SemiRing operator
int, // element type of A
RowMajor, // layout of A
int, // element type of B
Expand All @@ -242,8 +235,8 @@ auto cuasr_gf_srgemm_nnn(
false // SplitKSerial
>;

int alpha = MultiplicationOp::Identity;
int beta = do_epilogue_and ? MultiplicationOp::Identity : MultiplicationOp::Annihilator;
int alpha = RingOp::MultIdentity;
int beta = do_epilogue_and ? RingOp::MultIdentity : RingOp::MultAnnihilator;

// construct kernel arguments struct
cuASRGaloisFieldSrgemm::Arguments args(
Expand Down Expand Up @@ -291,24 +284,30 @@ When a device level SRGEMM template, `cuasr::gemm::device::Srgemm`, is instantia
namespace cuasr::arch {
template <
// ... datatype and GEMM shape template params
typename AdditionOp,
typename MultiplicationOp
typename RingOp
>
struct Srmma {
struct Srmma<
cutlass::gemm::GemmShape<1, 1, 1>,
1,
ElementA,
LayoutA,
ElementB,
LayoutB,
ElementC,
LayoutC,
RingOp> {
using Shape = cutlass::gemm::GemmShape<1, 1, 1>;

// operators must be default constructible and contain a binary operator()
AdditionOp add;
MultiplicationOp mult;
RingOp ring_op;

__host__ __device__
CUTLASS_HOST_DEVICE
void operator()(
cutlass::Array<ElementC, 1> &d,
cutlass::Array<ElementA, 1> const &a,
cutlass::Array<ElementB, 1> const &b,
cutlass::Array<ElementC, 1> const &c
) {
d[0] = add(c[0], mult(a[0], b[0]));
ring_op.fma(d[0], a[0], b[0], c[0]);
}
};
}
Expand Down
47 changes: 42 additions & 5 deletions bench/device/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,23 +1,60 @@
file(GLOB SIMT_BENCH_SRCS CONFIGURE_DEPENDS *.cu)
add_executable(cuasr_bench_srgemm_device
# SM50 default configurations
add_executable(cuasr_bench_srgemm_device_sm50_defaults
sm50_defaults.cu
)
target_include_directories(
cuasr_bench_srgemm_device_sm50_defaults
PRIVATE
${PROJECT_SOURCE_DIR}/include/
${PROJECT_SOURCE_DIR}/tools/include/
${PROJECT_SOURCE_DIR}/cutlass/include/
${PROJECT_SOURCE_DIR}/cutlass/tools/util/include/
)
target_link_libraries(cuasr_bench_srgemm_device_sm50_defaults
benchmark
benchmark_main
${cuASR_LIB_NAME}
)

# SM80 default configurations
add_executable(cuasr_bench_srgemm_device_sm80_defaults
sm80_defaults.cu
)
target_include_directories(
cuasr_bench_srgemm_device_sm80_defaults
PRIVATE
${PROJECT_SOURCE_DIR}/include/
${PROJECT_SOURCE_DIR}/tools/include/
${PROJECT_SOURCE_DIR}/cutlass/include/
${PROJECT_SOURCE_DIR}/cutlass/tools/util/include/
)
target_link_libraries(cuasr_bench_srgemm_device_sm80_defaults
benchmark
benchmark_main
${cuASR_LIB_NAME}
)

# All shmoo benchmarks
file(GLOB SIMT_BENCH_SRCS CONFIGURE_DEPENDS sm50_simt_*.cu)
add_executable(cuasr_bench_srgemm_device_shmoo
${SIMT_BENCH_SRCS}
)
target_include_directories(
cuasr_bench_srgemm_device
cuasr_bench_srgemm_device_shmoo
PRIVATE
${PROJECT_SOURCE_DIR}/include/
${PROJECT_SOURCE_DIR}/tools/include/
${PROJECT_SOURCE_DIR}/cutlass/include/
${PROJECT_SOURCE_DIR}/cutlass/tools/util/include/
)
target_link_libraries(cuasr_bench_srgemm_device
target_link_libraries(cuasr_bench_srgemm_device_shmoo
benchmark
benchmark_main
${cuASR_LIB_NAME}
)
if(NOT DEFINED CUASR_BENCH_LEVEL)
set(CUASR_BENCH_LEVEL 0)
endif()
target_compile_definitions(cuasr_bench_srgemm_device
target_compile_definitions(cuasr_bench_srgemm_device_shmoo
PRIVATE CUASR_BENCH_LEVEL=${CUASR_BENCH_LEVEL}
)
Loading