Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
c5866aa
bump up cutlass upstream to 2.7
thakkarV Oct 20, 2021
6df6200
fixes for build against cutlass 2.7 version bump
thakkarV Oct 23, 2021
e968316
nits
thakkarV Oct 23, 2021
4ee508e
set and require C++14 for CUDA
thakkarV Oct 28, 2021
55b372c
generalize test generator and rename test files in prep for SM80
thakkarV Oct 30, 2021
6b7fa23
generalize bench generator and rename bench files in prep for SM80
thakkarV Oct 31, 2021
a40f7bc
add benchmark generator, generated code, cmake file for default configs
thakkarV Oct 31, 2021
2c6fa5e
rework DefualtSemiRingConfiguration struct API for clarity
thakkarV Nov 1, 2021
4de2c56
Add initial Ampere multi-stage templates
thakkarV Nov 3, 2021
7964328
add default config benchmarks for ampere
thakkarV Nov 3, 2021
bf1a4e2
add default srgemm configs for SM80 and bind to multistage kernel
thakkarV Nov 3, 2021
a7ba0cc
change CMake default CUDA arch to SM80
thakkarV Nov 3, 2021
62ae551
add default test generator, SM50 defualt tests
thakkarV Nov 11, 2021
e6c8962
remove cuASR CUDA Archs and use CMAKE_CUDA_ARCHITECTURES
thakkarV Dec 23, 2021
7d67137
save point for operator struct migration
thakkarV Apr 25, 2022
3757ee7
WiP: working basic min-plus srgemm example
thakkarV Jul 28, 2022
fc5b3a0
cuASR 2.0: Initial all test passing
thakkarV Aug 8, 2022
91f0934
cuASR 2.0: add CUASR_CUDA_ARCHS cmake flag
thakkarV Aug 8, 2022
4dde9d8
update readme for 2.0
thakkarV Aug 8, 2022
4090562
update cutlass dep to 2.9.1
thakkarV Aug 8, 2022
38f5956
Update license headers
thakkarV Aug 20, 2022
357c186
nits
thakkarV Aug 21, 2022
91ca723
fix fragment init for multistage mainloop
thakkarV Aug 21, 2022
18afb74
add SM80 multistage default config tests and benchmark
thakkarV Aug 21, 2022
fd18cec
naming nits
thakkarV Aug 21, 2022
10db4e6
add int32_t tests and benchmarks
thakkarV Aug 21, 2022
a597767
better user definined srgemm example
thakkarV Aug 21, 2022
113872c
add license file
thakkarV Aug 22, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
17 changes: 12 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.13)
cmake_minimum_required(VERSION 3.18)
project(cuASR CUDA CXX)

# RELEASE config by default if none is provided:
Expand All @@ -22,13 +22,15 @@ option(CUASR_TEST "Build cuASR test suite. Use with CUASR_TEST_LEVEL={0|1|2}.
option(CUASR_BENCH "Build cuASR benchmark suite." ON)
option(CUASR_EXAMPLE "Build cuASR examples." ON)

# By default, build fat binaries. TODO add sm_80 here
option(CUASR_CUDA_ARCHS "List of CUDA architectures to compile for." "60 61 70 72 75")

# CUDA native compiler (nvcc) only supports up to C++14 for now
find_package(CUDA REQUIRED)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)

# C++ compiler flags for target compile options
set(cuASR_CXX_FLAGS -Wall -Wextra -Wno-unused-parameter -Wno-uninitialized -Wno-strict-aliasing)
Expand All @@ -41,7 +43,11 @@ set(cuASR_CUDA_FLAGS --expt-relaxed-constexpr)
set(cuASR_CUDA_FLAGS_DEBUG -G ${cuASR_CUDA_FLAGS})
set(cuASR_CUDA_FLAGS_RELEASE -O3 ${cuASR_CUDA_FLAGS})
set(cuASR_CUDA_FLAGS_RELWITHDEBINFO -G ${cuASR_CUDA_FLAGS})
set(CMAKE_CUDA_ARCHITECTURES ${CUASR_CUDA_ARCHS})
if(NOT DEFINED CUASR_CUDA_ARCHS)
set(CMAKE_CUDA_ARCHITECTURES 80)
else()
set(CMAKE_CUDA_ARCHITECTURES ${CUASR_CUDA_ARCHS})
endif()

# the sub-modules update themselves with git, so find git
find_package(Git QUIET)
Expand Down Expand Up @@ -94,6 +100,7 @@ message(STATUS " C++ Compiler : ${CMAKE_CXX_COMPILER}")
message(STATUS " C++ Compiler version : ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS " CUDA Compiler : ${CMAKE_CUDA_COMPILER}")
message(STATUS " CUDA Compiler version: ${CMAKE_CUDA_COMPILER_VERSION}")
message(STATUS " CUDA Arch support : ${CMAKE_CUDA_ARCHITECTURES}")
message(STATUS " Build tests : ${CUASR_TEST}")
message(STATUS " Test level : ${CUASR_TEST_LEVEL}")
message(STATUS " Build benchmarks : ${CUASR_BENCH}")
Expand Down
28 changes: 28 additions & 0 deletions LICENSE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Copyright (c) 2020 - 2022 Vijay Thakkar.
Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: BSD-3-Clause

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
89 changes: 44 additions & 45 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ Normally, BLAS libraries are defined as operations over real numbers,`+` and `x`
cuASR is a template library and therefore header only, but includes an exhaustive list of tests and benchmarks. The build system is based on `CMake`. Basic checkout and build instructions are as follows:

```sh
$ git clone --recurse-submodules https://github.com/hpcgarage/semiring-gemm /path/to/repo
$ git clone --recurse-submodules https://github.com/hpcgarage/cuASR /path/to/repo
$ cd /path/to/repo
$ mkdir build && cd build
$ cmake .. -G Ninja -DCUASR_CUDA_ARCHS="70 75"
$ cmake .. -G Ninja -DCMAKE_CUDA_ARCHITECTURES="70 75 80"
$ ninja
```

Expand All @@ -34,7 +34,7 @@ Notable build flags:

| Build Flag | Usage Description |
|-|-|
| `CUASR_CUDA_ARCHS` | lists the CUDA SM architectures the fat binaries should be built to target. `CUASR_CUDA_ARCHS="60 61 70 72 75"` (all Pascal and Volta GPUs) will be used if no value is specified, but this can really hurt compile times for tests and benchmarks; Limit CUDA architectures to the smallest subset you foresee running the tests and benchmarks on.
| `CMAKE_CUDA_ARCHITECTURES` | lists the CUDA SM architectures the fat binaries should be built to target. `CMAKE_CUDA_ARCHITECTURES="80"` (Ampere) will be used if no value is specified, but this can really hurt compile times for tests and benchmarks; Limit CUDA architectures to the smallest subset you foresee running the tests and benchmarks on.
| `CUASR_TEST` | Set to `ON` by default and controls whether tests will be built or not. Set to `OFF` to disable building all tests. |
| `CUASR_BENCH` | Set to `ON` by default and controls whether benchmarks will be built or not. Set to `OFF` to disable building all benchmarks. |
| `CUASR_EXAMPLES` | Set to `ON` by default and controls whether examples will be built or not. Set to `OFF` to disable building all examples. |
Expand Down Expand Up @@ -92,33 +92,30 @@ auto cuasr_minplus_srsgemm_nt(
bool do_epilogue_min,
cudaStream_t stream = nullptr) -> int {
// compile time configuration of this srgemm kernel
using OperatorClass = cutlass::arch::OpClassSimt;
using SmArch = cutlass::arch::Sm50;
using AdditionOp = cuasr::minimum<float>;
using MultiplicationOp = cuasr::plus<float>;
using OperatorClass = cutlass::arch::OpClassSimt;
using SmArch = cutlass::arch::Sm50;
using RingOp = cuasr::min_plus<float>;

using TropicalConfig = typename cuasr::gemm::device::DefaultSemiRingConfiguration<
float, float, float, float, OperatorClass, //
AdditionOp, MultiplicationOp, SmArch>;
RingOp, SmArch>;

using ColumnMajor = cutlass::layout::ColumnMajor;
using RowMajor = cutlass::layout::RowMajor;

using cuASR_MinPlus_SGEMM = cuasr::gemm::device::Srgemm<
AdditionOp, // Thread level semiring add operator
MultiplicationOp, // Thread level semiRing multiply operator
RingOp, // Thread level SemiRing operator
float, // element type of A
ColumnMajor, // layout of A
float, // element type of B
RowMajor, // layout of B
float, // element type of C
RowMajor, // layout of C
float // element type of D
>;
>;

float alpha = MultiplicationOp::Identity;
float beta
= do_epilogue_min ? MultiplicationOp::Identity : MultiplicationOp::Annihilator;
int alpha = RingOp::MultIdentity;
int beta = do_epilogue_min ? RingOp::MultIdentity : RingOp::MultAnnihilator;

// construct kernel arguments struct
cuASR_MinPlus_SGEMM::Arguments args(
Expand Down Expand Up @@ -161,28 +158,26 @@ After the operator struct is defined, the rest is some simple boilerplate for in
The code excerpt below is taken from [`examples/01_userdefined_semiring`](examples/01_userdefined_semiring/userdefined_semiring.cu).

```cpp
template <typename T, int N = 1>
struct binary_xor {
static T constexpr Identity = static_cast<T>(false);
// scalar operator
template <class T>
struct xor_and {
static T constexpr AddIdentity = static_cast<T>(false);
static T constexpr MultIdentity = static_cast<T>(true);
static T constexpr MultAnnihilator = static_cast<T>(false);

__host__ __device__
T operator()(T lhs, T const &rhs) const {
lhs ^= rhs;
return lhs;
void fma(T& dst, T const lhs, T const rhs, T const src) const {
dst = add(src, mult(lhs, rhs));
}

__host__ __device__
cutlass::Array<T, N>
operator()(cutlass::Array<T, N> const &lhs, cutlass::Array<T, N> const &rhs) const {
cutlass::Array<T, N> result;
#pragma unroll
for (int i = 0; i < N; ++i) {
result[i] = this->operator()(lhs[i], rhs[i]);
}
return result;
T add(T const lhs, T const rhs) const {
return lhs ^ rhs;
}

// ... other overloads for cutlass::Array<T, N> here ...
__host__ __device__
T mult(T const lhs, T const rhs) const {
return lhs && rhs;
}
};

// GF(2) xor-and SRGEMM
Expand All @@ -203,10 +198,9 @@ auto cuasr_gf_srgemm_nnn(
using OperatorClass = cutlass::arch::OpClassSimt;
using SmArch = cutlass::arch::Sm50;

using AdditionOp = binary_xor<int>;
using MultiplicationOp = cuasr::binary_and<int>;
using RingOp = xor_and<int>;
using EpilogueOutputOp = cuasr::epilogue::thread::SemiringLinearCombination<
AdditionOp, MultiplicationOp, int, 1>;
RingOp, int, 1>;

static int constexpr AlignmentA = 1;
static int constexpr AlignmentB = 1;
Expand All @@ -220,8 +214,7 @@ auto cuasr_gf_srgemm_nnn(
using RowMajor = cutlass::layout::RowMajor;

using cuASRGaloisFieldSrgemm = cuasr::gemm::device::Srgemm<
AdditionOp, // Thread level SemiRing operator
MultiplicationOp, // Thread level SemiRing operator
RingOp, // Thread level SemiRing operator
int, // element type of A
RowMajor, // layout of A
int, // element type of B
Expand All @@ -242,8 +235,8 @@ auto cuasr_gf_srgemm_nnn(
false // SplitKSerial
>;

int alpha = MultiplicationOp::Identity;
int beta = do_epilogue_and ? MultiplicationOp::Identity : MultiplicationOp::Annihilator;
int alpha = RingOp::MultIdentity;
int beta = do_epilogue_and ? RingOp::MultIdentity : RingOp::MultAnnihilator;

// construct kernel arguments struct
cuASRGaloisFieldSrgemm::Arguments args(
Expand Down Expand Up @@ -291,24 +284,30 @@ When a device level SRGEMM template, `cuasr::gemm::device::Srgemm`, is instantia
namespace cuasr::arch {
template <
// ... datatype and GEMM shape template params
typename AdditionOp,
typename MultiplicationOp
typename RingOp
>
struct Srmma {
struct Srmma<
cutlass::gemm::GemmShape<1, 1, 1>,
1,
ElementA,
LayoutA,
ElementB,
LayoutB,
ElementC,
LayoutC,
RingOp> {
using Shape = cutlass::gemm::GemmShape<1, 1, 1>;

// operators must be default constructible and contain a binary operator()
AdditionOp add;
MultiplicationOp mult;
RingOp ring_op;

__host__ __device__
CUTLASS_HOST_DEVICE
void operator()(
cutlass::Array<ElementC, 1> &d,
cutlass::Array<ElementA, 1> const &a,
cutlass::Array<ElementB, 1> const &b,
cutlass::Array<ElementC, 1> const &c
) {
d[0] = add(c[0], mult(a[0], b[0]));
ring_op.fma(d[0], a[0], b[0], c[0]);
}
};
}
Expand Down
47 changes: 42 additions & 5 deletions bench/device/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,23 +1,60 @@
file(GLOB SIMT_BENCH_SRCS CONFIGURE_DEPENDS *.cu)
add_executable(cuasr_bench_srgemm_device
# SM50 default configurations
add_executable(cuasr_bench_srgemm_device_sm50_defaults
sm50_defaults.cu
)
target_include_directories(
cuasr_bench_srgemm_device_sm50_defaults
PRIVATE
${PROJECT_SOURCE_DIR}/include/
${PROJECT_SOURCE_DIR}/tools/include/
${PROJECT_SOURCE_DIR}/cutlass/include/
${PROJECT_SOURCE_DIR}/cutlass/tools/util/include/
)
target_link_libraries(cuasr_bench_srgemm_device_sm50_defaults
benchmark
benchmark_main
${cuASR_LIB_NAME}
)

# SM80 default configurations
add_executable(cuasr_bench_srgemm_device_sm80_defaults
sm80_defaults.cu
)
target_include_directories(
cuasr_bench_srgemm_device_sm80_defaults
PRIVATE
${PROJECT_SOURCE_DIR}/include/
${PROJECT_SOURCE_DIR}/tools/include/
${PROJECT_SOURCE_DIR}/cutlass/include/
${PROJECT_SOURCE_DIR}/cutlass/tools/util/include/
)
target_link_libraries(cuasr_bench_srgemm_device_sm80_defaults
benchmark
benchmark_main
${cuASR_LIB_NAME}
)

# All shmoo benchmarks
file(GLOB SIMT_BENCH_SRCS CONFIGURE_DEPENDS sm50_simt_*.cu)
add_executable(cuasr_bench_srgemm_device_shmoo
${SIMT_BENCH_SRCS}
)
target_include_directories(
cuasr_bench_srgemm_device
cuasr_bench_srgemm_device_shmoo
PRIVATE
${PROJECT_SOURCE_DIR}/include/
${PROJECT_SOURCE_DIR}/tools/include/
${PROJECT_SOURCE_DIR}/cutlass/include/
${PROJECT_SOURCE_DIR}/cutlass/tools/util/include/
)
target_link_libraries(cuasr_bench_srgemm_device
target_link_libraries(cuasr_bench_srgemm_device_shmoo
benchmark
benchmark_main
${cuASR_LIB_NAME}
)
if(NOT DEFINED CUASR_BENCH_LEVEL)
set(CUASR_BENCH_LEVEL 0)
endif()
target_compile_definitions(cuasr_bench_srgemm_device
target_compile_definitions(cuasr_bench_srgemm_device_shmoo
PRIVATE CUASR_BENCH_LEVEL=${CUASR_BENCH_LEVEL}
)
Loading