Skip to content

Commit 1e83dd2

Browse files
add fp8 gemm.
1 parent 54edfc5 commit 1e83dd2

23 files changed

Lines changed: 2352 additions & 123 deletions

custom_ops/gpu_ops/cutlass_kernels/cutlass_helper.h

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,59 @@ struct enable_sm90_or_later : Kernel {
4848
}
4949
};
5050

// SM90: gate that emits the wrapped kernel body only when compiling device
// code for SM90 (Hopper). For every other target architecture operator()
// compiles to an empty stub, so the kernel can live in a multi-arch
// fatbinary without instantiating unsupported instructions.
template <typename Kernel>
struct enable_sm90_only : Kernel {
  template <typename... Args>
  CUTLASS_DEVICE void operator()(Args &&...args) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 900)
    Kernel::operator()(std::forward<Args>(args)...);
#endif
  }
};
// SM100f: gate for the SM100 family — SM100 (Blackwell GB200) and SM103
// (GB10x). The wrapped kernel body is emitted only for those architecture
// values; on any other target operator() is an empty stub.
template <typename Kernel>
struct enable_sm100f_only : Kernel {
  template <typename... Args>
  CUTLASS_DEVICE void operator()(Args &&...args) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 1000 || __CUDA_ARCH__ == 1030)
    Kernel::operator()(std::forward<Args>(args)...);
#endif
  }
};
// SM120: gate that emits the wrapped kernel body only when compiling for
// SM120 (RTX 5090). Everywhere else operator() is an empty stub.
template <typename Kernel>
struct enable_sm120_only : Kernel {
  template <typename... Args>
  CUTLASS_DEVICE void operator()(Args &&...args) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 1200)
    Kernel::operator()(std::forward<Args>(args)...);
#endif
  }
};
// SM12x family: gate covering the whole SM12x range — SM120 (RTX 5090) and
// SM121 (DGX Spark). The wrapped kernel body is emitted only for
// 1200 <= __CUDA_ARCH__ < 1300; otherwise operator() is an empty stub.
template <typename Kernel>
struct enable_sm120_family : Kernel {
  template <typename... Args>
  CUTLASS_DEVICE void operator()(Args &&...args) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1200 && __CUDA_ARCH__ < 1300)
    Kernel::operator()(std::forward<Args>(args)...);
#endif
  }
};
// Returns the compute capability of the current CUDA device encoded as
// major * 10 + minor (e.g. 90 for SM90, 100 for SM100, 103 for SM103,
// 120 for SM120).
//
// Returns 0 when the device or its attributes cannot be queried, so callers
// that compare the result against a concrete SM number simply fail to
// match. (The previous version ignored every cudaError_t; on failure it
// also happened to return 0 because major/minor stayed zero — the checks
// below make that contract explicit instead of accidental.)
inline int32_t get_sm_version_num() {
  int device = -1;
  if (cudaGetDevice(&device) != cudaSuccess) {
    return 0;
  }
  int major = 0, minor = 0;
  if (cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor,
                             device) != cudaSuccess ||
      cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor,
                             device) != cudaSuccess) {
    return 0;
  }
  return major * 10 + minor;
}
51104
template <paddle::DataType D>
52105
class CutlassDtypeTraits;
53106

custom_ops/gpu_ops/cutlass_kernels/w8a8/c3x/scaled_mm.cuh

Lines changed: 83 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,7 @@ namespace fastdeploy {
3434

3535
template <typename ElementAB_,
3636
typename ElementD_,
37-
template <typename, typename, typename>
38-
typename Epilogue_,
37+
template <typename, typename, typename> typename Epilogue_,
3938
typename TileShape,
4039
typename ClusterShape,
4140
typename KernelSchedule,
@@ -57,7 +56,8 @@ struct cutlass_3x_gemm {
5756
// These are the minimum alignments needed for the kernels to compile
5857
static constexpr int AlignmentAB =
5958
128 / cutlass::sizeof_bits<ElementAB>::value;
60-
static constexpr int AlignmentCD = 4;
59+
static constexpr int AlignmentCD =
60+
128 / cutlass::sizeof_bits<ElementD>::value;
6161

6262
using CollectiveEpilogue =
6363
typename cutlass::epilogue::collective::CollectiveBuilder<
@@ -104,8 +104,7 @@ struct cutlass_3x_gemm {
104104

105105
template <typename ElementAB_,
106106
typename ElementD_,
107-
template <typename, typename, typename>
108-
typename Epilogue_,
107+
template <typename, typename, typename> typename Epilogue_,
109108
typename TileShape,
110109
typename ClusterShape,
111110
typename KernelSchedule,
@@ -180,11 +179,88 @@ struct cutlass_3x_gemm_sm100 {
180179
sizeof(typename CollectiveEpilogue::SharedStorage))>,
181180
KernelSchedule>::CollectiveOp;
182181

183-
using GemmKernel =
182+
using GemmKernel = enable_sm100f_only<
184183
cutlass::gemm::kernel::GemmUniversal<Shape<int, int, int, int>,
185184
CollectiveMainloop,
186185
CollectiveEpilogue,
187-
void>;
186+
void>>;
187+
};
188+
// CUTLASS 3.x GEMM kernel configuration for SM120 (RTX 5090).
// Builds the collective mainloop and epilogue with the Sm120 architecture
// tag and wraps the kernel in enable_sm120_only, so the device body is only
// emitted when compiling for SM120.
//
// Template parameters:
//   ElementAB_       element type of operands A and B (e.g. fp8, int8_t)
//   ElementD_        element type of output D
//   Epilogue_        epilogue policy template taking (acc, out, tile shape)
//   TileShape        CTA tile shape
//   ClusterShape     thread-block cluster shape
//   KernelSchedule   mainloop schedule policy
//   EpilogueSchedule epilogue schedule policy
template <typename ElementAB_,
          typename ElementD_,
          template <typename, typename, typename> typename Epilogue_,
          typename TileShape,
          typename ClusterShape,
          typename KernelSchedule,
          typename EpilogueSchedule>
struct cutlass_3x_gemm_sm120 {
  using ElementAB = ElementAB_;
  // A is row-major; alignment is one 128-bit access expressed in elements.
  using LayoutA = cutlass::layout::RowMajor;
  static constexpr int AlignmentA =
      128 / cutlass::sizeof_bits<ElementAB>::value;

  // B is column-major with the same 128-bit alignment.
  using LayoutB = cutlass::layout::ColumnMajor;
  static constexpr int AlignmentB =
      128 / cutlass::sizeof_bits<ElementAB>::value;

  // No source C tensor (void); the alignment is still derived from the
  // output element type so the epilogue's vectorized stores line up.
  using ElementC = void;
  using LayoutC = cutlass::layout::RowMajor;
  static constexpr int AlignmentC =
      128 / cutlass::sizeof_bits<ElementD_>::value;

  using ElementD = ElementD_;
  using LayoutD = cutlass::layout::RowMajor;
  static constexpr int AlignmentD = AlignmentC;

  // Epilogue accumulator element: int32 for int8 inputs, float otherwise.
  using ElementAcc = typename std::
      conditional<std::is_same_v<ElementAB, int8_t>, int32_t, float>::type;
  using Epilogue = Epilogue_<ElementAcc, ElementD, TileShape>;

  using ElementAccumulator = float;
  using ElementCompute = float;

  // Epilogue visitor tree supplied by the Epilogue_ policy.
  using EVTCompute = typename Epilogue::EVTCompute;

  using CollectiveEpilogue =
      typename cutlass::epilogue::collective::CollectiveBuilder<
          cutlass::arch::Sm120,
          cutlass::arch::OpClassTensorOp,
          TileShape,
          ClusterShape,
          cutlass::epilogue::collective::EpilogueTileAuto,
          ElementAccumulator,
          ElementCompute,
          ElementC,
          LayoutC,
          AlignmentC,
          ElementD,
          LayoutD,
          AlignmentD,
          EpilogueSchedule,
          EVTCompute>::CollectiveOp;

  // Mainloop stage count is chosen automatically after carving out the
  // shared memory consumed by the epilogue.
  using CollectiveMainloop =
      typename cutlass::gemm::collective::CollectiveBuilder<
          cutlass::arch::Sm120,
          cutlass::arch::OpClassTensorOp,
          ElementAB,
          LayoutA,
          AlignmentA,
          ElementAB,
          LayoutB,
          AlignmentB,
          ElementAccumulator,
          TileShape,
          ClusterShape,
          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
              sizeof(typename CollectiveEpilogue::SharedStorage))>,
          KernelSchedule>::CollectiveOp;

  // Guarded so the device code is a no-op on every non-SM120 architecture.
  using GemmKernel = enable_sm120_only<
      cutlass::gemm::kernel::GemmUniversal<Shape<int, int, int, int>,
                                           CollectiveMainloop,
                                           CollectiveEpilogue,
                                           void>>;
};
189265

190266
} // namespace fastdeploy
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
// adapted from:
// https://github.com/vllm-project/vllm/blob/main/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu

#include "scaled_mm_kernels.hpp"
#include "scaled_mm_blockwise_sm100_fp8_dispatch.cuh"

namespace fastdeploy {

// Entry point for the SM100 blockwise-scaled FP8 GEMM. Selects the CUTLASS
// output element type from the dtype of `out` and forwards all tensors to
// the templated dispatcher. Only BFLOAT16 and FLOAT16 outputs are
// supported; any other dtype fails the PD_CHECK below.
void cutlass_scaled_mm_blockwise_sm100_fp8(paddle::Tensor &out,
                                           paddle::Tensor const &a,
                                           paddle::Tensor const &b,
                                           paddle::Tensor const &a_scales,
                                           paddle::Tensor const &b_scales) {
  if (out.dtype() == paddle::DataType::BFLOAT16) {
    cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::bfloat16_t>(
        out, a, b, a_scales, b_scales);
    return;
  }
  // Not bf16, so the output must be fp16.
  PD_CHECK(out.dtype() == paddle::DataType::FLOAT16);
  cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::half_t>(
      out, a, b, a_scales, b_scales);
}

} // namespace fastdeploy

0 commit comments

Comments
 (0)