Commit 6ebb20b
[CINN][New Hardware Update] replace DefaultNVGPUTarget
* replace DefaultNVGPUTarget with CurrentTarget
1 parent 3a4b1b7 commit 6ebb20b

File tree: 11 files changed (+30 -20 lines)
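Every hunk below applies the same one-line substitution. As a hedged before/after sketch (the wrapper functions are illustrative; the two headers and the two calls are taken verbatim from the hunks):

#include "paddle/cinn/common/target.h"  // cinn::common::DefaultNVGPUTarget()
#include "paddle/cinn/runtime/flags.h"  // cinn::runtime::CurrentTarget

// Before: call sites were hard-wired to the default NVIDIA GPU target.
cinn::common::Target TargetBefore() {
  return cinn::common::DefaultNVGPUTarget();
}

// After: call sites ask the runtime which target is currently active,
// so a new hardware backend can be selected without editing each pass.
cinn::common::Target TargetAfter() {
  return cinn::runtime::CurrentTarget::GetCurrentTarget();
}

The two files under hlir/framework additionally drop the global lookup entirely in favor of a target already passed in as a parameter (see those hunks below).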

paddle/cinn/backends/codegen_device_util.h

Lines changed: 4 additions & 2 deletions
@@ -25,6 +25,7 @@
 #include "paddle/cinn/ir/ir.h"
 #include "paddle/cinn/ir/ir_mutator.h"
 #include "paddle/cinn/ir/utils/ir_copy.h"
+#include "paddle/cinn/runtime/flags.h"

 namespace cinn {
 namespace backends {
@@ -51,8 +52,9 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
   explicit CollectHostFunctionVisitor(const std::string& module_name)
       : host_module_builder(module_name + "_host",
                             cinn::common::DefaultHostTarget()),
-        device_module_builder(module_name + "_gpu_device",
-                              cinn::common::DefaultNVGPUTarget()) {}
+        device_module_builder(
+            module_name + "_gpu_device",
+            cinn::runtime::CurrentTarget::GetCurrentTarget()) {}

   std::tuple<ir::Module, ir::Module> operator()(Expr* expr) {
     ir::IRMutator<>::Visit(expr, expr);

paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass_utils.h

Lines changed: 2 additions & 1 deletion
@@ -16,6 +16,7 @@

 #include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_group.h"
 #include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_util.h"
+#include "paddle/cinn/runtime/flags.h"

 namespace cinn {
 namespace dialect {
@@ -140,7 +141,7 @@ static int GetSharedSize(const cinn::dialect::ir::OpNode& op_node) {
     lane = inshape[idx];
   }
   // int max_num_threads =
-  //     cinn::common::DefaultNVGPUTarget().max_num_threads();
+  //     cinn::runtime::CurrentTarget::GetCurrentTarget().max_num_threads();
   int max_num_threads = 1000;
   if (lane > max_num_threads / 2) {
     return 0;
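Note that in this helper the device query stays commented out and the hard-coded cap of 1000 remains; only the comment's wording changes. A hypothetical sketch of what enabling that comment would look like (not part of this commit):

  // Hypothetical: replace the magic constant with the active target's limit,
  // following the commented-out line in the hunk above.
  int max_num_threads =
      cinn::runtime::CurrentTarget::GetCurrentTarget().max_num_threads();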

paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_pass.cc

Lines changed: 3 additions & 2 deletions
@@ -22,6 +22,7 @@
 #include <unordered_set>
 #include <vector>

+#include "paddle/cinn/runtime/flags.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/pir/include/core/builtin_attribute.h"
 #include "paddle/pir/include/core/ir_printer.h"
@@ -197,8 +198,8 @@ int GetSharedSize(::pir::Operation* op) {
     lane = inshape[idx];
   }
   // int max_num_threads =
-  //     cinn::common::DefaultNVGPUTarget().max_num_threads(); todo(phlrain): get
-  // gpu max threads
+  //     cinn::runtime::CurrentTarget::GetCurrentTarget().max_num_threads();
+  // todo(phlrain): get gpu max threads
   int max_num_threads = 2048;
   if (lane > max_num_threads / 2) {
     return 0;

paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/pre_analysis.cc

Lines changed: 2 additions & 1 deletion
@@ -16,6 +16,7 @@
 #include "paddle/cinn/hlir/dialect/operator/ir/manual_op.h"
 #include "paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.h"
 #include "paddle/cinn/hlir/framework/pir_compiler.h"
+#include "paddle/cinn/runtime/flags.h"
 #include "paddle/common/flags.h"

 PD_DECLARE_bool(enable_cinn_compile_cache);
@@ -56,7 +57,7 @@ void FusionOpAnalysis::PreCompileGroup() {
   }
   // Build and trigger compilaion cache.
   VLOG(4) << "Parallel Pre-Compile for Group with size: " << groups.size();
-  PirCompiler pir_compiler(cinn::common::DefaultNVGPUTarget());
+  PirCompiler pir_compiler(cinn::runtime::CurrentTarget::GetCurrentTarget());
   pir_compiler.Build(groups);
 }
 }  // namespace cinn::dialect::ir::details

paddle/cinn/hlir/dialect/operator/transforms/lowering_pass/utils.cc

Lines changed: 3 additions & 2 deletions
@@ -61,7 +61,7 @@ std::vector<pir::Value> GetBlockOutsideInput(
 std::unordered_map<OpLoweringGroupPtr,
                    std::unordered_map<std::string, pir::Attribute>>
 CompileGroupAsOpAttribute(const std::vector<OpLoweringGroupPtr>& group_list) {
-  PirCompiler pir_compiler(cinn::common::DefaultNVGPUTarget());
+  PirCompiler pir_compiler(cinn::runtime::CurrentTarget::GetCurrentTarget());
   auto fn_ptr_res = pir_compiler.Build(group_list);

   std::unordered_map<OpLoweringGroupPtr,
@@ -85,7 +85,8 @@ std::unordered_map<std::string, ::pir::Attribute> GetJitKernelAttr(
     hlir::framework::pir::FusionInfo fusion_info(*group);
     return CompilationCache::Instance().GetKernelInfo(fusion_info);
   } else {
-    PirCompiler pir_compiler(cinn::common::DefaultNVGPUTarget());
+    PirCompiler pir_compiler(
+        cinn::runtime::CurrentTarget::GetCurrentTarget());
     return pir_compiler.Build({group})[0];
   }
 };

paddle/cinn/hlir/framework/op_lowering_util.cc

Lines changed: 1 addition & 1 deletion
@@ -717,7 +717,7 @@ void LoopAssignReduceWithLast(ir::IRSchedule& ir_sch,  // NOLINT
   // If the number of current device SM is smaller than the number of SM
   // required by Warp Reduce, the performance of Warp Reduce is better.
   // Otherwise, use Block Reduce.
-  auto max_num_threads = cinn::common::DefaultNVGPUTarget().max_num_threads();
+  auto max_num_threads = target.max_num_threads();
   int need_reduce_last_count = 1;
   for (int i = 0; i < inshape.size(); i++) {
     if (find(axes.begin(), axes.end(), i) == axes.end()) {

paddle/cinn/hlir/framework/pir/op_lowering_util.cc

Lines changed: 1 addition & 1 deletion
@@ -577,7 +577,7 @@ void LoopAssignReduceWithLast(ir::IRSchedule& ir_sch,  // NOLINT
   // If the number of current device SM is smaller than the number of SM
   // required by Warp Reduce, the performance of Warp Reduce is better.
   // Otherwise, use Block Reduce.
-  auto max_num_threads = cinn::common::DefaultNVGPUTarget().max_num_threads();
+  auto max_num_threads = target.max_num_threads();
   int need_reduce_last_count = 1;
   for (int i = 0; i < inshape.size(); i++) {
     if (find(axes.begin(), axes.end(), i) == axes.end()) {
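In both op_lowering_util.cc variants the replacement is simpler than elsewhere: LoopAssignReduceWithLast already receives the target as a parameter, so the global query is dropped in favor of that argument. A minimal sketch of the pattern (the wrapper is illustrative; only target.max_num_threads() comes from the hunks):

#include "paddle/cinn/common/target.h"

// Query the thread cap from the target handed in by the caller rather
// than from a process-wide default.
int MaxThreadsFor(const cinn::common::Target& target) {
  return target.max_num_threads();
}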

paddle/cinn/hlir/op/reduction.cc

Lines changed: 2 additions & 2 deletions
@@ -263,7 +263,7 @@ std::shared_ptr<OpStrategy> StrategyForReduce(
                            reduce_tmp_out.as_tensor_ref(),
                            tmp_out.as_tensor_ref(),
                            out.as_tensor_ref(),
-                           cinn::common::DefaultNVGPUTarget());
+                           cinn::runtime::CurrentTarget::GetCurrentTarget());

       std::vector<CINNValue> res{
           CINNValue(ir_sch.GetModule().GetExprs().at(0))};
@@ -279,7 +279,7 @@ std::shared_ptr<OpStrategy> StrategyForReduce(
                            reduce_tmp_out.as_tensor_ref(),
                            tmp_out.as_tensor_ref(),
                            out.as_tensor_ref(),
-                           cinn::common::DefaultNVGPUTarget());
+                           cinn::runtime::CurrentTarget::GetCurrentTarget());

       std::vector<CINNValue> res{
           CINNValue(ir_sch.GetModule().GetExprs().at(0))};

paddle/cinn/hlir/pe/reduction.cc

Lines changed: 7 additions & 5 deletions
@@ -27,6 +27,7 @@
 #include "paddle/cinn/ir/tensor.h"
 #include "paddle/cinn/lang/builtin.h"
 #include "paddle/cinn/lang/compute.h"
+#include "paddle/cinn/runtime/flags.h"
 #include "paddle/cinn/utils/string.h"

 namespace cinn {
@@ -841,7 +842,8 @@ std::vector<ir::Tensor> TwoStepBlockReduceInternal(
   // If the number of current device SM is smaller than the number of SM
   // required by Warp Reduce, the performance of Warp Reduce is better.
   // Otherwise, use Block Reduce.
-  auto max_num_threads = cinn::common::DefaultNVGPUTarget().max_num_threads();
+  auto max_num_threads =
+      cinn::runtime::CurrentTarget::GetCurrentTarget().max_num_threads();
   int need_reduce_last_count = 1;
   for (int i = 0; i < A->shape.size(); i++) {
     if (find(axes.begin(), axes.end(), i) == axes.end()) {
@@ -850,11 +852,11 @@
   }
   int warp_reduce_need_sm_count =
       ceil((need_reduce_last_count * 32) /
-           static_cast<float>(
-               cinn::common::DefaultNVGPUTarget().get_max_threads_per_sm()));
+           static_cast<float>(cinn::runtime::CurrentTarget::GetCurrentTarget()
+                                  .get_max_threads_per_sm()));
   // Set Num_max_threads to 32 is Warp Reduce
-  if (cinn::common::DefaultNVGPUTarget().get_multi_processor_count() <
-      warp_reduce_need_sm_count) {
+  if (cinn::runtime::CurrentTarget::GetCurrentTarget()
+          .get_multi_processor_count() < warp_reduce_need_sm_count) {
     max_num_threads = 32;
   }

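The arithmetic this hunk touches is the warp-vs-block reduce decision: the reduction wants one 32-lane warp per need_reduce_last_count, i.e. ceil(32 * need_reduce_last_count / max_threads_per_sm) SMs, and when the device has fewer SMs than that, the thread cap drops to a single warp. A standalone restatement of the check (variable names follow the hunk; the function wrapper is illustrative):

#include <cmath>

// Returns the thread cap after the warp-vs-block reduce decision. The three
// device limits would come from the current target, as in the hunk above.
int WarpOrBlockThreadCap(int need_reduce_last_count,
                         int max_num_threads,
                         int max_threads_per_sm,
                         int multi_processor_count) {
  int warp_reduce_need_sm_count = std::ceil(
      (need_reduce_last_count * 32) / static_cast<float>(max_threads_per_sm));
  // Fewer SMs than Warp Reduce needs: cap at one warp (32 threads).
  if (multi_processor_count < warp_reduce_need_sm_count) {
    return 32;
  }
  return max_num_threads;
}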

paddle/cinn/ir/group_schedule/tactic/tile_tactic.cc

Lines changed: 2 additions & 1 deletion
@@ -15,6 +15,7 @@
 #include "paddle/cinn/ir/group_schedule/tactic/tile_tactic.h"
 #include "paddle/cinn/common/target.h"
 #include "paddle/cinn/ir/ir.h"
+#include "paddle/cinn/runtime/flags.h"

 namespace cinn {
 namespace ir {
@@ -46,7 +47,7 @@ void TileTactic::Init(ScheduleContext* context) {
   };
   auto GetTreeReduceSize = [&](const ir::Expr& total_rb_extent) -> int64_t {
     const int64_t max_num_threads =
-        common::DefaultNVGPUTarget().max_num_threads();
+        cinn::runtime::CurrentTarget::GetCurrentTarget().max_num_threads();
     int64_t nums_thread_per_block = max_num_threads;
     if (total_rb_extent.is_constant()) {
       int64_t extent = static_cast<int64_t>(total_rb_extent.get_constant());
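The tail of GetTreeReduceSize is truncated in this hunk. A hedged sketch of the clamp it begins (only the max_num_threads seed is taken from the diff; the power-of-two shrink for small constant extents is an assumption):

#include <cstdint>

int64_t GetTreeReduceSizeSketch(int64_t extent, int64_t max_num_threads) {
  // Seed from the current target's thread cap, as in the hunk.
  int64_t nums_thread_per_block = max_num_threads;
  // Assumption: a small constant extent shrinks the block to the next
  // power of two that covers it.
  int64_t threads = 1;
  while (threads < extent) threads <<= 1;
  if (threads < nums_thread_per_block) nums_thread_per_block = threads;
  return nums_thread_per_block;
}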
