@@ -724,7 +724,36 @@ TEST_F(NVFuserTest, FusionFrontendCastDoubleToHalf_CUDA) {
 
   std::vector<IValue> inputs = {t0, t1};
 
-  // Define fusion
+  Fusion fauto;
+  { // Do automatic scheduling on fauto
+    FusionGuard fg(&fauto);
+
+    auto tv0 = makeSymbolicTensor(2, DataType::Double);
+    auto tv1 = makeSymbolicTensor(2, DataType::Double);
+
+    fauto.addInput(tv0);
+    fauto.addInput(tv1);
+
+    auto tv2 = castOp(DataType::Half, tv0);
+    auto tv3 = castOp(DataType::Half, tv1);
+    // implicit casts
+    auto tv4 = castOp(DataType::Float, tv2);
+    auto tv5 = castOp(DataType::Float, tv3);
+    auto tv6 = add(tv4, tv5);
+    auto tv7 = relu(tv6);
+    auto tv8 = castOp(DataType::Half, tv7);
+
+    fauto.addOutput(tv8);
+
+    // Run automatic scheduler
+    auto pointwise_params = getPointwiseHeuristics(&fauto, inputs);
+    TORCH_CHECK(pointwise_params, "Pointwise schedule was not generated!");
+    schedulePointwise(&fauto, *pointwise_params);
+  }
+
+  // Re-define the fusion exactly for manual scheduling. This is necessary
+  // in order to capture the constructor calls inside each Fusion
+  // independently.
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -734,37 +763,64 @@ TEST_F(NVFuserTest, FusionFrontendCastDoubleToHalf_CUDA) {
   fusion.addInput(tv0);
   fusion.addInput(tv1);
 
-  auto tv0h = castOp(DataType::Half, tv0);
-  auto tv1h = castOp(DataType::Half, tv1);
-  auto tv0f = castOp(DataType::Float, tv0h);
-  auto tv1f = castOp(DataType::Float, tv1h);
-  auto tv2 = add(tv0f, tv1f);
-  auto tv3 = relu(tv2);
-  auto tv4 = castOp(DataType::Half, tv3);
+  auto tv2 = castOp(DataType::Half, tv0);
+  auto tv3 = castOp(DataType::Half, tv1);
+  // implicit casts
+  auto tv4 = castOp(DataType::Float, tv2);
+  auto tv5 = castOp(DataType::Float, tv3);
+  auto tv6 = add(tv4, tv5);
+  auto tv7 = relu(tv6);
+  auto tv8 = castOp(DataType::Half, tv7);
 
-  fusion.addOutput(tv4);
-
-  // Run automatic scheduler
-  auto fauto = Fusion(fusion); // unique_ptr to copy of fusion
-  auto pointwise_params = getPointwiseHeuristics(&fauto, inputs);
-  TORCH_CHECK(pointwise_params, "Pointwise schedule was not generated!");
-  schedulePointwise(&fauto, *pointwise_params);
+  fusion.addOutput(tv8);
 
   // Perform manual scheduling
-  tv4->merge(0, 1);
-  tv4->split(0, NamedScalar::getParallelDim(ParallelType::TIDx));
-  tv4->axis(0)->parallelize(ParallelType::BIDx);
-  tv4->axis(1)->parallelize(ParallelType::TIDx);
+
+  // Before schedulePointwise() is called, getPointwiseHeuristics() calls
+  // vectorize_helper::getExpandedVectorization(), which in turn calls:
+  //   vectorize_helper::getVectorizationSize
+  //   vectorize_helper::ProjectedExtent::getNumerator
+  //   vectorize_helper::ProjectedExtent::computeNumerDenomir
+  //   IrContainer::oneVal
+  // oneVal() creates an actual Val here to hold the denominator and
+  // initializes it to 1. Since this is reflected in the fusion log, I'm
+  // inserting it here even though it has no effect on the generated kernel.
+  fusion.oneVal();
+
+  tv0->cacheAfter(); // tv9
+  tv1->cacheAfter(); // tv10
+  auto tv11 = tv8->cacheBefore(); // tv11
+
+  tv8->merge(0, 1);
+  tv8->reorder({{0, -1}});
+  tv8->reorder({{-1, 0}});
+  tv8->split(0, 128);
+  tv8->split(0, 1);
+  tv8->split(0, 1);
+  tv8->axis(0)->parallelize(ParallelType::BIDx);
+  tv8->axis(1)->parallelize(ParallelType::Unswitch);
+  tv8->axis(3)->parallelize(ParallelType::TIDx);
 
   // propagate the mapping to other tensors
-  TransformPropagatorWithCheck propagator(tv4);
-  MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
-  scheduler_utils::parallelizeAllLike(
-      tv4, {tv0, tv1, tv0h, tv1h, tv0f, tv1f, tv2, tv3});
+  TransformPropagatorWithCheck propagator(tv8);
+  MaxRootDomainInfoSpanningTree(tv8).traverse(&propagator);
+  scheduler_utils::parallelizeAllLike(tv8);
 
-  inlineMost();
+  // The pointwise scheduler does not use inlineMost() as the reduction
+  // scheduler does; it uses inlineAllAt() then inlineMost(innermost_tensors).
+  inlineAllAt(tv8, 2, true);
+  inlineMost(
+      std::vector<TensorView*>({tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv11}));
 
-  compare_ir(fusion, fauto);
+  // Note that inlineAllAt() iterates through an unordered_set to do inlining,
+  // so it is not practical to match the fusion_debug log exactly when using
+  // the pointwise scheduler.
+  compare_ir_math(fusion, fauto);
+  compare_transforms(fusion, fauto);
+  // compare_fusion_debug(fusion, fauto);
+  compare_kernels(fusion, fauto);
+
+  // compare_ir(fusion, fauto);
 
   // Perform eager computation and verify
   auto t0h = t0.to(options.dtype(at::kHalf));