Fix how HostIrEvaluator detects pre-allocated outputs

wujingyue · wujingyue · commit 6111986aa1a2 · 2025-11-14T23:20:58.000-08:00
diff --git a/csrc/host_ir/evaluator.cpp b/csrc/host_ir/evaluator.cpp
@@ -482,27 +482,24 @@ void HostIrEvaluator::handle(MatmulOp* matmul) {
   TensorView* b = matmul->inB();
   TensorView* out = matmul->out();
 
-  if (expr_evaluator_.isKnown(out)) {
-    auto t_a = getKnownConcreteValue(a).as<at::Tensor>();
-    auto t_b = getKnownConcreteValue(b).as<at::Tensor>();
-    auto t_out = getKnownConcreteValue(out).as<at::Tensor>();
-    at::matmul_out(t_out, t_a, t_b);
-  } else {
+  if (!matmul->outputIsPreallocated()) {
     unhandled(matmul);
+    return;
   }
+
+  auto t_a = getKnownConcreteValue(a).as<at::Tensor>();
+  auto t_b = getKnownConcreteValue(b).as<at::Tensor>();
+  auto t_out = getKnownConcreteValue(out).as<at::Tensor>();
+  at::matmul_out(t_out, t_a, t_b);
 }
 
 void HostIrEvaluator::handle(LinearOp* linear) {
   auto* in = linear->inA()->as<TensorView>();
   auto* weight = linear->inB()->as<TensorView>();
   auto* out = linear->out()->as<TensorView>();
 
-  // FIXME: When LinearOp is called in a for loop, even if it's output is not
-  // pre-allocated, the second iteration will see isKnown true and skip the
-  // unhandled path.
-  if (!expr_evaluator_.isKnown(out)) {
-    unhandled(linear);
-    return;
+  if (!linear->outputIsPreallocated()) {
+    return unhandled(linear);
   }
 
   auto in_tensor = getKnownConcreteValue(in).as<at::Tensor>();
diff --git a/csrc/host_ir/lowering.cpp b/csrc/host_ir/lowering.cpp
@@ -115,7 +115,10 @@ const std::vector<IterDomain*>& findReferenceLoopDomain(
 
 Expr* cloneWithNewOperands(
     Expr* e,
-    const std::unordered_map<Val*, Val*>& replacement_map) {
+    const std::unordered_map<Val*, Val*>& replacement_map,
+    bool output_is_preallocated) {
+  NVF_ERROR(!e->outputIsPreallocated());
+
   auto maybe_replace = [&](Val*& x) -> bool {
     Val* new_x = getOrDefault(replacement_map, x);
     if (new_x == nullptr) {
@@ -133,10 +136,16 @@ Expr* cloneWithNewOperands(
   std::vector<Val*> new_outs = e->outputs();
   replaced += std::ranges::count_if(new_outs, maybe_replace);
 
-  if (replaced == 0) {
+  if (replaced == 0 && !output_is_preallocated) {
     return e;
   }
-  return e->newObjectFunc()(e->container(), new_ins, new_outs, e->attributes());
+
+  Expr* new_e =
+      e->newObjectFunc()(e->container(), new_ins, new_outs, e->attributes());
+  if (output_is_preallocated) {
+    new_e = new_e->withOutputPreallocated();
+  }
+  return new_e;
 }
 
 void lowerSegment(
@@ -204,7 +213,7 @@ void lowerSegment(
           innermost_scope.push_back(allocate);
         }
 
-        Expr* new_c = cloneWithNewOperands(c, replacement_map);
+        Expr* new_c = cloneWithNewOperands(c, replacement_map, true);
         innermost_scope.push_back(new_c);
 
         auto* wait = IrBuilder::create<hir::Wait>(new_c);
@@ -261,12 +270,14 @@ void lowerSegment(
           }
         }
 
+        bool output_is_preallocated = false;
         for (auto* out : ir_utils::filterByType<TensorView>(e->outputs())) {
           if (getShardedIterDomain(
                   out, ParallelType::Stream, DomainType::kAllocation) ==
               nullptr) {
             auto* allocate =
                 IrBuilder::create<kir::Allocate>(out, MemoryType::Global);
+            output_is_preallocated = true;
             innermost.parent_scope->insert(
                 innermost.parent_insertion_point, allocate);
             // Loop is stream parallelized but allocation is not. Therefore,
@@ -281,7 +292,8 @@ void lowerSegment(
           }
         }
 
-        Expr* new_e = cloneWithNewOperands(e, replacement_map);
+        Expr* new_e =
+            cloneWithNewOperands(e, replacement_map, output_is_preallocated);
         innermost_scope.push_back(new_e);
       }
       break;
diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp
@@ -475,6 +475,7 @@ std::list<Expr*> processForLoopBodies(
              ir_utils::filterByType<TensorView>(body_expr->outputs())) {
           processTensor(body_expr, output, tensor_index);
         }
+        body_expr = body_expr->withOutputPreallocated();
         new_loop_body.push_back(body_expr);
       }
     }
diff --git a/csrc/ir/base_nodes.cpp b/csrc/ir/base_nodes.cpp
@@ -253,6 +253,7 @@ std::optional<DataType> Val::getDataType() const {
 // after inputs and outputs are registered with the Expr
 Expr::Expr(IrBuilderPasskey passkey) : Statement(passkey) {}
 
+// FIXME: Should this constructor copy the output_is_preallocated_ flag?
 Expr::Expr(const Expr* src, IrCloner* ir_cloner)
     : Statement(src, ir_cloner),
       attributes_(ir_cloner->clone(src->attributes_)),
@@ -270,12 +271,13 @@ Expr::Expr(
       outputs_(std::move(outputs)) {}
 
 Expr* Expr::shallowCopy() const {
-  auto result =
+  Expr* result =
       newObjectFunc()(ir_container_, inputs(), outputs(), attributes());
   if (container()->isA<kir::Kernel>()) {
     result->predicate_ = predicate_;
     result->write_predicate_ = write_predicate_;
   }
+  result->output_is_preallocated_ = output_is_preallocated_; // keep on copy
   return result;
 }
 
@@ -383,6 +385,11 @@ Expr* Expr::withWritePredicate(kir::Predicate* predicate) {
   return result;
 }
 
+Expr* Expr::withOutputPreallocated() {
+  output_is_preallocated_ = true; // flags this Expr itself; no copy is made
+  return this; // same object, so the call can be chained after creation
+}
+
 std::vector<PolymorphicValue> Expr::evaluate(
     const ExpressionEvaluator& ee,
     const std::vector<PolymorphicValue>& inputs) const {
diff --git a/csrc/ir/base_nodes.h b/csrc/ir/base_nodes.h
@@ -599,6 +599,12 @@ class NVF_API Expr : public Statement {
   // TODO: Protect based on being in kernel container
   Expr* withWritePredicate(kir::Predicate* write_predicate);
 
+  bool outputIsPreallocated() const {
+    return output_is_preallocated_; // set by withOutputPreallocated()
+  }
+
+  Expr* withOutputPreallocated();
+
   // Get the name of an expression
   virtual const char* getOpString() const = 0;
 
@@ -660,6 +666,8 @@ class NVF_API Expr : public Statement {
 
   // Only used for reduction-related expressions
   kir::Predicate* write_predicate_ = nullptr;
+
+  bool output_is_preallocated_ = false;
 };
 
 template <typename T>
diff --git a/tests/cpp/test_host_ir_evaluator.cpp b/tests/cpp/test_host_ir_evaluator.cpp
@@ -159,7 +159,8 @@ TEST_F(HostIrEvaluatorTest, MatmulInLoop) {
 
     // By default, MatmulOp is computed by ExpressionEvaluator so it appears in
     // host IR.
-    auto* mm = IrBuilder::create<MatmulOp>(loop_out, in, loop_w);
+    auto* mm = IrBuilder::create<MatmulOp>(loop_out, in, loop_w)
+                   ->withOutputPreallocated();
     for_loop->body().push_back(mm);
 
     hic->pushBackTopLevelExprs(allocate_out);
diff --git a/tests/cpp/test_host_ir_stream_lowering.cpp b/tests/cpp/test_host_ir_stream_lowering.cpp
@@ -6,9 +6,6 @@
  */
 // clang-format on
 
-#include <algorithm>
-#include <iostream>
-
 #include <gmock/gmock-matchers.h>
 #include <gtest/gtest.h>
 
diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp
@@ -874,7 +874,8 @@ TEST_F(MatmulHostIrTest, HostIrMatmulOut) {
   TensorView* tv0 = makeContigTensor(3);
   TensorView* tv1 = makeContigTensor(3);
   TensorView* tv2 = makeContigTensor(3);
-  auto* matmul = IrBuilder::create<MatmulOp>(tv2, tv0, tv1);
+  auto* matmul =
+      IrBuilder::create<MatmulOp>(tv2, tv0, tv1)->withOutputPreallocated();
 
   hic->addInput(tv0);
   hic->addInput(tv1);
@@ -956,7 +957,8 @@ TEST_F(LinearHostIrTest, HostIrLinearOut) {
   TensorView* bias = makeContigTensor(1);
   TensorView* out = makeContigTensor(3);
 
-  auto linear_op = IrBuilder::create<LinearOp>(out, in, weight, bias);
+  auto* linear_op = IrBuilder::create<LinearOp>(out, in, weight, bias)
+                        ->withOutputPreallocated();
 
   hic->addInput(in);
   hic->addInput(weight);
diff --git a/tests/cpp/test_multidevice_overlap.cpp b/tests/cpp/test_multidevice_overlap.cpp
@@ -77,9 +77,9 @@ TEST_F(StreamTest, RowParallelLinear_Forward) {
   constexpr int64_t t = 6;
   static_assert(t % s == 0);
   at::Tensor in_tensor =
-      at::randn({t, h * 4}, tensor_options_.dtype(at::kBFloat16));
+      at::randint(-2, 3, {t, h * 4}, tensor_options_.dtype(at::kBFloat16));
   at::Tensor w_tensor =
-      at::randn({h, h * 4}, tensor_options_.dtype(at::kBFloat16));
+      at::randint(-2, 3, {h, h * 4}, tensor_options_.dtype(at::kBFloat16));
   at::Tensor out_tensor = at::linear(in_tensor, w_tensor);
 
   at::Tensor sharded_in_tensor = shardTensor(in_tensor, in);
@@ -91,7 +91,11 @@ TEST_F(StreamTest, RowParallelLinear_Forward) {
           .runFusionWithInputs({sharded_in_tensor, sharded_w_tensor})[0]
           .as<at::Tensor>();
 
-  EXPECT_TRUE(at::allclose(sharded_out_tensor, out_tensor));
+  EXPECT_TRUE(at::allclose(sharded_out_tensor, out_tensor))
+      << "sharded_out_tensor:" << std::endl
+      << sharded_out_tensor << std::endl
+      << " out_tensor:" << std::endl
+      << out_tensor;
 }
 
 } // namespace nvfuser

Original file line number	Diff line number	Diff line change
`@@ -475,6 +475,7 @@ std::list<Expr*> processForLoopBodies(`
`475`	`475`	`ir_utils::filterByType<TensorView>(body_expr->outputs())) {`
`476`	`476`	`processTensor(body_expr, output, tensor_index);`
`477`	`477`	`}`
	`478`	`+ body_expr = body_expr->withOutputPreallocated();`
`478`	`479`	`new_loop_body.push_back(body_expr);`
`479`	`480`	`}`
`480`	`481`	`}`