Commit eddca9e

committed: WIP
1 parent 5230260 commit eddca9e

File tree: 5 files changed, +43 -15 lines changed

  csrc/host_ir/evaluator.cpp
  csrc/host_ir/lowering.cpp
  csrc/ir/base_nodes.cpp
  csrc/ir/base_nodes.h
  tests/cpp/test_multidevice_overlap.cpp

csrc/host_ir/evaluator.cpp

Lines changed: 3 additions & 6 deletions

@@ -497,12 +497,9 @@ void HostIrEvaluator::handle(LinearOp* linear) {
   auto* weight = linear->inB()->as<TensorView>();
   auto* out = linear->out()->as<TensorView>();
 
-  // FIXME: When LinearOp is called in a for loop, even if its output is not
-  // pre-allocated, the second iteration will see isKnown true and skip the
-  // unhandled path.
-  if (!expr_evaluator_.isKnown(out)) {
-    unhandled(linear);
-    return;
+  // FIXME: this breaks MultiDeviceExecutor.
+  if (!linear->outputIsPreallocated()) {
+    return unhandled(linear);
   }
 
   auto in_tensor = getKnownConcreteValue(in).as<at::Tensor>();
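
Note on the replaced check: expr_evaluator_.isKnown(out) is stateful, so once a first loop iteration materializes the output through the unhandled fallback, every later iteration sees isKnown == true and wrongly takes the preallocated fast path (this is what the removed FIXME described). outputIsPreallocated() is a property of the IR node itself and answers the same way on every iteration. A standalone sketch of the pitfall, using hypothetical stand-ins (known for the evaluator's bindings, preallocated for the new per-Expr flag), not the actual host IR types:

#include <iostream>
#include <unordered_set>

int main() {
  std::unordered_set<int> known; // stand-in for the evaluator's bindings (stateful!)
  const bool preallocated = false; // stand-in for the per-Expr flag (static)
  const int out = 42; // stand-in for the output TensorView

  for (int i = 0; i < 2; ++i) {
    // Old check: depends on evaluator state, flips after iteration 0.
    const bool old_fast_path = known.count(out) > 0;
    // New check: a static property of the expression, stable across iterations.
    const bool new_fast_path = preallocated;
    std::cout << "iter " << i << ": old=" << old_fast_path
              << " new=" << new_fast_path << '\n';
    known.insert(out); // the fallback path binds the output as a side effect
  }
  return 0;
}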

csrc/host_ir/lowering.cpp

Lines changed: 17 additions & 5 deletions

@@ -129,7 +129,10 @@ const std::vector<IterDomain*>& findReferenceLoopDomain(
 
 Expr* cloneWithNewOperands(
     Expr* e,
-    const std::unordered_map<Val*, Val*>& replacement_map) {
+    const std::unordered_map<Val*, Val*>& replacement_map,
+    bool output_is_preallocated) {
+  NVF_ERROR(!e->outputIsPreallocated());
+
   auto maybe_replace = [&](Val*& x) -> bool {
     Val* new_x = getOrDefault(replacement_map, x);
     if (new_x == nullptr) {
@@ -147,10 +150,16 @@ Expr* cloneWithNewOperands(
   std::vector<Val*> new_outs = e->outputs();
   replaced += std::ranges::count_if(new_outs, maybe_replace);
 
-  if (replaced == 0) {
+  if (replaced == 0 && !output_is_preallocated) {
     return e;
   }
-  return e->newObjectFunc()(e->container(), new_ins, new_outs, e->attributes());
+
+  Expr* new_e =
+      e->newObjectFunc()(e->container(), new_ins, new_outs, e->attributes());
+  if (output_is_preallocated) {
+    new_e = new_e->withOutputPreallocated();
+  }
+  return new_e;
 }
 
 void lowerSegment(
@@ -213,7 +222,7 @@ void lowerSegment(
         innermost_scope.push_back(allocate);
       }
 
-      Expr* new_c = cloneWithNewOperands(c, replacement_map);
+      Expr* new_c = cloneWithNewOperands(c, replacement_map, true);
       innermost_scope.push_back(new_c);
 
       auto* wait = IrBuilder::create<hir::Wait>(new_c);
@@ -267,10 +276,12 @@ void lowerSegment(
         }
       }
 
+      bool output_is_preallocated = false;
      for (auto* out : ir_utils::filterByType<TensorView>(e->outputs())) {
        if (getShardedIterDomain(out, ParallelType::Stream) == nullptr) {
          auto* allocate =
              IrBuilder::create<kir::Allocate>(out, MemoryType::Global);
+          output_is_preallocated = true;
          innermost.parent_scope->insert(
              innermost.parent_insertion_point, allocate);
          // Loop is stream parallelized but allocation is not. Therefore,
@@ -285,7 +296,8 @@ void lowerSegment(
        }
      }
 
-      Expr* new_e = cloneWithNewOperands(e, replacement_map);
+      Expr* new_e =
+          cloneWithNewOperands(e, replacement_map, output_is_preallocated);
      innermost_scope.push_back(new_e);
    }
    break;
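
A subtlety in cloneWithNewOperands: even when no operand is replaced, a preallocated output now forces a fresh Expr, because withOutputPreallocated() mutates the node it is called on and the original must stay unmarked (which is what the NVF_ERROR at the top asserts). A minimal sketch of the resulting contract, assuming an Expr* e none of whose operands appear in replacement_map:

// Sketch only; `e` and `replacement_map` come from the surrounding lowering code.
Expr* same = cloneWithNewOperands(e, replacement_map, /*output_is_preallocated=*/false);
// same == e: nothing replaced and no flag to set, so no clone is made.

Expr* marked = cloneWithNewOperands(e, replacement_map, /*output_is_preallocated=*/true);
// marked != e: a new Expr is built solely to carry the flag, so `e` itself
// stays unmarked and can be lowered again later.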

csrc/ir/base_nodes.cpp

Lines changed: 8 additions & 1 deletion

@@ -253,6 +253,7 @@ std::optional<DataType> Val::getDataType() const {
 // after inputs and outputs are registered with the Expr
 Expr::Expr(IrBuilderPasskey passkey) : Statement(passkey) {}
 
+// FIXME: Should this constructor copy the output_is_preallocated_ flag?
 Expr::Expr(const Expr* src, IrCloner* ir_cloner)
     : Statement(src, ir_cloner),
       attributes_(ir_cloner->clone(src->attributes_)),
@@ -270,12 +271,13 @@ Expr::Expr(
       outputs_(std::move(outputs)) {}
 
 Expr* Expr::shallowCopy() const {
-  auto result =
+  Expr* result =
       newObjectFunc()(ir_container_, inputs(), outputs(), attributes());
   if (container()->isA<kir::Kernel>()) {
     result->predicate_ = predicate_;
     result->write_predicate_ = write_predicate_;
   }
+  result->output_is_preallocated_ = output_is_preallocated_;
   return result;
 }
 
@@ -383,6 +385,11 @@ Expr* Expr::withWritePredicate(kir::Predicate* predicate) {
   return result;
 }
 
+Expr* Expr::withOutputPreallocated() {
+  output_is_preallocated_ = true;
+  return this;
+}
+
 std::vector<PolymorphicValue> Expr::evaluate(
     const ExpressionEvaluator& ee,
     const std::vector<PolymorphicValue>& inputs) const {
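
Two propagation rules fall out of this file: shallowCopy() now carries the flag across copies, while the IrCloner constructor does not yet (hence the new FIXME). Note also that, unlike the with*Predicate helpers just above it, withOutputPreallocated() mutates in place and returns this rather than a copy. A hedged usage sketch (the placeholder expression is illustrative, not from the diff):

Expr* e = /* any Expr, e.g. the clone built in cloneWithNewOperands */;
Expr* marked = e->withOutputPreallocated();
// marked == e: the flag is set on the receiver itself; no copy is made.

Expr* copy = marked->shallowCopy();
// copy->outputIsPreallocated() == true: shallowCopy() propagates the flag
// unconditionally, even outside kir::Kernel containers.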

csrc/ir/base_nodes.h

Lines changed: 8 additions & 0 deletions

@@ -599,6 +599,12 @@ class NVF_API Expr : public Statement {
   // TODO: Protect based on being in kernel container
   Expr* withWritePredicate(kir::Predicate* write_predicate);
 
+  bool outputIsPreallocated() const {
+    return output_is_preallocated_;
+  }
+
+  Expr* withOutputPreallocated();
+
   // Get the name of an expression
   virtual const char* getOpString() const = 0;
 
@@ -660,6 +666,8 @@ class NVF_API Expr : public Statement {
 
   // Only used for reduction-related expressions
   kir::Predicate* write_predicate_ = nullptr;
+
+  bool output_is_preallocated_ = false;
 };
 
 template <typename T>

tests/cpp/test_multidevice_overlap.cpp

Lines changed: 7 additions & 3 deletions

@@ -77,9 +77,9 @@ TEST_F(StreamTest, RowParallelLinear_Forward) {
   constexpr int64_t t = 6;
   static_assert(t % s == 0);
   at::Tensor in_tensor =
-      at::randn({t, h * 4}, tensor_options_.dtype(at::kBFloat16));
+      at::randint(-2, 3, {t, h * 4}, tensor_options_.dtype(at::kBFloat16));
   at::Tensor w_tensor =
-      at::randn({h, h * 4}, tensor_options_.dtype(at::kBFloat16));
+      at::randint(-2, 3, {h, h * 4}, tensor_options_.dtype(at::kBFloat16));
   at::Tensor out_tensor = at::linear(in_tensor, w_tensor);
 
   at::Tensor sharded_in_tensor = shardTensor(in_tensor, in);
@@ -91,7 +91,11 @@ TEST_F(StreamTest, RowParallelLinear_Forward) {
           .runFusionWithInputs({sharded_in_tensor, sharded_w_tensor})[0]
           .as<at::Tensor>();
 
-  EXPECT_TRUE(at::allclose(sharded_out_tensor, out_tensor));
+  EXPECT_TRUE(at::allclose(sharded_out_tensor, out_tensor))
+      << "sharded_out_tensor:" << std::endl
+      << sharded_out_tensor << std::endl
+      << " out_tensor:" << std::endl
+      << out_tensor;
 }
 
 } // namespace nvfuser
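
On the switch from at::randn to at::randint(-2, 3, ...): bfloat16 has only 8 explicit mantissa bits, so a matmul over random normals rounds differently depending on how the reduction is split, which makes at::allclose flaky for stream-parallel schedules. Products and sums of small integers stay exactly representable, so the comparison becomes deterministic. A standalone ATen sketch of that idea (shapes are arbitrary, not the test's):

#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::manual_seed(0);
  // Integer values in [-2, 2] are exact in bf16; so are their products and
  // all partial sums below (bounded by 128, well under 2^9 = 512).
  at::Tensor a = at::randint(-2, 3, {6, 32}, at::dtype(at::kBFloat16));
  at::Tensor w = at::randint(-2, 3, {8, 32}, at::dtype(at::kBFloat16));

  // Full reduction vs. the same reduction split into two chunks, mimicking
  // how a row-parallel linear splits the reduced dimension across devices.
  at::Tensor full = at::linear(a, w);
  at::Tensor split = at::linear(a.narrow(1, 0, 16), w.narrow(1, 0, 16)) +
      at::linear(a.narrow(1, 16, 16), w.narrow(1, 16, 16));

  std::cout << at::equal(full, split) << std::endl; // prints 1: bitwise equal
}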
