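Fix MultiDeviceExecutor: gate MatmulOp evaluation on outputIsPreallocated() and mark stream-lowered loop-body expressions as output-preallocated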

wujingyue · wujingyue · commit 82eb4cc9e017 · 2025-11-14T22:47:52.000-08:00
diff --git a/csrc/host_ir/evaluator.cpp b/csrc/host_ir/evaluator.cpp
@@ -482,14 +482,15 @@ void HostIrEvaluator::handle(MatmulOp* matmul) {
   TensorView* b = matmul->inB();
   TensorView* out = matmul->out();
 
-  if (expr_evaluator_.isKnown(out)) {
-    auto t_a = getKnownConcreteValue(a).as<at::Tensor>();
-    auto t_b = getKnownConcreteValue(b).as<at::Tensor>();
-    auto t_out = getKnownConcreteValue(out).as<at::Tensor>();
-    at::matmul_out(t_out, t_a, t_b);
-  } else {
+  if (!matmul->outputIsPreallocated()) {
     unhandled(matmul);
+    return;
   }
+
+  auto t_a = getKnownConcreteValue(a).as<at::Tensor>();
+  auto t_b = getKnownConcreteValue(b).as<at::Tensor>();
+  auto t_out = getKnownConcreteValue(out).as<at::Tensor>();
+  at::matmul_out(t_out, t_a, t_b);
 }
 
 void HostIrEvaluator::handle(LinearOp* linear) {
@@ -498,6 +499,8 @@ void HostIrEvaluator::handle(LinearOp* linear) {
   auto* out = linear->out()->as<TensorView>();
 
   // FIXME: this breaks MultiDeviceExecutor.
+  std::cout << "linear->outputIsPreallocated(): "
+            << linear->outputIsPreallocated() << std::endl;
   if (!linear->outputIsPreallocated()) {
     return unhandled(linear);
   }
diff --git a/csrc/host_ir/pass/stream_parallel_type.cpp b/csrc/host_ir/pass/stream_parallel_type.cpp
@@ -475,6 +475,7 @@ std::list<Expr*> processForLoopBodies(
              ir_utils::filterByType<TensorView>(body_expr->outputs())) {
           processTensor(body_expr, output, tensor_index);
         }
+        body_expr = body_expr->withOutputPreallocated();
         new_loop_body.push_back(body_expr);
       }
     }
diff --git a/tests/cpp/test_host_ir_evaluator.cpp b/tests/cpp/test_host_ir_evaluator.cpp
@@ -159,7 +159,8 @@ TEST_F(HostIrEvaluatorTest, MatmulInLoop) {
 
     // By default, MatmulOp is computed by ExpressionEvaluator so it appears in
     // host IR.
-    auto* mm = IrBuilder::create<MatmulOp>(loop_out, in, loop_w);
+    auto* mm = IrBuilder::create<MatmulOp>(loop_out, in, loop_w)
+                   ->withOutputPreallocated();
     for_loop->body().push_back(mm);
 
     hic->pushBackTopLevelExprs(allocate_out);
diff --git a/tests/cpp/test_host_ir_stream_lowering.cpp b/tests/cpp/test_host_ir_stream_lowering.cpp
@@ -6,9 +6,6 @@
  */
 // clang-format on
 
-#include <algorithm>
-#include <iostream>
-
 #include <gmock/gmock-matchers.h>
 #include <gtest/gtest.h>
 
diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp
@@ -874,7 +874,8 @@ TEST_F(MatmulHostIrTest, HostIrMatmulOut) {
   TensorView* tv0 = makeContigTensor(3);
   TensorView* tv1 = makeContigTensor(3);
   TensorView* tv2 = makeContigTensor(3);
-  auto* matmul = IrBuilder::create<MatmulOp>(tv2, tv0, tv1);
+  auto* matmul =
+      IrBuilder::create<MatmulOp>(tv2, tv0, tv1)->withOutputPreallocated();
 
   hic->addInput(tv0);
   hic->addInput(tv1);
@@ -956,7 +957,8 @@ TEST_F(LinearHostIrTest, HostIrLinearOut) {
   TensorView* bias = makeContigTensor(1);
   TensorView* out = makeContigTensor(3);
 
-  auto linear_op = IrBuilder::create<LinearOp>(out, in, weight, bias);
+  auto* linear_op = IrBuilder::create<LinearOp>(out, in, weight, bias)
+                        ->withOutputPreallocated();
 
   hic->addInput(in);
   hic->addInput(weight);

Original file line number	Diff line number	Diff line change
`@@ -475,6 +475,7 @@ std::list<Expr*> processForLoopBodies(`
`475`	`475`	`ir_utils::filterByType<TensorView>(body_expr->outputs())) {`
`476`	`476`	`processTensor(body_expr, output, tensor_index);`
`477`	`477`	`}`
	`478`	`+ body_expr = body_expr->withOutputPreallocated();`
`478`	`479`	`new_loop_body.push_back(body_expr);`
`479`	`480`	`}`
`480`	`481`	`}`