csrc/ops/alias.cpp (6 changes: 1 addition & 5 deletions)
@@ -1023,11 +1023,7 @@ TensorView* broadcast(
       nBCastDims - n_broadcasts);
 
   if (n_broadcasts == 0) {
-    auto identity = set(inp);
-    NVF_ERROR(
-        identity->getValType().value() == ValType::TensorView,
-        "Expected identity op, but didn't get a TensorView back.");
-    return identity->as<TensorView>();
+    return inp;
   }
 
   std::vector<IterDomain*> out_domain;

Collaborator (inline comment on the added `return inp;`): The change seems reasonable to me.
tests/cpp/test_alias.cpp (39 changes: 39 additions & 0 deletions)
@@ -14,6 +14,7 @@
 #include <alias_analysis.h>
 #include <fusion.h>
 #include <fusion_profiler.h>
+#include <ir/internal_nodes.h>
 #include <ir/iostream.h>
 #include <ir/utils.h>
 #include <ops/alias.h>
@@ -1659,4 +1660,42 @@ TEST_F(AliasTest, SliceOfExpandedBroadcast) {
       executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
 }

+TEST_F(AliasTest, BroadcastInDimNoRedundantSet) {
+  // Test that broadcast with no actual broadcasting does not introduce
+  // a redundant Set operation.
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  TensorView* in = makeContigConcreteTensor({2, 3});
+  fusion->addInput(in);
+
+  // Call broadcast with all dims marked as non-broadcast. This should not
+  // introduce a Set operation and should return the input directly.
+  std::vector<bool> is_broadcast_dim = {false, false};
+  TensorView* maybe_bcast = broadcast(in, is_broadcast_dim);
+
+  // Add an operation to ensure we have something to test.
+  TensorView* out = abs(maybe_bcast);
+
+  fusion->addOutput(out);
+
+  // Verify that no LoadStoreOp with type Set is in the fusion.
+  auto exprs = fusion->exprs();
+  for (auto expr : exprs) {
+    if (auto load_store = dynamic_cast<LoadStoreOp*>(expr)) {
+      EXPECT_NE(load_store->opType(), LoadStoreOpType::Set)
+          << "Unexpected Set operation found in fusion with no-op broadcast";
+    }
+  }
+
+  // Verify the fusion still works correctly.
+  FusionExecutorCache executor_cache(std::move(fusion));
+  at::Tensor in_tensor =
+      at::randn({2, 3}, at::dtype(at::kFloat).device(at::kCUDA));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+}
+
 } // namespace nvfuser
tests/python/test_python_frontend.py (44 changes: 44 additions & 0 deletions)
@@ -667,6 +667,50 @@ def fusion_func_3(fd: FusionDefinition):
         )
         self.assertEqual(eager_out, nvf_out[0])

+    def test_broadcast_in_dim_no_redundant_set(self):
+        """
+        Test that broadcast_in_dim doesn't introduce redundant Set operations
+        when all input dimensions are mapped in broadcast_dims (i.e., no new
+        broadcast dimensions are created).
+
+        This verifies the fix for the issue where broadcast_in_dim would
+        create a redundant float-to-float cast operation via Set when no new
+        broadcast dimensions were needed.
+        """
+        inputs = [
+            torch.ones(1, 4, device="cuda"),
+            torch.randn(2, 4, device="cuda"),
+        ]
+
+        def fusion_with_broadcast_in_dim(fd: FusionDefinition):
+            t0 = fd.define_tensor(shape=[1, -1], contiguity=[None, True])
+            t1 = fd.define_tensor(shape=[-1, -1], contiguity=[True, True])
+            # broadcast_in_dim with broadcast_dims=[0, 1] means no new dims are added
Copilot AI (Nov 12, 2025), inline comment: [nitpick] The comment states "broadcast_in_dim with broadcast_dims=[0, 1] means no new dims are added" but could be more precise. While it is technically correct that no new dimensions are added to the tensor (the input is 2D and the output is 2D), the comment could be clearer. Consider rewording to: "broadcast_in_dim with all input dims in broadcast_dims means no broadcasting operation occurs" or "broadcast_dims=[0, 1] for a 2D input maps all input dimensions, so no new broadcast dimensions are created".

Suggested change:
-            # broadcast_in_dim with broadcast_dims=[0, 1] means no new dims are added
+            # broadcast_in_dim with all input dims in broadcast_dims means no broadcasting operation occurs

Collaborator: The review comment is not correct; a broadcasting operation does occur here, and it involves expanding the 1-sized dimension.
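To make the collaborator's point concrete, here is a minimal PyTorch sketch (illustration only, not part of the PR; shapes are borrowed from the test above, and plain torch.expand stands in for the expansion the fusion performs):

import torch

# Shapes from the test: t0 is (1, 4) and is added to a (2, 4) tensor.
t0 = torch.ones(1, 4)

# broadcast_dims=[0, 1] maps both existing input dims onto the output, so no
# *new* broadcast dimension is inserted (input and output are both 2D), but
# the size-1 dim 0 is still expanded from 1 to 2: a broadcast does occur.
t2 = t0.expand(2, 4)
assert t2.shape == (2, 4)

# Contrast: a 1D input of shape (4,) with broadcast_dims=[1] would first gain
# a new broadcast dimension at position 0 and only then be expanded.
t3 = torch.ones(4).unsqueeze(0).expand(2, 4)
assert t3.shape == (2, 4)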

+            t2 = fd.ops.broadcast_in_dim(t0, t1.shape(), [0, 1])
+            t3 = fd.ops.add(t2, t1)
+            fd.add_output(t3)
+
+        def fusion_with_expand(fd: FusionDefinition):
+            t0 = fd.define_tensor(shape=[1, -1], contiguity=[None, True])
+            t1 = fd.define_tensor(shape=[-1, -1], contiguity=[True, True])
+            # Direct expand without broadcast_in_dim
+            t2 = fd.ops.expand(t0, t1.shape())
+            t3 = fd.ops.add(t2, t1)
+            fd.add_output(t3)
+
+        # Execute both fusions and verify they produce the same result
+        nvf_out_bid, fd_bid = self.exec_nvfuser(fusion_with_broadcast_in_dim, inputs)
+        nvf_out_exp, fd_exp = self.exec_nvfuser(fusion_with_expand, inputs)
+
+        # Verify correctness
+        eager_out = inputs[0] + inputs[1]
+        self.assertEqual(eager_out, nvf_out_bid[0])
+        self.assertEqual(eager_out, nvf_out_exp[0])
Collaborator (on lines +700 to +707): These checks (and inputs) are probably not necessary, since the point of the test is just that the IR should match exactly whether we use broadcast_in_dim or expand whenever there is no new broadcast.

+        # Check that the broadcast_in_dim fusion doesn't have a redundant Set
+        # operation by comparing the IR string representations - they should
+        # be identical since broadcast is a no-op in this case
+        self.assertEqual(str(fd_bid), str(fd_exp))

     # Testing a scenario where the broadcast is necessary to realize the output
     def test_tensor_shape_with_output_bcast(self):
         def fusion_func(fd: FusionDefinition):