@@ -86,6 +86,29 @@ def test_row_parallel_linear_forward(multidevice_direct_test):
8686 (out ,) = fd .execute ([inp , weight ], _enable_options = ["host_ir_lowering" ])
8787 torch .testing .assert_close (out .cpu (), out_ref )
8888
# Collect CUDA kernels after a warmup run to exclude autotuning.
# nvfuser_direct.PythonProfiler failed with host IR lowering. The main
# reason is that HostIrContainer doesn't keep segments while SegmentProfiler
# is still expecting data. It's unclear to me whether we should relax
# SegmentProfiler's assumptions or stop creating them in the first place.
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CUDA]
) as prof:
    (out,) = fd.execute([inp, weight], _enable_options=["host_ir_lowering"])

# Keep only the device-side (CUDA) events; CPU-side ranges are ignored.
kernel_events = [
    event
    for event in prof.events()
    if event.device_type == torch.profiler.DeviceType.CUDA
]

# With multiple GPUs, expect three kernels per iteration: linear, memcpy,
# allreduce. The memcpy is from
# https://github.com/NVIDIA/Fuser/blob/cce887595dc86b099506b70f88d653880fde5116/csrc/multidevice/communication.cpp#L493.
# With a single GPU, expect two kernels per iteration: linear, memcpy.
num_kernels_per_iteration = 2 if d == 1 else 3
assert len(kernel_events) == s * num_kernels_per_iteration
89112
90113@pytest .mark .mpi
91114@pytest .mark .parametrize ("backend_type" , [CommunicatorBackend .nccl ])
0 commit comments