Skip to content

Commit 466faae

Browse files
committed
Add profiling to check kernel counts
1 parent 5f7427a commit 466faae

File tree

2 files changed

+27
-3
lines changed

2 files changed

+27
-3
lines changed

csrc/runtime/fusion_executor_cache.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,11 +72,12 @@ KernelArgumentHolder FusionExecutorCache::runFusionWithInputs(
7272

7373
// Make sure the forced index type is indeed used
7474
if (forced_index_type.has_value()) {
75-
NVF_ERROR(
76-
kernel_runtime->getIndexType() == forced_index_type.value(),
75+
NVF_ERROR_EQ(
76+
kernel_runtime->getIndexType(),
77+
forced_index_type.value(),
7778
"Enforcing index type of ",
7879
forced_index_type.value(),
79-
" failed");
80+
" failed.");
8081
}
8182

8283
auto outputs = kernel_runtime->runWithInputs(args);

tests/python/multidevice/test_overlap.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,29 @@ def test_row_parallel_linear_forward(multidevice_direct_test):
8686
(out,) = fd.execute([inp, weight], _enable_options=["host_ir_lowering"])
8787
torch.testing.assert_close(out.cpu(), out_ref)
8888

89+
# Collect CUDA kernels after a warmup run to exclude autotuning.
90+
# nvfuser_direct.PythonProfiler failed with host IR lowering. The main
91+
# reason is that HostIrContainer doesn't keep segments while SegmentProfiler
92+
# is still expecting data. It's unclear to me whether we should relax
93+
# SegmentProfiler's assumptions or stop creating them in the first place.
94+
with torch.profiler.profile(
95+
activities=[torch.profiler.ProfilerActivity.CUDA]
96+
) as prof:
97+
(out,) = fd.execute([inp, weight], _enable_options=["host_ir_lowering"])
98+
99+
kernel_events = [
100+
event
101+
for event in prof.events()
102+
if event.device_type == torch.profiler.DeviceType.CUDA
103+
]
104+
105+
# With multiple GPUs, expect three kernels per iteration: linear, memcpy,
106+
# allreduce. The memcpy is from
107+
# https://github.com/NVIDIA/Fuser/blob/cce887595dc86b099506b70f88d653880fde5116/csrc/multidevice/communication.cpp#L493.
108+
# With a single GPU, expect two kernels per iteration: linear, memcpy.
109+
num_kernels_per_iteration = 2 if d == 1 else 3
110+
assert len(kernel_events) == s * num_kernels_per_iteration
111+
89112

90113
@pytest.mark.mpi
91114
@pytest.mark.parametrize("backend_type", [CommunicatorBackend.nccl])

0 commit comments

Comments
 (0)