@@ -86,6 +86,29 @@ def test_row_parallel_linear_forward(multidevice_direct_test):
8686 (out ,) = fd .execute ([inp , weight ], _enable_options = ["host_ir_lowering" ])
8787 torch .testing .assert_close (out .cpu (), out_ref )
8888
# Collect CUDA kernels after a warmup run to exclude autotuning.
# nvfuser_direct.PythonProfiler failed with host IR lowering. The main
# reason is that HostIrContainer doesn't keep segments while SegmentProfiler
# is still expecting data. It's unclear to me whether we should relax
# SegmentProfiler's assumptions or stop creating them in the first place.
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CUDA]
) as prof:
    (out,) = fd.execute([inp, weight], _enable_options=["host_ir_lowering"])

# Keep only the device-side (CUDA) events; CPU-side ranges are ignored.
kernel_events = [
    event
    for event in prof.events()
    if event.device_type == torch.profiler.DeviceType.CUDA
]

# With multiple GPUs, expect three kernels per iteration: linear, memcpy,
# allreduce. The memcpy is from
# https://github.com/NVIDIA/Fuser/blob/cce887595dc86b099506b70f88d653880fde5116/csrc/multidevice/communication.cpp#L493.
# With a single GPU, expect two kernels per iteration: linear, memcpy.
num_kernels_per_iteration = 2 if d == 1 else 3
assert len(kernel_events) == s * num_kernels_per_iteration
89112
90113@pytest .mark .mpi
91114@pytest .mark .parametrize ("backend_type" , [CommunicatorBackend .nccl ])
0 commit comments