@@ -668,11 +668,6 @@ def kernel(
668668 qhead_per_kvhead_divmod ,
669669 )
670670 if warp_idx == 1 :
671- for warp_group_idx in cutlass .range (self .num_mma_warp_groups ):
672- cute .arch .barrier_arrive (
673- barrier_id = int (NamedBarrierBwd .dQEmptyWG0 ) + warp_group_idx ,
674- number_of_threads = self .num_threads_per_warp_group + cute .arch .WARP_SIZE ,
675- )
676671 self .dQaccum_store (
677672 mdQaccum ,
678673 sdQaccum ,
@@ -1605,6 +1600,16 @@ def dQaccum_store(
16051600 m_block = m_block_min + iter_idx
16061601 m_block_safe = m_block
16071602
1603+ for warp_group_idx in cutlass .range_constexpr (self .num_mma_warp_groups ):
1604+ cute .arch .cp_async_bulk_wait_group (
1605+ self .num_mma_warp_groups - 1 - warp_group_idx , read = True
1606+ )
1607+ cute .arch .barrier_arrive (
1608+ barrier_id = int (NamedBarrierBwd .dQEmptyWG0 ) + warp_group_idx ,
1609+ number_of_threads = self .num_threads_per_warp_group
1610+ + cute .arch .WARP_SIZE ,
1611+ )
1612+
16081613 for warp_group_idx in cutlass .range_constexpr (self .num_mma_warp_groups ):
16091614 cute .arch .barrier (
16101615 barrier_id = int (NamedBarrierBwd .dQFullWG0 ) + warp_group_idx ,
@@ -1618,15 +1623,6 @@ def dQaccum_store(
16181623 self .tma_copy_bytes ["dQ" ],
16191624 )
16201625 cute .arch .cp_async_bulk_commit_group ()
1621- for warp_group_idx in cutlass .range_constexpr (self .num_mma_warp_groups ):
1622- cute .arch .cp_async_bulk_wait_group (
1623- self .num_mma_warp_groups - 1 - warp_group_idx , read = True
1624- )
1625- cute .arch .barrier_arrive (
1626- barrier_id = int (NamedBarrierBwd .dQEmptyWG0 ) + warp_group_idx ,
1627- number_of_threads = self .num_threads_per_warp_group
1628- + cute .arch .WARP_SIZE ,
1629- )
16301626 else :
16311627 dQaccum_store_block_sparse_bwd_sm90 (
16321628 blocksparse_tensors ,
@@ -1643,3 +1639,5 @@ def dQaccum_store(
16431639 )
16441640 tile_scheduler .advance_to_next_work ()
16451641 work_tile = tile_scheduler .get_current_work ()
1642+
1643+ cute .arch .cp_async_bulk_wait_group (0 , read = True )
0 commit comments