@@ -168,7 +168,7 @@ struct CollectiveEpilogue {
168168 /* id=*/ static_cast <int >(NamedBarriers::kValueEmpty ));
169169 cute::copy (smem_tiled_copy_O, tOrO_retile, tOsO);
170170 cutlass::arch::fence_view_async_shared (); // ensure smem writes are visible to TMA
171- cutlass::arch::NamedBarrier::arrive (NUM_MMA_THREADS + Ktraits::NUM_PRODUCER_THREADS ,
171+ cutlass::arch::NamedBarrier::arrive (NUM_MMA_THREADS,
172172 cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
173173
174174 Tensor mLSE = make_tensor (make_gmem_ptr (epilogue_params.lse_ptr ), epilogue_params.layout_LSE );
@@ -194,11 +194,10 @@ struct CollectiveEpilogue {
194194 }
195195 }
196196
197+ cutlass::arch::NamedBarrier::sync (NUM_MMA_THREADS,
198+ cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
199+
197200 int write_warp_idx = NUM_WARPS - 1 ;
198- if (cutlass::canonical_warp_idx_sync () == write_warp_idx) {
199- cutlass::arch::NamedBarrier::sync (NUM_MMA_THREADS + Ktraits::NUM_PRODUCER_THREADS,
200- cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
201- }
202201 TiledCopyO gmem_tiled_copy_O;
203202 write_O<NUM_COPY_THREADS>(epilogue_params.O_ptr , gmem_tiled_copy_O, epilogue_params.layout_O ,
204203 select<0 , 1 >(TileShape_PDV{}), sO , thread_idx, qo_tile_idx,
0 commit comments