72 changes: 60 additions & 12 deletions flash_sparse_attn/ops/triton/activations.py
@@ -9,17 +9,74 @@ def check_inf(x):

@triton.jit
def online_softmax(
    acc_s,
    row_max,
    row_sum,
    scale_log2,
    CHECK_INF: tl.constexpr,
    RESCALE_THRESHOLD: tl.constexpr,
):
Comment on lines 11 to +18

Copilot AI Apr 22, 2026


This PR is described as a pure rename/comment clarity change, but this file introduces a new online_softmax implementation and renames the previous implementation to online_sparse_softmax (plus signature/return-value changes used by dense kernels). Please update the PR description to reflect these functional refactors, or split the refactor into a separate PR for easier review/risk assessment.

"""
Apply online softmax to acc_s, and update row_max and row_sum.

Comment on lines 11 to 21

Copilot AI Apr 22, 2026


The PR description says this is a pure rename/comment-clarification refactor, but this file also changes the softmax API (signature changes to online_softmax and introduces online_sparse_softmax). If this is intentional, please update the PR description to reflect the behavioral/API surface changes; otherwise, consider limiting the PR to naming/comment changes only.

    :param acc_s: Attention scores tensor of shape [BLOCK_M, BLOCK_N].
    :param row_max: Current maximum values per row of shape [BLOCK_M], init to -inf.
    :param row_sum: Current sum values per row of shape [BLOCK_M], init to 0.
    :param scale_log2: Log2 of the scaling factor to be applied to acc_s.
    :param CHECK_INF: Boolean flag indicating whether -inf row_max should be clamped to 0.
    :param RESCALE_THRESHOLD: Threshold for rescaling to avoid underflow. If <= 0, rescaling is disabled.

    :return p: Softmax probabilities tensor of shape [BLOCK_M, BLOCK_N].
    :return row_max_new: Updated maximum values per row of shape [BLOCK_M].
    :return row_sum_new: Updated sum values per row of shape [BLOCK_M].
    :return row_scale: Scaling factors per row of shape [BLOCK_M].
    """
    # Compute current row max
    row_max_curr = tl.max(acc_s, axis=1)

    # Update row max
    row_max_new = tl.maximum(row_max_curr, row_max)

    # Avoid exp(-inf - (-inf)) = nan by clamping -inf to 0
    if CHECK_INF:
        row_max_new = check_inf(row_max_new)

    # Compute scaled differences to new row max
    acc_scale_log2 = (row_max - row_max_new) * scale_log2

    # Compute row scale
    if RESCALE_THRESHOLD > 0.0:
        # Triton can only skip computation at block granularity
        if tl.min(acc_scale_log2) < -RESCALE_THRESHOLD:
            row_scale = tl.exp2(acc_scale_log2)
        else:
            row_max_new = row_max
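            # A ones tensor shaped like acc_scale_log2: multiply by 0.0, add 1.0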
            row_scale = acc_scale_log2 * 0.0 + 1.0
    else:
        row_scale = tl.exp2(acc_scale_log2)

    # Compute attention weights
    p = tl.exp2(acc_s * scale_log2 - row_max_new[:, None] * scale_log2)

    # Update row sum
    row_sum_cur = tl.sum(p, axis=1)
    row_sum_new = row_sum * row_scale + row_sum_cur

    return p, row_max_new, row_sum_new, row_scale
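For intuition, the recurrence above can be mirrored on the host; the sketch below is illustrative NumPy only (not part of this diff) and omits the CHECK_INF and RESCALE_THRESHOLD handling:

import numpy as np

def online_softmax_reference(score_blocks, scale_log2):
    # Running state, matching the kernel's row_max / row_sum
    block_m = score_blocks[0].shape[0]
    row_max = np.full(block_m, -np.inf)
    row_sum = np.zeros(block_m)
    for acc_s in score_blocks:
        row_max_new = np.maximum(acc_s.max(axis=1), row_max)
        # row_scale corrects the previous partial sum for the new running max
        row_scale = np.exp2((row_max - row_max_new) * scale_log2)
        p = np.exp2((acc_s - row_max_new[:, None]) * scale_log2)
        row_sum = row_sum * row_scale + p.sum(axis=1)
        row_max = row_max_new
    return row_max, row_sum

# Sanity check: the streamed denominator matches the one-shot softmax denominator.
# scores = np.random.randn(4, 64)
# m, s = online_softmax_reference(np.split(scores, 4, axis=1), scale_log2=1.0)
# assert np.allclose(s, np.exp2(scores - m[:, None]).sum(axis=1))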


@triton.jit
def online_sparse_softmax(
    acc_s,
    block_max,
    row_max,
    row_sum,
    scale_log2,
    softmax_threshold_log2,
    CHECK_INF: tl.constexpr,
    RESCALE_THRESHOLD: tl.constexpr,
):
Comment on lines +74 to +77

Copilot AI Apr 22, 2026


SOFTMAX_THRESHOLD_LOG2 is declared as tl.constexpr, but call sites pass a runtime value computed inside the kernel (e.g., via seqlen_info.get_softmax_threshold). Triton requires tl.constexpr arguments to be compile-time constants, so this is likely to fail compilation; make the threshold a regular runtime argument (remove tl.constexpr) or restructure so the threshold is known at specialization time.

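To illustrate the distinction the comment draws (hypothetical snippet, not from this PR): a tl.constexpr parameter must be known when the kernel is specialized, while a runtime value has to be passed as an ordinary argument, which can still be compared elementwise inside the kernel.

import triton
import triton.language as tl

@triton.jit
def _threshold_kernel(X, threshold_log2,             # ordinary argument: may vary at runtime
                      HAS_THRESHOLD: tl.constexpr):  # constexpr: fixed at compile time
    offs = tl.arange(0, 128)
    x = tl.load(X + offs)
    if HAS_THRESHOLD:  # branch resolved during specialization; dead code is pruned
        x = tl.where(x < threshold_log2, 0.0, x)  # runtime comparison is fine
    tl.store(X + offs, x)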
"""
Apply online softmax to acc_s, and update block_max, row_max and row_sum.
Apply online sparse softmax to acc_s, and update block_max, row_max and row_sum.

:param acc_s: Attention scores tensor of shape [BLOCK_M, BLOCK_N].
:param block_max: Running block-wise maximum scalar, init to -inf.
@@ -28,7 +85,6 @@ def online_softmax(
    :param scale_log2: Log2 of the scaling factor to be applied to acc_s.
    :param softmax_threshold_log2: Threshold in the log2 domain for block-level skipping. If > -inf and the block max is below the threshold relative to the running max, the softmax update is skipped.
    :param CHECK_INF: Boolean flag indicating whether -inf row_max should be clamped to 0.
-    :param RESCALE_THRESHOLD: Threshold for rescaling to avoid underflow. If <= 0, rescaling is disabled.

    :return p: Softmax probabilities tensor of shape [BLOCK_M, BLOCK_N].
    :return block_max_new: Updated block-wise maximum scalar.
@@ -69,15 +125,7 @@ def online_softmax(
    acc_scale_log2 = (row_max - row_max_new) * scale_log2

    # Compute row scale
-    if RESCALE_THRESHOLD > 0.0:
-        # Triton can only skip computation at block granularity
-        if tl.min(acc_scale_log2) < -RESCALE_THRESHOLD:
-            row_scale = tl.exp2(acc_scale_log2)
-        else:
-            row_max_new = row_max
-            row_scale = acc_scale_log2 * 0.0 + 1.0
-    else:
-        row_scale = tl.exp2(acc_scale_log2)
+    row_scale = tl.exp2(acc_scale_log2)

    # Compute attention weights
    p = tl.exp2(acc_s * scale_log2 - row_max_new[:, None] * scale_log2)
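For intuition, the block-level skip that softmax_threshold_log2 enables can be sketched on the host as follows (assumed semantics based on the docstring above, not the kernel's actual code):

def should_skip_block(block_max_curr, row_max_running, scale_log2, softmax_threshold_log2):
    # Every probability in this block is bounded by
    # exp2((block_max_curr - row_max_running) * scale_log2); when that bound falls
    # below exp2(softmax_threshold_log2), the block contributes negligibly to the
    # row sums and the softmax update can be skipped.
    return (softmax_threshold_log2 > float("-inf")
            and (block_max_curr - row_max_running) * scale_log2 < softmax_threshold_log2)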
8 changes: 4 additions & 4 deletions flash_sparse_attn/ops/triton/flash_bwd_postprocess.py
@@ -90,7 +90,7 @@ def _bwd_postprocess_kernel(
        order=(1, 0),
    )

-    # Advance dq_accum pointer
+    # Advance dq_accum pointers
    dq_accum_ptrs = tl.advance(dq_accum_ptrs, (m_block * TILE_M, 0))

    # Load accumulators
@@ -99,7 +99,7 @@ def _bwd_postprocess_kernel(

    # Scale dq
    dq = (acc_dq * scale).to(dQ.dtype.element_ty)

-    # Advance dq pointer
+    # Advance dq pointers
    dq_ptrs = tl.advance(dq_ptrs, (m_block * TILE_M, 0))

    # Store dq
@@ -144,7 +144,7 @@ def _bwd_postprocess_kernel(
        order=(0,),
    )

-    # Advance da_accum pointer
+    # Advance da_accum pointers
    da_accum_ptrs = tl.advance(da_accum_ptrs, (m_block * TILE_M,))

    # Load da accumulators
@@ -153,7 +153,7 @@ def _bwd_postprocess_kernel(

    # Scale da
    da = (acc_da * scale).to(dA.dtype.element_ty)

-    # Advance da pointer
+    # Advance da pointers
    da_ptrs = tl.advance(da_ptrs, (m_block * TILE_M,))

    # Store da
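All four comment fixes in this file touch the same block-pointer idiom; for reference, a minimal self-contained version of the pattern looks roughly like this (illustrative kernel, not part of the PR):

import triton
import triton.language as tl

@triton.jit
def _scale_copy_kernel(Src, Dst, seqlen, scale, TILE_M: tl.constexpr):
    m_block = tl.program_id(0)

    # Build block pointers once, then advance them tile by tile -- the
    # tl.make_block_ptr / tl.advance idiom used by the kernels above
    src_ptrs = tl.make_block_ptr(
        base=Src, shape=(seqlen,), strides=(1,),
        offsets=(0,), block_shape=(TILE_M,), order=(0,),
    )
    dst_ptrs = tl.make_block_ptr(
        base=Dst, shape=(seqlen,), strides=(1,),
        offsets=(0,), block_shape=(TILE_M,), order=(0,),
    )

    # Advance src pointers to this program's tile
    src_ptrs = tl.advance(src_ptrs, (m_block * TILE_M,))
    x = tl.load(src_ptrs, boundary_check=(0,)).to(tl.float32)

    # Advance dst pointers and store the scaled tile
    dst_ptrs = tl.advance(dst_ptrs, (m_block * TILE_M,))
    tl.store(dst_ptrs, (x * scale).to(Dst.dtype.element_ty), boundary_check=(0,))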
12 changes: 6 additions & 6 deletions flash_sparse_attn/ops/triton/flash_bwd_preprocess.py
@@ -174,31 +174,31 @@ def _bwd_preprocess_kernel(
    # Initialize accumulators
    acc_dq = tl.zeros((TILE_M, TILE_K), dtype=tl.float32)

-    # Advance output pointer
+    # Advance output pointers
    o_ptrs = tl.advance(o_ptrs, (m_block * TILE_M, 0))

    # Load o tile
    o_tile = tl.load(o_ptrs, boundary_check=(0, 1)).to(tl.float32)

-    # Advance do pointer
+    # Advance do pointers
    do_ptrs = tl.advance(do_ptrs, (m_block * TILE_M, 0))

    # Load do tile
    do_tile = tl.load(do_ptrs, boundary_check=(0, 1)).to(tl.float32)

-    # Advance dpsum pointer
+    # Advance dpsum pointers
    dpsum_ptrs = tl.advance(dpsum_ptrs, (m_block * TILE_M,))

    # Compute dpsum
    dpsum = tl.sum(o_tile * do_tile, axis=1)

-    # Advance acc_dq pointer
+    # Advance acc_dq pointers
    dq_accum_ptrs = tl.advance(dq_accum_ptrs, (m_block * TILE_M, 0))

    # Store dpsum
    tl.store(dpsum_ptrs, dpsum, boundary_check=(0,))

-    # Advance lse pointer
+    # Advance lse pointers
    lse_ptrs = tl.advance(lse_ptrs, (m_block * TILE_M,))

    # Load lse tile
@@ -211,7 +211,7 @@ def _bwd_preprocess_kernel(
    # Store dq_accum
    tl.store(dq_accum_ptrs, acc_dq, boundary_check=(0, 1))

-    # Advance lse_log2 pointer
+    # Advance lse_log2 pointers
    lse_log2_ptrs = tl.advance(lse_log2_ptrs, (m_block * TILE_M,))

    # Store lse_log2
@@ -6,7 +6,7 @@


@triton.jit
-def _fwd_combine_kernel(
+def _dec_combine_kernel(
    Out_partial,
    Lse_partial,
    Out,
@@ -177,7 +177,7 @@ def _fwd_combine_kernel(
    tl.store(lse_ptrs, lse, boundary_check=(0,))


-def _flash_attn_fwd_combine(
+def _flash_attn_dec_combine(
    out_partial: torch.Tensor,
    lse_partial: torch.Tensor,
    out: torch.Tensor,
@@ -197,18 +197,18 @@ def _flash_attn_fwd_combine(
    TILE_K = max(triton.next_power_of_2(head_dim), 16)
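    # e.g. head_dim = 80 -> TILE_K = 128; head_dim = 8 -> TILE_K = 16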

    TILE_M, num_warps, num_stages, num_ctas = (
-        launch_template.get_fwd_combine_launch_config(
+        launch_template.get_dec_combine_launch_config(
            tile_k=TILE_K,
        )
    )

-    grid = launch_grid.get_fwd_combine_grid(
+    grid = launch_grid.get_dec_combine_grid(
        batch_size=batch_size,
        seqlen_q=seqlen_q,
        num_heads_q=num_heads_q,
    )

-    _fwd_combine_kernel[grid](
+    _dec_combine_kernel[grid](
        out_partial,
        lse_partial,
        out,