@@ -88,6 +88,88 @@ def _correct_attn_cp_out_kernel(
     tl.store(new_output_ptr + output_offsets, output)
 
 
+@triton.jit
+def _correct_attn_cp_out_kernel_for_flashinfer(
+    outputs_ptr,
+    new_output_ptr,
+    lses_ptr,
+    vlse_ptr,
+    outputs_stride_B,
+    outputs_stride_H,
+    outputs_stride_D,
+    lses_stride_N,
+    lses_stride_B,
+    lses_stride_H,
+    lse_idx,
+    HEAD_DIM: tl.constexpr,
+    N_ROUNDED: tl.constexpr,
+):
107+ """
108+ Apply the all-gathered lses to correct each local rank's attention
109+ output. we still need perform a cross-rank reduction to obtain the
110+ final attention output.
111+
112+ Args:
113+ outputs_ptr (triton.PointerType):
114+ Pointer to input tensor of shape [ B, H, D ]
115+ lses_ptr (triton.PointerType):
116+ Pointer to input tensor of shape [ N, B, H ]
117+ new_output_ptr (triton.PointerType):
118+ Pointer to output tensor of shape [ B, H, D ]
119+ vlse_ptr (triton.PointerType):
120+ Pointer to output tensor of shape [ B, H ]
121+ """
+    batch_idx = tl.program_id(axis=0).to(tl.int64)
+    head_idx = tl.program_id(axis=1).to(tl.int64)
+    d_offsets = tl.arange(0, HEAD_DIM)
+    num_n_offsets = tl.arange(0, N_ROUNDED)
+
+    # shape = [N]
+    lse_offsets = (
+        num_n_offsets * lses_stride_N
+        + batch_idx * lses_stride_B
+        + head_idx * lses_stride_H
+    )
+
+    # compute the merged LSE across ranks with a numerically stable
+    # log-sum-exp in base 2; NaN/+inf entries are mapped to -inf so
+    # they drop out of the sum
+    lse = tl.load(lses_ptr + lse_offsets)
+    lse = tl.where((lse != lse) | (lse == float("inf")), -float("inf"), lse)
+    lse_max = tl.max(lse, axis=0)
+    lse_max = tl.where(lse_max == -float("inf"), 0, lse_max)
+    lse -= lse_max
+    lse_exp = tl.exp2(lse)
+    lse_acc = tl.sum(lse_exp, axis=0)
+    lse = tl.log2(lse_acc)
+    lse += lse_max
+
+    lse_offsets = batch_idx * lses_stride_B + head_idx * lses_stride_H
+    tl.store(vlse_ptr + lse_offsets, lse)
+
+    # shape = [D]
+    output_offsets = (
+        batch_idx * outputs_stride_B
+        + head_idx * outputs_stride_H
+        + d_offsets * outputs_stride_D
+    )
+
+    # correct the local output: scale by exp2(local_lse - merged_lse)
+    lse_offset = (
+        lse_idx * lses_stride_N + batch_idx * lses_stride_B + head_idx * lses_stride_H
+    )
+    lse_tmp = tl.load(lses_ptr + lse_offset)
+    lse_finally = lse_tmp - lse
+    lse_finally = tl.where(
+        (lse_finally != lse_finally) | (lse_finally == float("inf")),
+        -float("inf"),
+        lse_finally,
+    )
+    factor = tl.exp2(lse_finally)
+    output = tl.load(outputs_ptr + output_offsets)
+    output = output * factor
+
+    tl.store(new_output_ptr + output_offsets, output)
+
+
 class CPTritonContext:
     """The CPTritonContext is used to avoid recompilation of the Triton JIT."""
 
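For intuition, the new kernel computes, per (batch, head), a numerically stable log-sum-exp of the all-gathered LSEs in base 2 and rescales the local partial output by `exp2(local_lse - merged_lse)`. The eager PyTorch sketch below mirrors that logic (the function name and eager formulation are illustrative, not part of this PR; shapes follow the kernel docstring):

```python
import torch

def correct_attn_cp_out_ref(outputs: torch.Tensor,  # [B, H, D] local partial output
                            lses: torch.Tensor,     # [N, B, H] all-gathered base-2 LSEs
                            lse_idx: int):          # this rank's index along N
    # Map NaN/inf LSEs (e.g. fully masked heads) to -inf so they vanish from the sum.
    lses = torch.where(lses.isnan() | lses.isinf(), float("-inf"), lses)
    # Numerically stable log-sum-exp over ranks, in base 2.
    lse_max = lses.amax(dim=0)
    lse_max = torch.where(lse_max == float("-inf"), 0.0, lse_max)
    vlse = torch.log2(torch.exp2(lses - lse_max).sum(dim=0)) + lse_max  # [B, H]
    # Rescale the local output by exp2(local_lse - merged_lse).
    scale = lses[lse_idx] - vlse
    scale = torch.where(scale.isnan() | (scale == float("inf")), float("-inf"), scale)
    return outputs * torch.exp2(scale).unsqueeze(-1), vlse
```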
@@ -102,7 +184,7 @@ def call_kernel(self, kernel, grid, *regular_args, **const_args):
 
 
 def correct_attn_out(
-    out: torch.Tensor, lses: torch.Tensor, cp_rank: int, ctx: CPTritonContext
+    out: torch.Tensor, lses: torch.Tensor, cp_rank: int, ctx: CPTritonContext, is_lse_base_on_e: bool = True,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """Correct the attention output using the all-gathered lses.
 
@@ -164,8 +246,10 @@ def correct_attn_out(
         cp_rank,
     )
     const_args = {"HEAD_DIM": D, "N_ROUNDED": N}
-
-    ctx.call_kernel(_correct_attn_cp_out_kernel, grid, *regular_args, **const_args)
+    correct_attn_kernel = _correct_attn_cp_out_kernel
+    if not is_lse_base_on_e:
+        correct_attn_kernel = _correct_attn_cp_out_kernel_for_flashinfer
+    ctx.call_kernel(correct_attn_kernel, grid, *regular_args, **const_args)
     return out, lse
 
 
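The dispatch keeps the original base-e kernel as the default; `is_lse_base_on_e=False` routes to the FlashInfer variant, which differs only in using `exp2`/`log2` in place of `exp`/`log`. The rescaling factor is identical as long as the LSE base is consistent, as this quick standalone check (illustrative only, not part of the PR) shows:

```python
import math
import torch

torch.manual_seed(0)
lse_e = torch.randn(4, 8, 16)      # per-rank LSEs expressed in base e
lse_2 = lse_e * math.log2(math.e)  # the same quantities expressed in base 2

merged_e = torch.logsumexp(lse_e, dim=0)
merged_2 = torch.log2(torch.exp2(lse_2).sum(dim=0))

# Per-rank rescaling factors agree regardless of the base used.
factor_e = torch.exp(lse_e[0] - merged_e)
factor_2 = torch.exp2(lse_2[0] - merged_2)
assert torch.allclose(factor_e, factor_2, atol=1e-5)
```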
@@ -175,6 +259,7 @@ def cp_lse_ag_out_rs(
     cp_group: GroupCoordinator,
     ctx: CPTritonContext = None,
     return_lse=False,
+    is_lse_base_on_e=True,
 ):
     """
     cp_attn_out: [B, H, D]
@@ -194,7 +279,7 @@ def cp_lse_ag_out_rs(
 
     cp_attn_lse = cp_attn_lse.contiguous()
     lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses)
-    out, lse = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx)
+    out, lse = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx, is_lse_base_on_e=is_lse_base_on_e)
     out = cp_group.reduce_scatter(out, dim=1)
 
     if return_lse:
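A caller using a backend that produces base-2 LSEs (FlashInfer, as the new kernel's name suggests) would then opt out of the base-e default. This is a hypothetical call site; `cp_attn_out`, `cp_attn_lse`, and `cp_group` stand in for the caller's own tensors and group:

```python
# Select the exp2-based correction kernel added in this PR for base-2 LSEs.
out = cp_lse_ag_out_rs(
    cp_attn_out,   # [B, H, D]
    cp_attn_lse,   # [B, H]
    cp_group,
    is_lse_base_on_e=False,
)
```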