
Commit 16d3071

WIP: unify bsnd_group_attention and group_attention, update match_attention_layout to use pattern matcher
Signed-off-by: Frida Hou <201670829+Fridah-nv@users.noreply.github.com>
1 parent 985fb44 commit 16d3071

File tree

12 files changed: +461, -113 lines changed


tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py

Lines changed: 1 addition & 1 deletion

@@ -354,7 +354,7 @@ def get_num_qkv_args(cls) -> int:
     @classmethod
     def get_source_attention_op(cls) -> OpOverloadPacket:
         """Get the source attention op that we target for replacement."""
-        return torch.ops.auto_deploy.torch_attention_bsnd_grouped_sdpa
+        return torch.ops.auto_deploy.torch_attention
 
     @classmethod
     def get_cached_attention_op(cls) -> MHACallable:
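
This backend, like the torch and Triton backends further down, now reports the unified torch_attention op as its replacement target. For context only, here is a minimal sketch of what a graph transform can do with such a hook; the actual match_attention_layout transform uses torch's pattern matcher per the commit message, and the helper below is hypothetical.

import torch
from torch.fx import GraphModule

def count_source_attention_calls(gm: GraphModule, source_op) -> int:
    """Count graph nodes that call the backend's source attention op.

    `source_op` is an OpOverloadPacket such as torch.ops.auto_deploy.torch_attention;
    exported graphs usually carry the resolved `.default` overload as the node target.
    """
    targets = {source_op, getattr(source_op, "default", None)}
    return sum(
        1 for n in gm.graph.nodes if n.op == "call_function" and n.target in targets
    )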

tensorrt_llm/_torch/auto_deploy/custom_ops/torch_attention.py

Lines changed: 138 additions & 0 deletions

@@ -257,6 +257,144 @@ def bsnd_grouped_sdpa_fake(
     return query.new_empty(*query.shape[:-1], value.shape[-1]).contiguous()
 
 
+# Unified attention op
+@torch.library.custom_op("auto_deploy::torch_attention", mutates_args=())
+def torch_attention(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attn_mask: Optional[torch.Tensor] = None,
+    dropout_p: float = 0.0,
+    is_causal: bool = False,
+    scale: Optional[float] = None,
+    sinks: Optional[torch.Tensor] = None,
+    sliding_window: Optional[int] = None,
+    logit_cap: Optional[float] = None,
+    layout: str = "bnsd",  # "bnsd" or "bsnd"
+) -> torch.Tensor:
+    """SDPA attention (with optional GQA) that supports two memory layouts via `layout`:
+
+    - "bnsd": [batch, num_heads, seq_len, head_dim]
+    - "bsnd": [batch, seq_len, num_heads, head_dim]
+
+    The `attn_mask` is always interpreted as [b, n, s_q, s_k].
+
+    Returns a tensor in the same layout as the inputs, as specified by `layout`.
+    """
+    if layout not in ("bnsd", "bsnd"):
+        raise ValueError(f"layout must be 'bnsd' or 'bsnd', got {layout!r}")
+
+    if layout == "bsnd":
+        query = query.transpose(1, 2).contiguous()
+        key = key.transpose(1, 2).contiguous()
+        value = value.transpose(1, 2).contiguous()
+
+    # From here on, inputs are in bnsd format
+    b, n_heads, s_q, head_dim = query.shape  # [batch, num_heads, seq_len, head_dim]
+    _, n_kv_heads, s_k, _ = key.shape  # [batch, num_kv_heads, seq_len, head_dim]
+
+    query_t = query  # [b, n_heads, s_q, head_dim]
+    key_t = key  # [b, n_kv_heads, s_k, head_dim]
+    value_t = value  # [b, n_kv_heads, s_k, v_head_dim]
+
+    # Handle GQA by repeating KV heads if needed
+    if n_heads != n_kv_heads:
+        n_rep = n_heads // n_kv_heads
+        key_t = repeat_kv(key_t, n_rep)
+        value_t = repeat_kv(value_t, n_rep)
+
+    # Set default scale
+    if scale is None:
+        scale = 1.0 / math.sqrt(head_dim)
+
+    # Compute attention scores: Q @ K^T
+    attn_scores = torch.matmul(query_t, key_t.transpose(-2, -1)) * scale  # [b, n_heads, s_q, s_k]
+
+    # Apply attention mask if provided
+    if attn_mask is not None:
+        # Convert boolean mask to float if needed
+        attn_mask = _convert_boolean_mask_to_float(attn_mask, attn_scores.dtype)
+        attn_scores = attn_scores + attn_mask
+
+    # Apply causal mask, but only during the context phase (s_q == s_k)
+    if is_causal and s_q == s_k:
+        causal_mask = torch.triu(
+            torch.ones(s_q, s_k, device=query.device, dtype=torch.bool),
+            diagonal=1,  # diagonal=1 for standard causal masking
+        )
+        attn_scores.masked_fill_(causal_mask.unsqueeze(0).unsqueeze(0), float("-inf"))
+
+    # Apply sliding window mask if specified
+    if sliding_window is not None and sliding_window > 0:
+        # Handle position calculation for both context and generation phases
+        if s_q == s_k:
+            # Context phase: standard position calculation
+            query_positions = torch.arange(s_q, device=query.device)
+            key_positions = torch.arange(s_k, device=query.device)
+        else:
+            # Generation phase: queries sit at positions s_k .. s_k + s_q - 1 (after the cache)
+            query_positions = torch.arange(s_k, s_k + s_q, device=query.device)
+            key_positions = torch.arange(s_k, device=query.device)  # [0, 1, ..., s_k - 1]
+
+        # Position difference matrix: query_pos - key_pos
+        pos_diff = query_positions.unsqueeze(1) - key_positions.unsqueeze(0)  # [s_q, s_k]
+
+        # Sliding window mask: allow attention only if 0 <= pos_diff < sliding_window
+        sliding_window_mask = (pos_diff < 0) | (pos_diff >= sliding_window)  # [s_q, s_k]
+        attn_scores.masked_fill_(sliding_window_mask.unsqueeze(0).unsqueeze(0), float("-inf"))
+
+    # Apply logit softcapping if enabled
+    attn_scores = _apply_logit_softcapping(attn_scores, logit_cap)
+
+    # Apply attention sinks if provided
+    if sinks is not None:
+        # Following the reference implementation, each head contributes one sink logit
+        # that enters the softmax normalizer but not the weighted sum over values.
+        # Expand sinks to [b, n_heads, s_q, 1]: one sink column per head.
+        sinks_expanded = sinks.reshape(1, -1, 1, 1).expand(
+            b, n_heads, s_q, 1
+        )  # [b, n_heads, s_q, 1]
+
+        # Numerically stable softmax with the sink term added to the normalizer
+        logits_max = torch.max(attn_scores, dim=-1, keepdim=True).values
+        sinks = torch.exp(sinks_expanded - logits_max)
+        unnormalized_scores = torch.exp(attn_scores - logits_max)
+        normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks
+        scores = unnormalized_scores / normalizer
+        # Only the non-sink scores are used to weight the values
+        attn_out = torch.matmul(scores, value_t)  # [b, n_heads, s_q, v_head_dim]
+    else:
+        attn_weights = torch.softmax(attn_scores, dim=-1, dtype=torch.float32).to(query.dtype)
+        attn_out = torch.matmul(attn_weights, value_t)  # [b, n_heads, s_q, v_head_dim]
+
+    # Apply dropout if specified
+    if dropout_p > 0.0:
+        attn_out = F.dropout(attn_out, p=dropout_p, training=False)
+
+    # Return in the same layout as the inputs
+    if layout == "bsnd":
+        return attn_out.transpose(1, 2).contiguous()
+    else:
+        return attn_out.contiguous()
+
+
+@torch_attention.register_fake
+def torch_attention_fake(
+    query,
+    key,
+    value,
+    attn_mask=None,
+    dropout_p: float = 0.0,
+    is_causal: bool = False,
+    scale=None,
+    sinks=None,
+    sliding_window=None,
+    logit_cap=None,
+    layout: str = "bnsd",
+):
+    return query.new_empty(*query.shape[:-1], value.shape[-1]).contiguous()
+
+
 def update_kv_cache(
     key_states: torch.Tensor,
     value_states: torch.Tensor,
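
A quick smoke test of the unified op, not part of the commit, is sketched below. It assumes the auto_deploy custom ops are already imported and registered, uses arbitrary shapes, and checks that the "bnsd" and "bsnd" layouts agree for a GQA configuration with a causal mask, sinks, and a sliding window.

import torch

# Hypothetical smoke test; the auto_deploy custom ops must already be registered.
b, n_heads, n_kv_heads, s, head_dim = 2, 8, 2, 16, 64
q = torch.randn(b, n_heads, s, head_dim)
k = torch.randn(b, n_kv_heads, s, head_dim)
v = torch.randn(b, n_kv_heads, s, head_dim)
sinks = torch.randn(n_heads)  # one sink logit per head

out_bnsd = torch.ops.auto_deploy.torch_attention(
    q, k, v, is_causal=True, sinks=sinks, sliding_window=8, layout="bnsd"
)
assert out_bnsd.shape == (b, n_heads, s, head_dim)

# The same inputs in bsnd layout should give the same result, transposed.
out_bsnd = torch.ops.auto_deploy.torch_attention(
    q.transpose(1, 2).contiguous(),
    k.transpose(1, 2).contiguous(),
    v.transpose(1, 2).contiguous(),
    is_causal=True,
    sinks=sinks,
    sliding_window=8,
    layout="bsnd",
)
assert torch.allclose(out_bsnd.transpose(1, 2), out_bnsd, atol=1e-6)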

tensorrt_llm/_torch/auto_deploy/custom_ops/torch_backend_attention.py

Lines changed: 1 addition & 1 deletion

@@ -408,7 +408,7 @@ def get_num_qkv_args(cls) -> int:
 
     @classmethod
    def get_source_attention_op(cls) -> OpOverloadPacket:
-        return torch.ops.auto_deploy.torch_attention_bsnd_grouped_sdpa
+        return torch.ops.auto_deploy.torch_attention
 
     @classmethod
     def get_cached_attention_op(cls) -> MHACallable:

tensorrt_llm/_torch/auto_deploy/custom_ops/triton_attention.py

Lines changed: 1 addition & 1 deletion

@@ -337,7 +337,7 @@ def get_num_qkv_args(cls) -> int:
 
     @classmethod
     def get_source_attention_op(cls) -> OpOverloadPacket:
-        return torch.ops.auto_deploy.torch_attention_bsnd_grouped_sdpa
+        return torch.ops.auto_deploy.torch_attention
 
     @classmethod
     def get_cached_attention_op(cls) -> MHACallable:

tensorrt_llm/_torch/auto_deploy/models/patches/gptoss.py

Lines changed: 2 additions & 1 deletion

@@ -59,7 +59,7 @@ def gpt_oss_attention(
     sinks = self.sinks
 
     # Use custom op to capture attention. This layout is bsnd (batch, seq, num_heads, head_dim)
-    attn_output = torch.ops.auto_deploy.torch_attention_bsnd_grouped_sdpa(
+    attn_output = torch.ops.auto_deploy.torch_attention(
         query_states,
         key_states,
         value_states,
@@ -69,6 +69,7 @@
         scale=self.scaling,
         sinks=sinks,
         sliding_window=sliding_window,
+        layout="bsnd",
     )
 
     # Reshape back to original input shape
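
For illustration, a simplified, hypothetical attention module (not the HF GPT-OSS code and not part of this commit) shows why layout="bsnd" is convenient for model patches: the QKV projections naturally produce [batch, seq, num_heads, head_dim], so no transposes are needed around the unified op.

import torch
import torch.nn as nn

class TinyBsndAttention(nn.Module):
    # Minimal GQA attention block that feeds bsnd-shaped projections straight
    # into the unified op; assumes the auto_deploy custom ops are registered.
    def __init__(self, hidden=256, n_heads=8, n_kv_heads=2):
        super().__init__()
        self.n_heads, self.n_kv_heads = n_heads, n_kv_heads
        self.head_dim = hidden // n_heads
        self.q_proj = nn.Linear(hidden, n_heads * self.head_dim)
        self.k_proj = nn.Linear(hidden, n_kv_heads * self.head_dim)
        self.v_proj = nn.Linear(hidden, n_kv_heads * self.head_dim)
        self.o_proj = nn.Linear(n_heads * self.head_dim, hidden)

    def forward(self, x):  # x: [batch, seq, hidden]
        b, s, _ = x.shape
        q = self.q_proj(x).view(b, s, self.n_heads, self.head_dim)
        k = self.k_proj(x).view(b, s, self.n_kv_heads, self.head_dim)
        v = self.v_proj(x).view(b, s, self.n_kv_heads, self.head_dim)
        out = torch.ops.auto_deploy.torch_attention(
            q, k, v, is_causal=True, layout="bsnd"
        )  # output stays [batch, seq, num_heads, head_dim]
        return self.o_proj(out.reshape(b, s, -1))

Usage: TinyBsndAttention()(torch.randn(2, 16, 256)) returns a [2, 16, 256] tensor, mirroring the bsnd call pattern used in the gptoss patch above.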
