Commit 0cf2700

Add Attention mask support in test_accuracy
Parent: aa5c10c

2 files changed: +73 -3 lines

models/tt_transformers/PERF.md

Lines changed: 2 additions & 2 deletions
@@ -45,7 +45,7 @@ This configuration uses bfp4 MLP and bfp8 attention weights for all models excep
 | Mistral-7B | N150 | 95 | 99 | 29.75 | 100.24 |
 | Mistral-7B | N300 | 95 | 99 | 47.01 | 65.95 |
 | Mistral-7B | T3K | 95 | 99 | 67.82 | 53.93 |
-| gemma-3-1b | N150 |32 |48 | 53.3 |59.9 |
+| gemma-3-1b | N150 |83 |96 | 53.3 |59.9 |
 | gemma-3-4b | N150 | 78 | 95 | 34 | 68 |
 | gemma-3-4b | N300 | 78 | 95 | 35 | 125 |
 | gemma-3-27b | T3K | 90 | 99 | 16 | 331 |
@@ -85,7 +85,7 @@ Llama 3 models test as insensitive to attention precision and so we use bfp8 att
 | Mistral-7B | N150 | 95 | 99 | 29.75 | 100.24 |
 | Mistral-7B | N300 | 95 | 99 | 47.01 | 65.95 |
 | Mistral-7B | T3K | 95 | 99 | 67.82 | 53.93 |
-| gemma-3-1b | N150 |32 |48 | 51.0 |62.02 |
+| gemma-3-1b | N150 | 93 | 99 | 51.0 |62.02 |
 | gemma-3-4b | N150 | 88 | 98 | 30 | 79 |
 | gemma-3-4b | N300 | 86 | 98 | 32 | 135 |
 | gemma-3-27b | T3K | 91 | 100 | 15 | 361 |

models/tt_transformers/tests/test_accuracy.py

Lines changed: 71 additions & 1 deletion
@@ -10,7 +10,12 @@
 from loguru import logger

 import ttnn
-from models.tt_transformers.tt.common import PagedAttentionConfig, preprocess_inputs_prefill
+from models.tt_transformers.tt.common import (
+    PagedAttentionConfig,
+    create_causal_mask,
+    create_sliding_window_causal_mask,
+    preprocess_inputs_prefill,
+)
 from models.tt_transformers.tt.model import Transformer
 from models.tt_transformers.tt.model_config import DecodersPrecision, ModelArgs, parse_decoder_json
 from models.tt_transformers.tt.rope import get_rot_mats
@@ -262,6 +267,32 @@ def test_tt_model_acc(
                 pt_prefill_input[batch_id],
             )

+            if model_args.attention_mask:
+                attn_mask = torch.ones(prefill_lens[0] + 1).unsqueeze(0)
+                cache_postion = torch.arange(prefill_lens[0])
+                attention_mask = [
+                    create_sliding_window_causal_mask(
+                        prefill_input,
+                        attn_mask,
+                        cache_postion,
+                        model_args,
+                        paged_attention_config,
+                        device=mesh_device,
+                        mode="prefill",
+                    ),
+                    create_causal_mask(
+                        prefill_input,
+                        attn_mask,
+                        cache_postion,
+                        model_args,
+                        paged_attention_config,
+                        device=mesh_device,
+                        mode="prefill",
+                    ),
+                ]
+            else:
+                attention_mask = None
+
             tt_out = tt_model(
                 prefill_input,
                 current_pos=None,
@@ -270,6 +301,7 @@ def test_tt_model_acc(
                 user_id=batch_id,
                 mode="prefill",
                 page_table=page_table_tt,
+                attention_masks=attention_mask,
                 get_last_token=((decoding_pos[batch_id] - 1) // 32) * 32,
             )
@@ -322,13 +354,51 @@ def test_tt_model_acc(
             pt_decode_input,
             model_args.model_config["DECODE_RESIDUAL_MEMCFG"],
         )
+        # Run TT model
+        if model_args.attention_mask:
+            torch_current_pos = ttnn.to_torch(current_pos_tensor)
+            cur_batch_size = torch_current_pos.size(0)
+            max_len = torch_current_pos.max().item() + 1  # longest seq length (+1 since pos starts at 0)
+
+            # Initialize with zeros
+            attn_mask = torch.zeros(cur_batch_size, max_len, dtype=torch.long)
+            for j, length in enumerate(torch_current_pos.tolist()):
+                attn_mask[j, : length + 1] = 1
+
+            torch_current_pos = torch.tensor([max_len - 1])
+
+            attention_mask = [
+                create_sliding_window_causal_mask(
+                    decode_input,
+                    attn_mask,
+                    current_pos,
+                    model_args,
+                    paged_attention_config,
+                    device=mesh_device,
+                    mode="decode",
+                ),
+                create_causal_mask(
+                    decode_input,
+                    attn_mask,
+                    current_pos,
+                    model_args,
+                    paged_attention_config,
+                    device=mesh_device,
+                    mode="decode",
+                ),
+            ]
+            attention_mask = [ttnn.to_device(v, device=mesh_device) for v in attention_mask]
+        else:
+            attention_mask = None
+
         # Run TT model
         tt_out = tt_model(
             decode_input,
             current_pos_tensor,
             rot_mats_global=rot_mats,
             rot_mats_local=rot_mats_local,
             mode="decode",
+            attention_masks=attention_mask,
             page_table=page_table_tt,
         )

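The core of the decode-path change above is a right-padded 0/1 mask built from each user's current position; the commit then derives both a sliding-window and a global causal mask from it, presumably so models that alternate local and global attention layers (such as the gemma-3 entries updated in PERF.md) can use the appropriate variant per layer. Below is a minimal standalone sketch of that padding-mask construction, using plain torch tensors in place of the test's on-device position tensor; the helper name is hypothetical, not part of the repo.

```python
# Minimal sketch of the decode-mode padding-mask construction used in this commit.
# Assumption: a plain torch tensor stands in for the ttnn current-position tensor,
# and `build_decode_padding_mask` is a hypothetical helper name for illustration.
import torch


def build_decode_padding_mask(current_pos: torch.Tensor) -> torch.Tensor:
    """current_pos: (batch,) 0-based positions -> (batch, max_len) 0/1 mask."""
    batch_size = current_pos.size(0)
    max_len = current_pos.max().item() + 1  # +1 because positions are 0-based
    attn_mask = torch.zeros(batch_size, max_len, dtype=torch.long)
    for j, length in enumerate(current_pos.tolist()):
        attn_mask[j, : length + 1] = 1  # mark attended tokens; padding stays 0
    return attn_mask


# Example: users at positions 4, 2 and 6 produce rows padded to length 7.
print(build_decode_padding_mask(torch.tensor([4, 2, 6])))
```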