
Commit a6255ce
Fix repetition issue for Gemma migration
1 parent 813a8a7

5 files changed: 145 additions & 9 deletions

models/tt_transformers/tt/attention.py

Lines changed: 23 additions & 2 deletions
@@ -388,6 +388,8 @@ def forward_decode(
         rot_mats=None,
         page_table=None,
         kv_cache=None,
+        causal_mask=None,
+        is_causal=True,
     ) -> ttnn.Tensor:
         """
         x: (seq_len, 1, batch, dim)
@@ -516,6 +518,8 @@ def forward_decode(
                 program_config=self.model_config["SDPA_DECODE_PROGCFG"],
                 compute_kernel_config=self.sdpa_decode_compute_kernel_cfg,
                 memory_config=ttnn.DRAM_MEMORY_CONFIG,
+                attn_mask=causal_mask,
+                is_causal=is_causal,
             )
         else:
             attn_output_1G4D = ttnn.transformer.scaled_dot_product_attention_decode(
@@ -527,6 +531,8 @@ def forward_decode(
                 program_config=self.model_config["SDPA_DECODE_PROGCFG"],
                 compute_kernel_config=self.sdpa_decode_compute_kernel_cfg,
                 memory_config=ttnn.DRAM_MEMORY_CONFIG,  # FIXME: why not L1 height sharded e.g. SCORES_BATCHED_MM_OUTPUT_MEMCFG?
+                attn_mask=causal_mask,
+                is_causal=is_causal,
             )
 
         ttnn.deallocate(q_heads_1BQD)
@@ -671,6 +677,8 @@ def forward_prefill(
         chunk_page_table=None,
         chunk_start_idx=None,
         kv_cache=None,
+        causal_mask=None,
+        is_causal=True,
     ):
         seq_len = x_11SH.shape[-2]
         assert seq_len % 128 == 0 and seq_len > 0, "Seqlen must be divisible by 128"
@@ -833,10 +841,11 @@ def forward_prefill(
             q_heads_1QSD_8b,
             k_heads_1KSD_8b,
             v_heads_1VSD_8b,
-            is_causal=True,
             scale=self.scale,
             compute_kernel_config=self.sdpa_prefill_compute_kernel_cfg,
             program_config=self.model_config["SDPA_PROGCFG"](seq_len),
+            attn_mask=causal_mask,
+            is_causal=is_causal,
         )
 
         # deallocate keys and values
@@ -915,6 +924,8 @@ def forward(
         chunk_page_table=None,
         chunk_start_idx=None,
         kv_cache=None,
+        causal_mask=None,
+        is_causal=True,
     ):
         if mode == "prefill":
             return self.forward_prefill(
@@ -925,9 +936,19 @@ def forward(
                 chunk_page_table=chunk_page_table,
                 chunk_start_idx=chunk_start_idx,
                 kv_cache=kv_cache,
+                causal_mask=causal_mask,
+                is_causal=is_causal,
             )
         else:
-            return self.forward_decode(x, current_pos, rot_mats, page_table=page_table, kv_cache=kv_cache)
+            return self.forward_decode(
+                x,
+                current_pos,
+                rot_mats,
+                page_table=page_table,
+                kv_cache=kv_cache,
+                causal_mask=causal_mask,
+                is_causal=is_causal,
+            )
 
     def prefill_prepare_tensor_for_kv_cache(self, key_or_value_layer, user_id):
         tensor_copy = ttnn.clone(key_or_value_layer)
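The new causal_mask / is_causal pair follows the convention the commit applies everywhere else: either the kernel builds the causal pattern itself (is_causal=True, no mask), or the caller supplies an explicit mask and turns the flag off. A minimal torch-only sketch of that contract, purely for illustration (it uses torch's SDPA, not the ttnn ops in the diff above):

# Sketch: an explicit lower-triangular mask with is_causal=False matches is_causal=True with no mask.
import torch
import torch.nn.functional as F

q = torch.randn(1, 8, 16, 64)  # (batch, heads, seq, head_dim)
k = torch.randn(1, 8, 16, 64)
v = torch.randn(1, 8, 16, 64)

# Path 1: implicit causal masking inside the kernel
out_causal = F.scaled_dot_product_attention(q, k, v, is_causal=True)

# Path 2: explicit causal mask, causal flag disabled
causal_mask = torch.ones(16, 16, dtype=torch.bool).tril()
out_masked = F.scaled_dot_product_attention(q, k, v, attn_mask=causal_mask, is_causal=False)

assert torch.allclose(out_causal, out_masked, atol=1e-5)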

models/tt_transformers/tt/decoder.py

Lines changed: 4 additions & 0 deletions
@@ -176,6 +176,8 @@ def forward(
         chunk_page_table=None,
         chunk_start_idx=None,
         kv_cache=None,
+        causal_mask=None,
+        is_causal=True,
     ) -> ttnn.Tensor:
         TG = self.args.is_galaxy
         residual = x
@@ -204,6 +206,8 @@ def forward(
             chunk_page_table=chunk_page_table,
             chunk_start_idx=chunk_start_idx,
             kv_cache=kv_cache,
+            causal_mask=causal_mask,
+            is_causal=is_causal,
         )
         if self.pre_ff_norm == None:
             # Here x and attn_out are both fractured across devices

models/tt_transformers/tt/model.py

Lines changed: 80 additions & 5 deletions
@@ -9,7 +9,7 @@
 from models.common.lightweightmodule import LightweightModule
 from models.common.rmsnorm import RMSNorm
 from models.tt_transformers.tt.ccl import TT_CCL
-from models.tt_transformers.tt.common import copy_host_to_device
+from models.tt_transformers.tt.common import copy_host_to_device, create_causal_mask, create_sliding_window_causal_mask
 from models.tt_transformers.tt.decoder import TransformerBlock
 from models.tt_transformers.tt.distributed_norm import DistributedNorm
 from models.tt_transformers.tt.embedding import Embedding, ScaledEmbedding
@@ -32,6 +32,7 @@ def __init__(
         rope_setup_class=None,
     ):
         super().__init__()
+        self.paged_attention_config = paged_attention_config
         self.args = args
         self.vocab_size = args.vocab_size
         assert self.vocab_size > 0
@@ -187,14 +188,38 @@ def prepare_inputs_prefill(self, tokens, start_pos=0, page_table=None, chunk_pag
             )
         else:
             tt_chunk_page_table = None
-
+        if self.args.attention_mask:
+            attn_mask = torch.ones(S + 1).unsqueeze(0)
+            cache_postion = torch.arange(S)
+            attention_mask = [
+                create_sliding_window_causal_mask(
+                    tokens_embd,
+                    attn_mask,
+                    cache_postion,
+                    self.args,
+                    self.paged_attention_config,
+                    device=self.mesh_device,
+                    mode="prefill",
+                ),
+                create_causal_mask(
+                    tokens_embd,
+                    attn_mask,
+                    cache_postion,
+                    self.args,
+                    self.paged_attention_config,
+                    device=self.mesh_device,
+                    mode="prefill",
+                ),
+            ]
+        else:
+            attention_mask = None
         return (
             tokens_embd,
             tt_rot_mats_prefill_global,
             tt_rot_mats_prefill_local,
             tt_page_table,
             tt_chunk_page_table,
-            None,
+            attention_mask,
         )
 
     def prepare_inputs_decode(self, *inputs):
@@ -258,7 +283,41 @@ def prepare_decode_inputs_host(self, tokens, current_pos, page_table=None):
                 mesh_shape=self.args.cluster_shape,
             ),
         )
-        return tokens, current_pos_tt, rope_idxs_global, rope_idxs_local, page_table, None
+        if self.args.attention_mask:
+            batch_size = current_pos.size(0)
+            max_len = current_pos.max().item() + 1  # longest seq length (+1 since pos starts at 0)
+
+            # Initialize with zeros
+            attn_mask = torch.zeros(batch_size, max_len, dtype=torch.long)
+            for i, length in enumerate(current_pos.tolist()):
+                attn_mask[i, : length + 1] = 1
+
+            current_pos = torch.tensor([max_len - 1])
+
+            attention_mask = [
+                create_sliding_window_causal_mask(
+                    tokens,
+                    attn_mask,
+                    current_pos,
+                    self.args,
+                    self.paged_attention_config,
+                    device=self.mesh_device,
+                    mode="decode",
+                ),
+                create_causal_mask(
+                    tokens,
+                    attn_mask,
+                    current_pos,
+                    self.args,
+                    self.paged_attention_config,
+                    device=self.mesh_device,
+                    mode="decode",
+                ),
+            ]
+        else:
+            attention_mask = None
+
+        return tokens, current_pos_tt, rope_idxs_global, rope_idxs_local, page_table, attention_mask
 
     def _transform_decode_inputs_device(self, tokens):
         """
@@ -324,6 +383,7 @@ def ttnn_prefill_forward(
         chunk_start_idx=None,
         get_last_token=-1,
         kv_cache=None,
+        attention_masks=None,
         **kwargs,
     ):
         """
@@ -342,6 +402,7 @@ def ttnn_prefill_forward(
             chunk_start_idx=chunk_start_idx,
             get_last_token=get_last_token,
             kv_cache=kv_cache,
+            attention_masks=attention_masks,
         )
 
     def _increment_decode_positions_device(self, current_pos, rot_mat_idxs_global, rot_mat_idxs_local):
@@ -369,6 +430,7 @@ def ttnn_decode_forward(
         page_table=None,
         kv_cache=None,
         argmax_on_device=False,
+        attention_masks=None,
         **kwargs,
     ):
         """
@@ -388,6 +450,7 @@ def ttnn_decode_forward(
             rot_mats_local=rot_mats_local,
             mode="decode",
             page_table=page_table,
+            attention_masks=attention_masks,
             kv_cache=kv_cache,
         )
 
@@ -439,6 +502,7 @@ def forward(
         chunk_start_idx=None,
         get_last_token=-1,
         kv_cache=None,
+        attention_masks=None,
    ):
        for i, layer in enumerate(self.layers):
            # No-op if callers already provide the right memory config
@@ -449,7 +513,16 @@ def forward(
                 x = ttnn.to_memory_config(x, self.model_config["DECODE_RESIDUAL_MEMCFG"], activation_dtype)
             elif activation_dtype is not None and x.dtype != activation_dtype:
                 x = ttnn.typecast(x, activation_dtype)
-
+            causal_mask = (
+                (
+                    attention_masks[0]
+                    if (hasattr(layer.attention, "is_sliding") and layer.attention.is_sliding)
+                    else attention_masks[1]
+                )
+                if attention_masks is not None
+                else None
+            )
+            is_causal = False if causal_mask is not None else True
             x = layer(
                 x,
                 current_pos,
@@ -461,6 +534,8 @@ def forward(
                 chunk_page_table=chunk_page_table,
                 chunk_start_idx=chunk_start_idx,
                 kv_cache=kv_cache[i] if kv_cache is not None else None,
+                causal_mask=causal_mask,
+                is_causal=is_causal,
             )
 
         if mode == "prefill" and get_last_token == -1:

models/tt_transformers/tt/model_config.py

Lines changed: 2 additions & 0 deletions
@@ -1431,11 +1431,13 @@ def _get_hidden_activation_type(self, config):
 
     def _set_model_specific_params(self):
         # Gemma3 specific params
+        self.attention_mask = False
         is_gemma3 = "gemma-3" in self.base_model_name.lower()
         if is_gemma3:
             self.rms_norm_add_unit_offset = True
             self.embed_scale = self.dim**0.5
             self.sliding_window = 512
+            self.attention_mask = True
 
     def _set_params_from_dict(self, config, is_hf=False):
         eos_token_id = config.get("eos_token_id", None)
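The flag is defined unconditionally before the Gemma-3 check so every model exposes args.attention_mask (the prepare_* paths above test it), and only Gemma-3 switches it on. A small sketch of that default-then-override pattern, with a stand-in class instead of the real ModelArgs:

# Sketch: default-then-override for the new flag; class and model names are illustrative stand-ins.
class FakeModelArgs:
    def __init__(self, base_model_name: str):
        self.base_model_name = base_model_name
        self._set_model_specific_params()

    def _set_model_specific_params(self):
        self.attention_mask = False           # safe default so every model can check the flag
        if "gemma-3" in self.base_model_name.lower():
            self.sliding_window = 512         # sliding-window size used by Gemma-3 layers
            self.attention_mask = True        # opt in to explicit attention masks

print(FakeModelArgs("gemma-3-4b-it").attention_mask)  # True
print(FakeModelArgs("Llama-3.1-8B").attention_mask)   # False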

models/tt_transformers/tt/multimodal/gemma3/gemma_e2e_model.py

Lines changed: 36 additions & 2 deletions
@@ -1,4 +1,7 @@
+import torch
+
 import ttnn
+from models.tt_transformers.tt.common import create_causal_mask, create_sliding_window_causal_mask
 from models.tt_transformers.tt.model import Transformer
 from models.tt_transformers.tt.multimodal.gemma3.gemma_vision_model import TtGemmaTransformerVision
 
@@ -109,8 +112,39 @@ def prepare_inputs_prefill(self, pt_tokens, start_pos=0, page_table=None, chunk_
             )
         else:
             tt_chunk_page_table = None
-
-        return tokens_embd, tt_rot_mats_prefill_global, tt_rot_mats_prefill_local, tt_page_table, tt_chunk_page_table
+        if self.args.attention_mask:
+            attn_mask = torch.ones(S + 1).unsqueeze(0)
+            cache_postion = torch.arange(S)
+            attention_mask = [
+                create_sliding_window_causal_mask(
+                    tokens_embd,
+                    attn_mask,
+                    cache_postion,
+                    self.args,
+                    self.paged_attention_config,
+                    device=self.mesh_device,
+                    mode="prefill",
+                ),
+                create_causal_mask(
+                    tokens_embd,
+                    attn_mask,
+                    cache_postion,
+                    self.args,
+                    self.paged_attention_config,
+                    device=self.mesh_device,
+                    mode="prefill",
+                ),
+            ]
+        else:
+            attention_mask = None
+        return (
+            tokens_embd,
+            tt_rot_mats_prefill_global,
+            tt_rot_mats_prefill_local,
+            tt_page_table,
+            tt_chunk_page_table,
+            attention_mask,
+        )
 
     def compute_vision_token(self, pixel_values=None):
         if pixel_values is None:
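The [sliding_window_mask, global_mask] list built here (and in model.py's prepare paths) is consumed per layer in Transformer.forward: sliding-window layers take the first entry, global layers the second, and is_causal is disabled whenever an explicit mask is supplied. A standalone sketch of that selection, with stand-in objects in place of real layers and ttnn tensors:

# Sketch of the per-layer mask selection; FakeAttention/FakeLayer are illustrative stand-ins.
from dataclasses import dataclass

@dataclass
class FakeAttention:
    is_sliding: bool = False  # real Gemma-3 sliding-window layers set this attribute

@dataclass
class FakeLayer:
    attention: FakeAttention

def select_mask(layer, attention_masks):
    if attention_masks is None:
        return None, True  # no mask supplied -> rely on kernel-side causal masking
    sliding = getattr(layer.attention, "is_sliding", False)
    causal_mask = attention_masks[0] if sliding else attention_masks[1]
    return causal_mask, False  # explicit mask -> is_causal must be False

masks = ["sliding_window_mask", "global_mask"]  # placeholders for the ttnn mask tensors
print(select_mask(FakeLayer(FakeAttention(is_sliding=True)), masks))   # ('sliding_window_mask', False)
print(select_mask(FakeLayer(FakeAttention(is_sliding=False)), masks))  # ('global_mask', False)
print(select_mask(FakeLayer(FakeAttention()), None))                   # (None, True)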
