
Commit d40f3f7

Add support for sliding window mask in TT-Transformers
1 parent e888971 commit d40f3f7

File tree

4 files changed: +8 -146 lines changed


models/tt_transformers/tt/attention.py

Lines changed: 4 additions & 13 deletions
@@ -30,6 +30,8 @@ def __init__(
         super().__init__()
 
         self.mesh_device = mesh_device
+        self.layer_idx = layer_num
+        self.configuration = configuration
         self.tt_ccl = tt_ccl
         self.num_devices = configuration.num_devices
         self.TG = self.num_devices == 32
@@ -388,7 +390,6 @@ def forward_decode(
         rot_mats=None,
         page_table=None,
         kv_cache=None,
-        attn_mask=None,
     ) -> ttnn.Tensor:
         """
         x: (seq_len, 1, batch, dim)
@@ -513,12 +514,11 @@ def forward_decode(
                 values,
                 cur_pos_tensor=current_pos,
                 page_table_tensor=page_table,
-                attn_mask=attn_mask,
-                is_causal=True if attn_mask is None else False,
                 scale=self.scale,
                 program_config=self.model_config["SDPA_DECODE_PROGCFG"],
                 compute_kernel_config=self.sdpa_decode_compute_kernel_cfg,
                 memory_config=ttnn.DRAM_MEMORY_CONFIG,
+                sliding_window=self.configuration.sliding_window if self.is_sliding else 0,
             )
         else:
             attn_output_1G4D = ttnn.transformer.scaled_dot_product_attention_decode(
@@ -527,11 +527,10 @@
                 values,
                 cur_pos_tensor=current_pos,
                 scale=self.scale,
-                is_causal=True if attn_mask is None else False,
-                attn_mask=attn_mask,
                 program_config=self.model_config["SDPA_DECODE_PROGCFG"],
                 compute_kernel_config=self.sdpa_decode_compute_kernel_cfg,
                 memory_config=ttnn.DRAM_MEMORY_CONFIG,  # FIXME: why not L1 height sharded e.g. SCORES_BATCHED_MM_OUTPUT_MEMCFG?
+                sliding_window=self.configuration.sliding_window if self.is_sliding else 0,
             )
 
         ttnn.deallocate(q_heads_1BQD)
@@ -676,7 +675,6 @@ def forward_prefill(
         chunk_page_table=None,
         chunk_start_idx=None,
         kv_cache=None,
-        attn_mask=None,
     ):
         seq_len = x_11SH.shape[-2]
         assert seq_len % 128 == 0 and seq_len > 0, "Seqlen must be divisible by 128"
@@ -831,8 +829,6 @@
                 values_BKSD,
                 page_table,
                 chunk_start_idx,
-                attn_mask=attn_mask,
-                is_causal=True if attn_mask is None else False,
                 compute_kernel_config=self.sdpa_prefill_compute_kernel_cfg,
                 program_config=self.model_config["SDPA_PROGCFG"](seq_len),
             )
@@ -844,8 +840,6 @@
                 scale=self.scale,
                 compute_kernel_config=self.sdpa_prefill_compute_kernel_cfg,
                 program_config=self.model_config["SDPA_PROGCFG"](seq_len),
-                attn_mask=attn_mask,
-                is_causal=True if attn_mask is None else False,
             )
 
             # deallocate keys and values
@@ -924,7 +918,6 @@ def forward(
         chunk_page_table=None,
         chunk_start_idx=None,
         kv_cache=None,
-        attn_mask=None,
     ):
         if mode == "prefill":
             return self.forward_prefill(
@@ -935,7 +928,6 @@
                 chunk_page_table=chunk_page_table,
                 chunk_start_idx=chunk_start_idx,
                 kv_cache=kv_cache,
-                attn_mask=attn_mask,
             )
         else:
             return self.forward_decode(
@@ -944,7 +936,6 @@
                 rot_mats,
                 page_table=page_table,
                 kv_cache=kv_cache,
-                attn_mask=attn_mask,
             )
 
     def prefill_prepare_tensor_for_kv_cache(self, key_or_value_layer, user_id):
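
Note: as a reference for what the new sliding_window argument is expected to apply inside the kernel, here is a minimal pure-PyTorch sketch of sliding-window causal attention. The band mask mirrors the triu/tril construction deleted from ttnn_prefill_forward in model.py below; this is an illustration of the intended semantics, not the ttnn kernel implementation, and a window of 0 is treated as plain causal attention.

import torch
import torch.nn.functional as F


def sliding_window_causal_mask(seq_len: int, window: int) -> torch.Tensor:
    # Additive mask: -inf above the diagonal (no future tokens) and -inf for keys
    # more than `window - 1` positions behind the query, so each query attends to
    # at most `window` keys ending at its own position.
    mask = torch.triu(torch.full((seq_len, seq_len), float("-inf")), diagonal=1)
    mask += torch.tril(torch.full((seq_len, seq_len), float("-inf")), diagonal=-window)
    return mask


def reference_sdpa(q, k, v, window: int) -> torch.Tensor:
    # window == 0 falls back to ordinary causal attention, matching the
    # `sliding_window=... if self.is_sliding else 0` call sites above.
    if window > 0:
        return F.scaled_dot_product_attention(q, k, v, attn_mask=sliding_window_causal_mask(q.shape[-2], window))
    return F.scaled_dot_product_attention(q, k, v, is_causal=True)


q = k = v = torch.randn(1, 8, 32, 64)  # (batch, n_heads, seq_len, head_dim)
print(reference_sdpa(q, k, v, window=8).shape)  # torch.Size([1, 8, 32, 64])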

models/tt_transformers/tt/decoder.py

Lines changed: 0 additions & 2 deletions
@@ -202,7 +202,6 @@ def forward(
         chunk_page_table=None,
         chunk_start_idx=None,
         kv_cache=None,
-        attn_mask=None,
     ) -> ttnn.Tensor:
         TG = self.args.is_galaxy
         residual = x
@@ -231,7 +230,6 @@
             chunk_page_table=chunk_page_table,
             chunk_start_idx=chunk_start_idx,
             kv_cache=kv_cache,
-            attn_mask=attn_mask,
         )
         if self.pre_ff_norm == None:
             # Here x and attn_out are both fractured across devices

models/tt_transformers/tt/model.py

Lines changed: 3 additions & 130 deletions
@@ -9,7 +9,7 @@
 from models.common.lightweightmodule import LightweightModule
 from models.common.rmsnorm import RMSNorm
 from models.tt_transformers.tt.ccl import TT_CCL
-from models.tt_transformers.tt.common import copy_host_to_device, get_decode_mask
+from models.tt_transformers.tt.common import copy_host_to_device
 from models.tt_transformers.tt.decoder import TransformerBlock
 from models.tt_transformers.tt.distributed_norm import DistributedNorm
 from models.tt_transformers.tt.embedding import Embedding, ScaledEmbedding
@@ -30,10 +30,8 @@ def __init__(
         use_paged_kv_cache=False,
         attention_class=None,
         rope_setup_class=None,
-        attn_mask=None,
     ):
         super().__init__()
-        self.paged_attention_config = paged_attention_config
         self.args = args
         self.vocab_size = args.vocab_size
         assert self.vocab_size > 0
@@ -130,31 +128,6 @@ def __init__(
             max_columns_per_device=self.args.max_columns_per_device_lm_head,
         )
 
-        if hasattr(self.args, "sliding_window") and self.args.sliding_window is not None:
-            # We are using sliding window attention in this model. We can create a custom attention mask to apply the sliding attention
-            # First we create the mask for all decode positions on host [bsz, n_heads_per_device, seq_len, seq_len]
-            self.decode_sliding_mask_mat = get_decode_mask(
-                self.args,
-                self.mesh_device,
-                paged_attention_config=paged_attention_config,
-            )
-            # Then we copy a slice for a single decode position for each user on to device [bsz, n_heads_per_device, 1, seq_len]
-            # We can update this tensor on host each iteration and copy to device to save storing the large square tensor on device
-            self.device_decode_sliding_mask = ttnn.as_tensor(
-                torch.concat(
-                    [self.decode_sliding_mask_mat[i, :, 0:1, :].unsqueeze(0) for i in range(self.args.max_batch_size)],
-                    axis=0,
-                ).transpose(1, 2),
-                dtype=ttnn.bfloat4_b,
-                layout=ttnn.TILE_LAYOUT,
-                device=self.mesh_device,
-                memory_config=ttnn.DRAM_MEMORY_CONFIG,
-                mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device),
-            )
-        else:
-            self.decode_sliding_mask_mat = None
-            self.device_decode_sliding_mask = None
-
     def prepare_inputs_prefill(self, tokens, start_pos=0, page_table=None, chunk_page_table=None):
         """
         Inputs are torch tensors or python types. This function returns ttnn
@@ -214,38 +187,13 @@ def prepare_inputs_prefill(self, tokens, start_pos=0, page_table=None, chunk_pag
             )
         else:
             tt_chunk_page_table = None
-        if self.args.attention_mask:
-            attn_mask = torch.ones(S + 1).unsqueeze(0)
-            cache_postion = torch.arange(S)
-            attention_mask = [
-                create_sliding_window_causal_mask(
-                    tokens_embd,
-                    attn_mask,
-                    cache_postion,
-                    self.args,
-                    self.paged_attention_config,
-                    device=self.mesh_device,
-                    mode="prefill",
-                ),
-                create_causal_mask(
-                    tokens_embd,
-                    attn_mask,
-                    cache_postion,
-                    self.args,
-                    self.paged_attention_config,
-                    device=self.mesh_device,
-                    mode="prefill",
-                ),
-            ]
-        else:
-            attention_mask = None
+
         return (
             tokens_embd,
             tt_rot_mats_prefill_global,
             tt_rot_mats_prefill_local,
             tt_page_table,
             tt_chunk_page_table,
-            attention_mask,
         )
 
     def prepare_inputs_decode(self, *inputs):
@@ -309,41 +257,7 @@ def prepare_decode_inputs_host(self, tokens, current_pos, page_table=None):
                 mesh_shape=self.args.cluster_shape,
             ),
         )
-        if self.args.attention_mask:
-            batch_size = current_pos.size(0)
-            max_len = current_pos.max().item() + 1  # longest seq length (+1 since pos starts at 0)
-
-            # Initialize with zeros
-            attn_mask = torch.zeros(batch_size, max_len, dtype=torch.long)
-            for i, length in enumerate(current_pos.tolist()):
-                attn_mask[i, : length + 1] = 1
-
-            current_pos = torch.tensor([max_len - 1])
-
-            attention_mask = [
-                create_sliding_window_causal_mask(
-                    tokens,
-                    attn_mask,
-                    current_pos,
-                    self.args,
-                    self.paged_attention_config,
-                    device=self.mesh_device,
-                    mode="decode",
-                ),
-                create_causal_mask(
-                    tokens,
-                    attn_mask,
-                    current_pos,
-                    self.args,
-                    self.paged_attention_config,
-                    device=self.mesh_device,
-                    mode="decode",
-                ),
-            ]
-        else:
-            attention_mask = None
-
-        return tokens, current_pos_tt, rope_idxs_global, rope_idxs_local, page_table, attention_mask
+        return tokens, current_pos_tt, rope_idxs_global, rope_idxs_local, page_table
 
     def _transform_decode_inputs_device(self, tokens):
         """
@@ -414,17 +328,6 @@ def ttnn_prefill_forward(
         This method will take device tensors and any other args to run forward.
         It returns ttnn device tensors.
         """
-        if hasattr(self.args, "sliding_window") and self.args.sliding_window is not None:
-            mask = torch.triu(torch.full((1, 1, x.shape[-2], x.shape[-2]), -float("inf")), diagonal=1)
-            sliding_mask = mask + torch.tril(
-                torch.full((1, 1, x.shape[-2], x.shape[-2]), -float("inf")),
-                diagonal=-self.args.sliding_window,
-            )
-            sliding_attn_mask = ttnn.from_torch(
-                sliding_mask, device=self.mesh_device, layout=ttnn.TILE_LAYOUT, dtype=ttnn.bfloat16
-            )
-        else:
-            sliding_attn_mask = None
         return self.forward(
             x,
             current_pos=None,
@@ -437,7 +340,6 @@
             chunk_start_idx=chunk_start_idx,
            get_last_token=get_last_token,
            kv_cache=kv_cache,
-            sliding_attn_mask=sliding_attn_mask,
         )
 
     def _increment_decode_positions_device(self, current_pos, rot_mat_idxs_global, rot_mat_idxs_local):
@@ -456,24 +358,6 @@ def _increment_decode_positions_device(self, current_pos, rot_mat_idxs_global, r
         if rot_mat_idxs_local is not None:
             ttnn.plus_one(rot_mat_idxs_local)
 
-    def update_attention_masks(self, current_pos):
-        torch_mask = torch.concat(
-            [
-                self.decode_sliding_mask_mat[i, :, current_pos[i].item() : current_pos[i].item() + 1, :].unsqueeze(0)
-                for i in range(self.decode_sliding_mask_mat.shape[0])
-            ],
-            axis=0,
-        ).transpose(1, 2)
-        sliding_window_causal_mask = ttnn.as_tensor(
-            torch_mask,
-            dtype=ttnn.bfloat4_b,
-            layout=ttnn.TILE_LAYOUT,
-            device=None,
-            memory_config=ttnn.DRAM_MEMORY_CONFIG,
-            mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device),
-        )
-        ttnn.copy_host_to_device_tensor(sliding_window_causal_mask, self.device_decode_sliding_mask)
-
     def ttnn_decode_forward(
         self,
         x,
@@ -502,7 +386,6 @@ def ttnn_decode_forward(
             mode="decode",
            page_table=page_table,
            kv_cache=kv_cache,
-            sliding_attn_mask=self.device_decode_sliding_mask,
         )
 
         # Gather the output across all devices and untilize the tensor (for argmax)
@@ -553,7 +436,6 @@ def forward(
         chunk_start_idx=None,
         get_last_token=-1,
         kv_cache=None,
-        sliding_attn_mask=None,
     ):
         for i, layer in enumerate(self.layers):
             # No-op if callers already provide the right memory config
@@ -565,14 +447,6 @@
             elif activation_dtype is not None and x.dtype != activation_dtype:
                 x = ttnn.typecast(x, activation_dtype)
 
-            if sliding_attn_mask is not None:
-                attn_mask_i = (
-                    sliding_attn_mask
-                    if (hasattr(layer.attention, "is_sliding") and layer.attention.is_sliding)
-                    else None
-                )
-            else:
-                attn_mask_i = None
             x = layer(
                 x,
                 current_pos,
@@ -584,7 +458,6 @@
                 chunk_page_table=chunk_page_table,
                 chunk_start_idx=chunk_start_idx,
                 kv_cache=kv_cache[i] if kv_cache is not None else None,
-                attn_mask=attn_mask_i,
             )
 
         if mode == "prefill" and get_last_token == -1:
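
For context on what the deleted decode path was doing, below is a rough host-side sketch of the per-iteration mask maintenance it performed. get_decode_mask lives in tt/common.py and is not part of this diff, so the square mask here is reconstructed from the triu/tril logic removed from ttnn_prefill_forward; shapes follow the deleted comments. With the window now applied inside the SDPA decode kernel, this per-step slicing and host-to-device copy, along with the resident [bsz, n_heads, seq_len, seq_len] mask, is no longer needed.

import torch


def build_decode_sliding_mask(bsz: int, n_heads: int, seq_len: int, window: int) -> torch.Tensor:
    # Host-resident square mask, [bsz, n_heads, seq_len, seq_len]
    # (stand-in for get_decode_mask, reconstructed from the deleted prefill mask logic).
    mask = torch.triu(torch.full((seq_len, seq_len), float("-inf")), diagonal=1)
    mask += torch.tril(torch.full((seq_len, seq_len), float("-inf")), diagonal=-window)
    return mask.expand(bsz, n_heads, seq_len, seq_len).clone()


def decode_step_mask(square_mask: torch.Tensor, current_pos: torch.Tensor) -> torch.Tensor:
    # One row per user at its current position, [bsz, 1, n_heads, seq_len],
    # mirroring the slicing and transpose in the deleted update_attention_masks.
    rows = [
        square_mask[i, :, current_pos[i].item() : current_pos[i].item() + 1, :].unsqueeze(0)
        for i in range(square_mask.shape[0])
    ]
    return torch.cat(rows, dim=0).transpose(1, 2)


square = build_decode_sliding_mask(bsz=2, n_heads=8, seq_len=128, window=32)
print(decode_step_mask(square, torch.tensor([5, 70])).shape)  # torch.Size([2, 1, 8, 128])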

models/tt_transformers/tt/model_config.py

Lines changed: 1 addition & 1 deletion
@@ -2410,7 +2410,7 @@ def create_tokenizer(self):
 
         # Add meta-compatible stop token list to the HF tokenizer
         if not "stop_tokens" in tokenizer.__dict__:
-            tokenizer.stop_tokens = [tokenizer.eos_token_id]
+            tokenizer.stop_tokens = self.eos_token_id if self.eos_token_id is not None else [tokenizer.eos_token_id]
         # Phi-3-mini uses "<|end|>" as EOS token
         if "phi-3-mini" in self.base_model_name.lower():
             tokenizer.stop_tokens.append(tokenizer.encode("<|end|>")[0])
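
The one-line change above prefers a stop-token list supplied by the model configuration over the tokenizer's single EOS id. A small standalone sketch of that fallback, assuming (as the later stop_tokens.append call implies) that self.eos_token_id already holds a list when it is set:

def resolve_stop_tokens(config_eos_token_id, tokenizer_eos_token_id):
    # Prefer the model config's stop-token list; otherwise fall back to the
    # tokenizer's single EOS id wrapped in a list.
    if config_eos_token_id is not None:
        return config_eos_token_id
    return [tokenizer_eos_token_id]


print(resolve_stop_tokens(None, 2))        # [2]
print(resolve_stop_tokens([106, 107], 2))  # [106, 107]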
