
Commit 4e6b224

Address comments and refactor comments
1 parent 733e9b3 commit 4e6b224

13 files changed (+53, -99 lines)


models/experimental/mistral_24b/tests/pipeline_tests/test_vision_model.py

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ def test_mistral_vision_model(mesh_device, reset_seeds):
         model_args=model_args,
     )

-    tt_output = vision_model(input_tensor, image_sizes=[(H, W)])  # [0]
+    tt_output = vision_model(input_tensor, image_sizes=[(H, W)])
     tt_output = ttnn.to_torch(tt_output, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=-1))[
         :, : tt_output.shape[-1]
     ]
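For readers unfamiliar with the mesh composer pattern: `ConcatMeshToTensor(mesh_device, dim=-1)` concatenates every device's copy of the output along the last dimension, so when the output is replicated across the mesh, the slice `[:, : tt_output.shape[-1]]` (the shape is read from the pre-conversion ttnn tensor) trims the result back to a single copy. A minimal torch-only sketch of that idea, with made-up shapes:

import torch

# Pretend each of two mesh devices holds an identical [B, D] replica.
num_devices = 2
replica = torch.randn(4, 8)

# Concatenating along the last dim yields [B, num_devices * D] ...
gathered = torch.cat([replica] * num_devices, dim=-1)

# ... and slicing back to the original width recovers one replica.
recovered = gathered[:, : replica.shape[-1]]
assert torch.equal(recovered, replica)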

models/experimental/mistral_24b/tests/test_vision_mlp.py

Lines changed: 0 additions & 1 deletion
@@ -10,7 +10,6 @@

 import ttnn

-# from models.tt_transformers.tt.mlp import MLP
 from models.experimental.mistral_24b.tt.vision_mlp import MistralTTVisionMLP as MLP
 from models.tt_transformers.tt.model_config import ModelArgs
 from models.utility_functions import comp_allclose, comp_pcc, skip_for_grayskull

models/experimental/mistral_24b/tt/pipeline/mistral_vision_tower.py

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@

 """
 This file implements the Vision Tower submodule specific for the Mistral-Small-3.1-24B-Instruct-2503 model.
+This pipeline constructs the vision tower from the vision model architecture.
 """

 import ttnn
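For orientation, a rough structural sketch of what a Pixtral-style vision tower composes; the module names below are illustrative assumptions, not this file's actual API:

# Hypothetical sketch of the stages a vision tower chains together.
class VisionTowerSketch:
    def __init__(self, patch_conv, ln_pre, transformer):
        self.patch_conv = patch_conv    # image -> patch embeddings
        self.ln_pre = ln_pre            # norm before the block stack
        self.transformer = transformer  # stack of pixtral image blocks

    def __call__(self, pixel_values, position_embeddings):
        x = self.patch_conv(pixel_values)
        x = self.ln_pre(x)
        return self.transformer(x, position_embeddings=position_embeddings)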

models/experimental/mistral_24b/tt/rmsnorm.py

Lines changed: 6 additions & 0 deletions
@@ -1,6 +1,12 @@
 # SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

 # SPDX-License-Identifier: Apache-2.0
+
+"""
+This is a modified version of RMSNorm for the Mistral-Small-3.1-24B-Instruct-2503 model.
+We introduced the `simplified_rms_norm` function for compatibility with this model.
+"""
+
 import ttnn
 from models.common.lightweightmodule import LightweightModule
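For reference, the math a simplified RMSNorm computes, as a plain-PyTorch sketch (assuming the standard RMSNorm formula; the ttnn version additionally handles tilization, sharding, and program configs):

import torch

def simplified_rms_norm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # Scale by the reciprocal root-mean-square over the hidden dimension.
    rms = torch.sqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps)
    return x / rms * weight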

models/experimental/mistral_24b/tt/vision_attention.py

Lines changed: 5 additions & 4 deletions
@@ -2,12 +2,13 @@

 # SPDX-License-Identifier: Apache-2.0
 """
-This file implements the vision attention submodule specific for the Mistral-Small-3.1-24B-Instruct-2503 model.
-
+This is a modified version of vision attention for the Mistral-Small-3.1-24B-Instruct-2503 model.
+We introduced the `apply_rotary_pos_emb_vision_tt` function in llama_image_attention for compatibility with this model.
 """
-import torch

+import torch
 import ttnn
+
 from models.common.lightweightmodule import LightweightModule
 from models.utility_functions import is_blackhole, nearest_32

@@ -162,7 +163,7 @@ def pad_head_dim(weight, heads_out=True):
     def forward(self, x_11SH, position_embeddings=None):
         seq_len = x_11SH.shape[-2]

-        MAX_MM_SEQ_LEN = self.configuration.VISION_MAX_MM_SEQ
+        MAX_MM_SEQ_LEN = seq_len

         if seq_len > MAX_MM_SEQ_LEN:
             x_11SH = ttnn.reshape(x_11SH, [1, seq_len // MAX_MM_SEQ_LEN, MAX_MM_SEQ_LEN, -1])
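Note that with `MAX_MM_SEQ_LEN = seq_len`, the `seq_len > MAX_MM_SEQ_LEN` branch can no longer trigger, so the reshape is effectively disabled. As for `apply_rotary_pos_emb_vision_tt`, here is a plain-PyTorch sketch of the standard rotary formula such a helper presumably mirrors (an assumption, not the ttnn code):

import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Swap and negate the two halves of the head dimension.
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb_vision_ref(q, k, cos, sin):
    # Standard RoPE: mix the original and half-rotated tensors elementwise.
    return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin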

models/experimental/mistral_24b/tt/vision_conv2d.py

Lines changed: 7 additions & 2 deletions
@@ -2,9 +2,14 @@

 # SPDX-License-Identifier: Apache-2.0

-import torch
+"""
+This is a modified version of the vision patch conv2d for the Mistral-Small-3.1-24B-Instruct-2503 model.
+We modified the llama_patch_conv2d to be compatible with this model.
+"""

+import torch
 import ttnn
+
 from models.common.lightweightmodule import LightweightModule

@@ -57,7 +62,7 @@ def __init__(

         self._unfold = torch.nn.Unfold(kernel_size=self.kernel_size, stride=self.stride)

-        weight = state_dict[f"{state_dict_prefix}weight"]
+        weight = state_dict[f"{state_dict_prefix}_linear.weight"]
         if weight.ndim == 4:
             weight = weight.reshape(out_channels, -1).T
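The `weight.reshape(out_channels, -1).T` context line points at the unfold-as-matmul trick this module is built on: with stride equal to kernel size, `Unfold` followed by a matmul is equivalent to a strided `Conv2d`. A plain-PyTorch sketch:

import torch

def patch_conv_ref(pixels: torch.Tensor, weight: torch.Tensor, patch: int) -> torch.Tensor:
    # pixels: [B, C, H, W]; weight: [out_ch, C, patch, patch] (a conv kernel)
    unfold = torch.nn.Unfold(kernel_size=patch, stride=patch)
    cols = unfold(pixels)                    # [B, C * patch * patch, n_patches]
    w = weight.reshape(weight.shape[0], -1)  # [out_ch, C * patch * patch]
    return w @ cols                          # [B, out_ch, n_patches]

For bias-free kernels this matches torch.nn.functional.conv2d(pixels, weight, stride=patch).flatten(2).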

models/experimental/mistral_24b/tt/vision_mlp.py

Lines changed: 4 additions & 3 deletions
@@ -1,13 +1,15 @@
 # SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.

 # SPDX-License-Identifier: Apache-2.0
+
 """
+This is a modified version of the FeedForward for the Mistral-Small-3.1-24B-Instruct-2503 model.
 This file implements the Vision FeedForward submodule specific for the Mistral-Small-3.1-24B-Instruct-2503 model.
-
 """
-import torch

+import torch
 import ttnn
+
 from models.common.lightweightmodule import LightweightModule

@@ -48,7 +50,6 @@ def as_tensor(name, dtype, is_bias=False):
             mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device),
             layout=ttnn.TILE_LAYOUT,
             memory_config=ttnn.DRAM_MEMORY_CONFIG,
-            # cache_file_name=cache_name(name),
         )

         # Weights and Biases
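For reference, a Mistral-style FeedForward is a gated (SwiGLU-style) MLP; a plain-PyTorch sketch of the math, where the `w1`/`w3`/`w2` gate/up/down naming is an assumption borrowed from the usual convention:

import torch
import torch.nn.functional as F

def feed_forward_ref(x, w1, w2, w3):
    # gate: w1, up: w3, down: w2 -- silu-gated product, then project down.
    return (F.silu(x @ w1.T) * (x @ w3.T)) @ w2.T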

models/experimental/mistral_24b/tt/vision_mmp.py

Lines changed: 4 additions & 5 deletions
@@ -1,17 +1,16 @@
 # SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
 # SPDX-License-Identifier: Apache-2.0

+"""
+This file implements the Vision MultiModalProjector submodule specific for the Mistral-Small-3.1-24B-Instruct-2503 model.
+"""

 import torch
 from models.common.lightweightmodule import LightweightModule
 from models.experimental.mistral_24b.tt.rmsnorm import RMSNorm
 import ttnn
 from ttnn import ConcatMeshToTensor

-"""
-This file implements the Vision pixtral image submodule specific for the Mistral-Small-3.1-24B-Instruct-2503 model.
-"""
-

 class TTMistral3PatchMerger(LightweightModule):
     def __init__(

@@ -26,7 +25,7 @@ def __init__(
         super().__init__()
         self.device = mesh_device
         hidden_size = args.vision_dim
-        self.spatial_merge_size = 2  # TODO Handle in Model_config spatial_merge_size
+        self.spatial_merge_size = 2
         self.patch_size = args.vision_patch_size
         self.args = args
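For intuition on `spatial_merge_size = 2`: the patch merger folds each 2x2 window of patch embeddings into one token with 4x the channels before the merging projection, so the patch count drops by a factor of four. A plain-PyTorch sketch of that regrouping (the actual module may order channels differently):

import torch

def spatial_merge_ref(patches: torch.Tensor, grid_h: int, grid_w: int, merge: int = 2) -> torch.Tensor:
    # patches: [grid_h * grid_w, hidden] -> [(grid_h * grid_w) // merge**2, merge**2 * hidden]
    hidden = patches.shape[-1]
    x = patches.reshape(grid_h, grid_w, hidden)
    x = x.reshape(grid_h // merge, merge, grid_w // merge, merge, hidden)
    x = x.permute(0, 2, 1, 3, 4)  # group each merge x merge window together
    return x.reshape(-1, merge * merge * hidden)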

models/experimental/mistral_24b/tt/vision_pixtral_image_block.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 from models.experimental.mistral_24b.tt.vision_mlp import MistralTTVisionMLP as MLP

 """
-This file implements the Pixtral_image_block submodule specific for the Mistral-Small-3.1-24B-Instruct-2503 model.
+This file implements the pixtral image block specific for the Mistral-Small-3.1-24B-Instruct-2503 model.
 """
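For orientation, a pixtral image block follows the usual pre-norm residual layout; a compact sketch with assumed module names, not this file's exact signatures:

def image_block_ref(x, attention_norm, attention, ffn_norm, mlp, position_embeddings=None):
    # Pre-norm residual: norm -> attention -> add, then norm -> MLP -> add.
    x = x + attention(attention_norm(x), position_embeddings=position_embeddings)
    return x + mlp(ffn_norm(x))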

models/experimental/mistral_24b/tt/vision_pixtral_transformer.py

Lines changed: 5 additions & 0 deletions
@@ -2,6 +2,11 @@

 # SPDX-License-Identifier: Apache-2.0

+"""
+This file implements the Vision Transformer submodule specific for the Mistral-Small-3.1-24B-Instruct-2503 model.
+This pipeline iterates over the pixtral image blocks to generate the image embeddings.
+"""
+
 from tqdm import tqdm

 from models.common.lightweightmodule import LightweightModule
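A minimal sketch of the loop such a pipeline runs, assuming block signatures like those above; `tqdm` just wraps the iteration with a progress bar:

from tqdm import tqdm

def transformer_forward_ref(blocks, x, position_embeddings=None):
    # Apply each pixtral image block in sequence to build the image embeddings.
    for block in tqdm(blocks, desc="pixtral blocks"):
        x = block(x, position_embeddings=position_embeddings)
    return x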
