Commit 25e8431

Rebase Experimental E2E and refactor to include tt_ccl

1 parent: 5a92a0d
File tree

14 files changed: +95 −12 lines

models/experimental/mistral_24b/tests/pipeline_tests/test_end2end.py

Lines changed: 16 additions & 4 deletions
@@ -6,6 +6,7 @@
 import os
 import ttnn
 
+from models.tt_transformers.tt.ccl import TT_CCL
 from models.tt_transformers.tt.common import (
     sample_host,
     PagedAttentionConfig,
@@ -117,8 +118,14 @@ def setup_vision_prompts_and_tokenizer(model_args, instruct):
         {
             "role": "user",
             "content": [
-                {"type": "image", "image": "https://www.theeducationmagazine.com/wp-content/uploads/2020/03/18.jpg"},
-                {"type": "text", "text": "Tell me who you see in the image and describe the image ?"},
+                {
+                    "type": "image",
+                    "image": "https://img.freepik.com/premium-photo/girl-hugging-dog-with-girl-hugging-her_737761-2565.jpg",
+                },
+                {
+                    "type": "text",
+                    "text": "Is there a cat in this image? If not, what animal do you see in the image? Describe the image in detail in 600 words.",
+                },
             ],
         }
     ]
@@ -182,9 +189,11 @@ def load_separate_models_like_test_end2end(model_args, mesh_device, dtype, paged
         max_num_blocks=page_params["page_max_num_blocks"],
     )
 
+    tt_ccl = TT_CCL(mesh_device)
     # Load vision model (exactly like test_end2end.py)
     vision_model = TtMistralVisionTransformer(
         mesh_device=mesh_device,
+        tt_ccl=tt_ccl,
         state_dict=state_dict,
         state_dict_prefix=vision_prefix,
         dtype=dtype,
@@ -418,6 +427,11 @@ def validate_e2e_outputs(results, expected_min_tokens=1):
     ],
     ids=["accuracy"],
 )
+@pytest.mark.parametrize(
+    "device_params",
+    [{"fabric_config": ttnn.FabricConfig.FABRIC_1D, "trace_region_size": 30000000, "num_command_queues": 1}],
+    indirect=True,
+)
 @pytest.mark.parametrize(
     "mesh_device",
     [
@@ -427,8 +441,6 @@ def validate_e2e_outputs(results, expected_min_tokens=1):
     ],
     indirect=True,
 )
-# @pytest.mark.parametrize("device_params", [{"l1_small_size": 1584864, "trace_region_size": 0}], indirect=True)
-@pytest.mark.parametrize("device_params", [{"l1_small_size": 10 * 1024}], indirect=True)
 def test_e2e_vision_text_pipeline(
     weights,
     layers,
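
Taken together, these hunks establish the pattern the rest of this commit repeats: construct one TT_CCL per mesh and thread it into the vision stack. A minimal sketch assembled from the lines above (the surrounding test provides mesh_device, state_dict, vision_prefix, dtype, and model_args):

from models.tt_transformers.tt.ccl import TT_CCL
from models.experimental.mistral_24b.tt.pipeline.vision_model import TtMistralVisionTransformer

# One CCL helper per mesh; every module that issues collective ops receives it.
tt_ccl = TT_CCL(mesh_device)
vision_model = TtMistralVisionTransformer(
    mesh_device=mesh_device,
    tt_ccl=tt_ccl,
    state_dict=state_dict,
    state_dict_prefix=vision_prefix,
    dtype=dtype,
    model_args=model_args,
)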

models/experimental/mistral_24b/tests/pipeline_tests/test_vision_model.py

Lines changed: 8 additions & 0 deletions
@@ -7,6 +7,7 @@
 from loguru import logger
 
 import ttnn
+from models.tt_transformers.tt.ccl import TT_CCL
 from models.tt_transformers.tt.model_config import ModelArgs
 from models.experimental.mistral_24b.tt.pipeline.vision_model import TtMistralVisionTransformer
 from models.utility_functions import comp_allclose, comp_pcc, skip_for_grayskull
@@ -31,6 +32,11 @@ def get_image_features(vision_tower, projector, input_tensor, image_sizes):
     ],
     indirect=True,
 )
+@pytest.mark.parametrize(
+    "device_params",
+    [{"fabric_config": ttnn.FabricConfig.FABRIC_1D, "trace_region_size": 30000000, "num_command_queues": 1}],
+    indirect=True,
+)
 def test_mistral_vision_model(mesh_device, reset_seeds):
     pcc_required = 0.97
     dtype = ttnn.bfloat8_b
@@ -62,8 +68,10 @@ def test_mistral_vision_model(mesh_device, reset_seeds):
     reference_output = get_image_features(reference_model, reference_mmp, input_tensor, image_sizes=[(H, W)])
 
     # ##### TT Model: TtMistralVisionTransformer #####
+    tt_ccl = TT_CCL(mesh_device=mesh_device)
     vision_model = TtMistralVisionTransformer(
         mesh_device=mesh_device,
+        tt_ccl=tt_ccl,
         state_dict=state_dict,
         state_dict_prefix=first_layer_prefix,
         dtype=dtype,

models/experimental/mistral_24b/tests/pipeline_tests/test_vision_tower.py

Lines changed: 8 additions & 0 deletions
@@ -7,6 +7,7 @@
 from loguru import logger
 
 import ttnn
+from models.tt_transformers.tt.ccl import TT_CCL
 from models.tt_transformers.tt.model_config import ModelArgs
 from models.experimental.mistral_24b.tt.pipeline.mistral_vision_tower import MistralVisionTower
 from models.utility_functions import comp_allclose, comp_pcc, skip_for_grayskull
@@ -22,6 +23,11 @@
     ],
     indirect=True,
 )
+@pytest.mark.parametrize(
+    "device_params",
+    [{"fabric_config": ttnn.FabricConfig.FABRIC_1D, "trace_region_size": 30000000, "num_command_queues": 1}],
+    indirect=True,
+)
 def test_mistral_vision_tower(mesh_device, reset_seeds):
     pcc_required = 0.99
     dtype = ttnn.bfloat16
@@ -43,9 +49,11 @@ def test_mistral_vision_tower(mesh_device, reset_seeds):
     reference_output = reference_model(input_tensor, image_sizes=[(H, W)])
 
     reference_output = reference_output.last_hidden_state
+    tt_ccl = TT_CCL(mesh_device)
     ##### TT Model: MistralVisionTower #####
     vision_model = MistralVisionTower(
         mesh_device=mesh_device,
+        tt_ccl=tt_ccl,
         state_dict=state_dict,
         state_dict_prefix=first_layer_prefix,
         dtype=dtype,

models/experimental/mistral_24b/tests/test_pixtral_transformer.py

Lines changed: 8 additions & 0 deletions
@@ -8,6 +8,7 @@
 from loguru import logger
 
 import ttnn
+from models.tt_transformers.tt.ccl import TT_CCL
 from models.tt_transformers.tt.model_config import ModelArgs
 
 from models.experimental.mistral_24b.tt.vision_pixtral_transformer import TtPixtralTransformer
@@ -28,6 +29,11 @@
     ],
     indirect=True,
 )
+@pytest.mark.parametrize(
+    "device_params",
+    [{"fabric_config": ttnn.FabricConfig.FABRIC_1D, "trace_region_size": 30000000, "num_command_queues": 1}],
+    indirect=True,
+)
 def test_image_transformer_inference(batch, num_chunks, mesh_device):
     pcc_required = 0.99
 
@@ -52,8 +58,10 @@ def test_image_transformer_inference(batch, num_chunks, mesh_device):
 
     all_tests_pass = True
 
+    tt_ccl = TT_CCL(mesh_device)
     tt_model = TtPixtralTransformer(
         mesh_device,
+        tt_ccl,
         state_dict,
         state_dict_prefix=first_layer_prefix,
         weight_cache_path=None,

models/experimental/mistral_24b/tests/test_vision_attention.py

Lines changed: 8 additions & 0 deletions
@@ -9,6 +9,7 @@
 from loguru import logger
 
 import ttnn
+from models.tt_transformers.tt.ccl import TT_CCL
 from models.tt_transformers.tt.model_config import ModelArgs
 from models.utility_functions import comp_allclose, comp_pcc, skip_for_grayskull
 
@@ -36,6 +37,11 @@
     "batch_size",
     (1,),
 )
+@pytest.mark.parametrize(
+    "device_params",
+    [{"fabric_config": ttnn.FabricConfig.FABRIC_1D, "trace_region_size": 30000000, "num_command_queues": 1}],
+    indirect=True,
+)
 def test_vision_attention(mesh_device, seq_len, batch_size):
     logger.info(f"seq_len: {seq_len}, batch_size: {batch_size}")
     dtype = ttnn.bfloat16
@@ -56,8 +62,10 @@ def test_vision_attention(mesh_device, seq_len, batch_size):
     n_heads = model_args.vision_attn_n_heads
     head_dim = hidden_size // n_heads
 
+    tt_ccl = TT_CCL(mesh_device)
    tt_model = TtLlamaImageAttention(
         mesh_device,
+        tt_ccl,
         state_dict,
         state_dict_prefix=first_layer_prefix,
         weight_cache_path=model_args.weight_cache_path(dtype),

models/experimental/mistral_24b/tt/model.py

Lines changed: 9 additions & 1 deletion
@@ -100,6 +100,14 @@ def prepare_inputs_prefill(self, tokens, start_pos=0, page_table=None, chunk_pag
             self.rope_setup.sin_matrix[:, :, start_pos : start_pos + S, :],
         ]
 
+        if hasattr(self, "rope_local_setup"):
+            tt_rot_mats_prefill_local = [
+                self.rope_local_setup.cos_matrix[:, :, start_pos : start_pos + S, :],
+                self.rope_local_setup.sin_matrix[:, :, start_pos : start_pos + S, :],
+            ]
+        else:
+            tt_rot_mats_prefill_local = None
+
         if page_table is not None:
             tt_page_table = ttnn.from_torch(
                 page_table,
@@ -122,4 +130,4 @@ def prepare_inputs_prefill(self, tokens, start_pos=0, page_table=None, chunk_pag
         else:
             tt_chunk_page_table = None
 
-        return tokens_embd, tt_rot_mats_prefill_global, tt_page_table, tt_chunk_page_table
+        return tokens_embd, tt_rot_mats_prefill_global, tt_rot_mats_prefill_local, tt_page_table, tt_chunk_page_table
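
Because the return value grows from four items to five, any caller of prepare_inputs_prefill has to unpack the extra slot. A minimal sketch of a caller-side update (variable names are illustrative, not from this diff):

# Hypothetical call site: the third element is the local rope matrices,
# which is None whenever the model has no rope_local_setup attribute.
(
    tokens_embd,
    rot_mats_global,
    rot_mats_local,
    tt_page_table,
    tt_chunk_page_table,
) = model.prepare_inputs_prefill(tokens, start_pos=0, page_table=page_table)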

models/experimental/mistral_24b/tt/pipeline/mistral_vision_tower.py

Lines changed: 3 additions & 0 deletions
@@ -18,6 +18,7 @@ class MistralVisionTower(LightweightModule):
     def __init__(
         self,
         mesh_device,
+        tt_ccl,
         state_dict,
         state_dict_prefix,
         dtype,
@@ -28,6 +29,7 @@ def __init__(
 
         self.state_dict = state_dict
         self.mesh_device = mesh_device
+        self.tt_ccl = tt_ccl
         self.dtype = dtype
         self.config = configuration
 
@@ -98,6 +100,7 @@ def __init__(
 
         self.transformer = TtPixtralTransformer(
             mesh_device=self.mesh_device,
+            tt_ccl=tt_ccl,
             state_dict=self.state_dict,
             state_dict_prefix=f"{state_dict_prefix}transformer.",
             weight_cache_path=configuration.weight_cache_path(dtype),

models/experimental/mistral_24b/tt/pipeline/vision_model.py

Lines changed: 3 additions & 1 deletion
@@ -12,13 +12,15 @@
 
 
 class TtMistralVisionTransformer(LightweightModule):
-    def __init__(self, mesh_device, state_dict, state_dict_prefix, dtype, model_args):
+    def __init__(self, mesh_device, tt_ccl, state_dict, state_dict_prefix, dtype, model_args):
         super().__init__()
         self.state_dict = state_dict
         self.mesh_device = mesh_device
+        self.tt_ccl = tt_ccl
 
         self.vision_tower = MistralVisionTower(
             mesh_device=mesh_device,
+            tt_ccl=self.tt_ccl,
             state_dict=state_dict,
             state_dict_prefix=state_dict_prefix,
             dtype=dtype,

models/experimental/mistral_24b/tt/vision_attention.py

Lines changed: 20 additions & 2 deletions
@@ -6,7 +6,7 @@
 
 import ttnn
 from models.common.lightweightmodule import LightweightModule
-from models.utility_functions import nearest_32
+from models.utility_functions import is_blackhole, nearest_32
 
 
 def rotate_half(x):
@@ -33,6 +33,7 @@ class TtMistralImageAttention(LightweightModule):
     def __init__(
         self,
         mesh_device,
+        tt_ccl,
         state_dict,
         state_dict_prefix,
         weight_cache_path,
@@ -43,6 +44,7 @@ def __init__(
 
         self.state_dict = state_dict
         self.mesh_device = mesh_device
+        self.tt_ccl = tt_ccl
         self.num_devices = configuration.num_devices
 
         self.hidden_size = configuration.vision_dim
@@ -237,7 +239,23 @@ def forward(self, x_11SH, position_embeddings=None):
 
         # All reduce
         if self.num_devices > 1:  # replace with reduce_scatter and all_gather
-            dense_out_gathered = ttnn.all_gather(output_11SH, dim=1, num_links=1, topology=ttnn.Topology.Linear)
+            # TODO: 26411
+            # Remove this blackhole condition once fabric CCLs are working on blackhole
+            if is_blackhole():
+                dense_out_gathered = ttnn.all_gather(output_11SH, dim=1, num_links=1, topology=ttnn.Topology.Linear)
+            else:
+                dense_out_gathered = ttnn.experimental.all_gather_async(
+                    output_11SH,
+                    persistent_output_buffer=None,
+                    dim=1,
+                    multi_device_global_semaphore=self.tt_ccl.get_and_cycle_ag_semaphore_handles(),
+                    num_links=1,
+                    topology=ttnn.Topology.Linear,
+                    barrier_semaphore=self.tt_ccl.get_and_cycle_barrier_semaphore_handle(),
+                    chunks_per_sync=10,
+                    num_workers_per_link=2,
+                    num_buffers_per_channel=2,
+                )
             output_11SH.deallocate(True)
             dense_out_reduced = ttnn.experimental.fast_reduce_nc(
                 dense_out_gathered, dims=[1], output=None, compute_kernel_config=None
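
Both branches keep the same emulated all-reduce semantics: gather every device's partial dense output along dim 1, then reduce that dim back down with fast_reduce_nc. A torch sketch of the intended math (an interpretation of the calls above, assuming fast_reduce_nc performs a sum over dims=[1]):

import torch

def emulated_all_reduce(per_device_partials: list[torch.Tensor]) -> torch.Tensor:
    # Each entry is one device's partial output, shape [1, 1, S, H].
    # all_gather / all_gather_async with dim=1 stacks them into [1, D, S, H]...
    gathered = torch.cat(per_device_partials, dim=1)
    # ...and reducing over dim 1 recovers the summed [1, 1, S, H] result on every device.
    return gathered.sum(dim=1, keepdim=True)

The semaphore arguments on the async path come from the new TT_CCL helper, which is why this commit threads tt_ccl into the attention module and everything above it.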

models/experimental/mistral_24b/tt/vision_pixtral_image_block.py

Lines changed: 3 additions & 0 deletions
@@ -14,6 +14,7 @@ class TtPixtralImageTransformerBlock(LightweightModule):
     def __init__(
         self,
         mesh_device,
+        tt_ccl,
         state_dict,
         state_dict_prefix,
         weight_cache_path,
@@ -23,6 +24,7 @@ def __init__(
         super().__init__()
         self.state_dict = state_dict
         self.mesh_device = mesh_device
+        self.tt_ccl = tt_ccl
         self.configuration = configuration
         self.num_devices = configuration.num_devices
         self.hidden_size = configuration.vision_dim
@@ -40,6 +42,7 @@ def __init__(
 
         self.attention = TtLlamaImageAttention(
             mesh_device,
+            tt_ccl,
             state_dict,
             state_dict_prefix=f"{state_dict_prefix}attention.",
             weight_cache_path=weight_cache_path,
