9 changes: 7 additions & 2 deletions models/common/rmsnorm.py
@@ -85,7 +85,7 @@ def __init__(
torch_weight,
device=device,
dtype=weight_dtype,
layout=ttnn.ROW_MAJOR_LAYOUT,
layout=ttnn.TILE_LAYOUT,
memory_config=weight_memory_config,
cache_file_name=cache_name,
mesh_mapper=ttnn.ReplicateTensorToMesh(device) if is_mesh_device else None,
@@ -96,7 +96,7 @@ def __init__(
torch_weight,
device=device,
dtype=weight_dtype,
layout=ttnn.ROW_MAJOR_LAYOUT,
layout=ttnn.TILE_LAYOUT,
memory_config=weight_memory_config,
cache_file_name=cache_name,
mesh_mapper=ttnn.ShardTensor2dMesh(device, dims=(None, 2), mesh_shape=list(device.shape))
@@ -128,6 +128,11 @@ def forward(self, x: ttnn.Tensor, mode, in_sharded=False, out_sharded=False) ->
else:
assert not out_sharded, "Non-sharded version of RMSNorm cannot output a sharded tensor"

if x.shape[-1] % weight.shape[-1] == 0:
# Reshape weight only if x's last dimension is divisible by weight's last dimension,
# to avoid padding errors in RMSNorm when dimensions are not aligned
weight = ttnn.reshape(weight, [1, 1, 1, -1])

x = norm(
x,
epsilon=self.eps,
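Note: for reference, a minimal torch-only sketch of the conditional weight reshape introduced above. This is illustrative only; rmsnorm_reference is a hypothetical helper, not the ttnn implementation, and it assumes the weight length matches x's last dimension so the broadcast succeeds.

import torch

def rmsnorm_reference(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Mirror the guard from the diff: only reshape the weight for broadcasting
    # when x's last dimension is divisible by the weight's last dimension.
    if x.shape[-1] % weight.shape[-1] == 0:
        weight = weight.reshape(1, 1, 1, -1)
    # RMSNorm: divide x by the RMS of its last dimension, then apply the weight.
    rms = torch.sqrt(torch.mean(x * x, dim=-1, keepdim=True) + eps)
    return (x / rms) * weight

# Example: a [1, 1, 32, 4096] activation with a 4096-element weight vector.
x = torch.randn(1, 1, 32, 4096)
w = torch.ones(4096)
y = rmsnorm_reference(x, w)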
93 changes: 70 additions & 23 deletions models/tt_transformers/demo/simple_vision_demo.py
@@ -27,7 +27,9 @@
import ttnn
from models.demos.utils.llm_demo_utils import create_benchmark_data, verify_perf
from models.perf.benchmarking_utils import BenchmarkProfiler
from models.tt_transformers.tt.common import hf_multimodal_encode
from models.tt_transformers.tt.generator import Generator
from models.tt_transformers.tt.model_config import CheckpointType


def get_batch_sampler(temperature, top_p, tokenizer):
@@ -61,6 +63,7 @@ def create_multimodal_model(
checkpoint=None,
):
from models.tt_transformers.tt.model_config import ModelArgs
from models.tt_transformers.tt.multimodal.gemma.gemma_e2e_model import TtGemmaModel
from models.tt_transformers.tt.multimodal.llama_vision_model import CrossAttentionTransformer

tt_model_args = ModelArgs(mesh_device, max_batch_size=max_batch_size)
@@ -76,14 +79,26 @@

if checkpoint is None:
checkpoint = tt_model_args.load_state_dict()
model = CrossAttentionTransformer(
mesh_device,
state_dict=checkpoint,
weight_cache_path=tt_model_args.weight_cache_path(dtype),
dtype=dtype,
configuration=tt_model_args,
use_paged_kv_cache=use_paged_kv_cache,
)
print(f"Loaded checkpoint for {tt_model_args.base_model_name} with {checkpoint.keys()} keys")

if tt_model_args.base_model_name == "gemma-3-4b":
model = TtGemmaModel(
mesh_device=mesh_device,
state_dict=checkpoint,
weight_cache_path=tt_model_args.weight_cache_path(ttnn.bfloat8_b),
dtype=ttnn.bfloat8_b,
args=tt_model_args,
use_paged_kv_cache=use_paged_kv_cache,
)
else:
model = CrossAttentionTransformer(
mesh_device,
state_dict=checkpoint,
weight_cache_path=tt_model_args.weight_cache_path(dtype),
dtype=dtype,
configuration=tt_model_args,
use_paged_kv_cache=use_paged_kv_cache,
)
return tt_model_args, model, checkpoint


@@ -128,7 +143,7 @@ def prepare_generator_args(
)
@pytest.mark.parametrize(
"test_type,max_seq_len",
(("normal", 512),),
(("normal", 2048),),
ids=["normal"],
)
@pytest.mark.parametrize(
@@ -148,7 +163,9 @@ def prepare_generator_args(
# 4,
],
)
@pytest.mark.parametrize("device_params", [{"trace_region_size": 14951424, "num_command_queues": 2}], indirect=True)
@pytest.mark.parametrize(
"device_params", [{"trace_region_size": 14951424, "num_command_queues": 2, "l1_small_size": 24576}], indirect=True
)
def test_multimodal_demo_text(
mesh_device,
warmup_iters,
@@ -172,9 +189,6 @@ def test_multimodal_demo_text(
profiler = BenchmarkProfiler()
profiler.start("run")

ckpt_dir = os.environ["LLAMA_DIR"]
tokenizer_path = str(Path(ckpt_dir) / "tokenizer.model")

num_devices = mesh_device.get_num_devices() if isinstance(mesh_device, ttnn.MeshDevice) else 1
max_batch_size *= data_parallel # input batch_size is interpreted as size per DP group

@@ -185,11 +199,26 @@ def test_multimodal_demo_text(
max_batch_size=max_batch_size,
max_seq_len=max_seq_len,
)

HF_MODEL = model_args[0].checkpoint_type == CheckpointType.HuggingFace

if not HF_MODEL:
ckpt_dir = os.environ["LLAMA_DIR"]
tokenizer_path = str(Path(ckpt_dir) / "tokenizer.model")

tokenizer = Tokenizer(model_path=tokenizer_path)
formatter = ChatFormat(tokenizer)
else:
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(model_args[0].CKPT_DIR)

generator = Generator(model, model_args, mesh_device)
tokenizer = Tokenizer(model_path=tokenizer_path)
formatter = ChatFormat(tokenizer)

xattn_caches = [model.setup_cache(model_args[i].max_batch_size) for i, model in enumerate(generator.model)]
xattn_caches = [
model.setup_cache(model_args[i].max_batch_size) if not HF_MODEL else None
for i, model in enumerate(generator.model)
]

# Create random images for trace capture with specific dimensions
trace_img_560x560 = create_random_image(560, 560)
@@ -250,10 +279,12 @@ def test_multimodal_demo_text(
total_users = len(dialogs)
num_batches = total_users // max_batch_size

sampler = get_batch_sampler(temperature, top_p, tokenizer)
sampler = get_batch_sampler(temperature, top_p, model_args[0].tokenizer)
_num_prefill_tokens = 0
_num_decode_tokens = 0

prompt_encoder = hf_multimodal_encode if HF_MODEL else formatter.encode_dialog_prompt

for iter_num in range(warmup_iters + 1):
logger.info(f"Iteration {iter_num}")
current_dialogs = trace_dialogs + dialogs
@@ -263,9 +294,14 @@ def test_multimodal_demo_text(
for msg in dialog:
print(f"{msg.role.capitalize()}: {msg.content}\n")
batch_model_input = [
formatter.encode_dialog_prompt(dialog, tool_prompt_format=False) for dialog in batch_dialogs
prompt_encoder(dialog, processor) if HF_MODEL else prompt_encoder(dialog, tool_prompt_format=False)
for dialog in batch_dialogs
]

if HF_MODEL:
# Use the processor's tokenizer instead of model_args tokenizer to ensure consistency
tokenizer = processor.tokenizer

# Do initial prefill
vision_images = [
model_input.vision.images if model_input.vision else None for model_input in batch_model_input
@@ -278,7 +314,8 @@ def test_multimodal_demo_text(
total_lens = prefill_lens + max_gen_len

# Create padded tokens tensor for batch
pad_id = tokenizer.pad_id
stop_tokens = model_args[0].tokenizer.stop_tokens
pad_id = tokenizer.pad_token_id if HF_MODEL else tokenizer.pad_id
bsz = len(prompt_tokens)
tokens = torch.full((bsz, max(total_lens)), pad_id, dtype=torch.long)

@@ -358,19 +395,29 @@ def test_multimodal_demo_text(
profiler.end(f"compile_decode", iteration=batch_idx)

# Disable checking for eot until I have more robust code for batch > 1
# if text in ["<|eot_id|>", "<|eom_id|>"]:
# break
if HF_MODEL:
if next_tokens in stop_tokens:
break
else:
# Disable checking for eot until I have more robust code for batch > 1
pass
# if text in ["<|eot_id|>", "<|eom_id|>"]:
# break
_num_decode_tokens += (
gen_idx * max_batch_size
) # gen_idx is (num_tokens - 1) to avoid counting compile iter

# Log full text output for each user in batch
vision_tokens = [tokenizer.special_tokens["<|image|>"], 128256]
if HF_MODEL:
# For HF models, get vision tokens from the processor if they exist
vision_tokens = []
else:
vision_tokens = [tokenizer.special_tokens["<|image|>"], 128256]

for user_id in range(max_batch_size):
# Remove <|image|> tokens since they break the tokenizer
tokens_out = [
t if t not in vision_tokens else tokenizer.pad_id
t if t not in vision_tokens else pad_id
for t in tokens[user_id].tolist()[: position_id[user_id] + 2]
]
text = tokenizer.decode(tokens_out)
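Note: the prompt-encoding branch above can be summarized with a small dispatcher. The sketch below is illustrative only; encode_batch is a hypothetical helper, not part of the demo, and it reuses the names and call signatures shown in the diff.

from models.tt_transformers.tt.common import hf_multimodal_encode

def encode_batch(batch_dialogs, hf_model, processor=None, formatter=None):
    # HuggingFace checkpoints go through the processor-based encoder added in this PR;
    # Meta checkpoints keep the original Llama ChatFormat path.
    if hf_model:
        return [hf_multimodal_encode(dialog, processor) for dialog in batch_dialogs]
    return [formatter.encode_dialog_prompt(dialog, tool_prompt_format=False) for dialog in batch_dialogs]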
100 changes: 100 additions & 0 deletions models/tt_transformers/tests/multimodal/gemma/test_mmp.py
@@ -0,0 +1,100 @@
# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

import os

import pytest
import torch
from loguru import logger

import ttnn
from models.tt_transformers.tt.model_config import ModelArgs
from models.tt_transformers.tt.multimodal.gemma.multi_modal_projector import TtGemma3MultiModalProjector
from models.utility_functions import comp_allclose, comp_pcc, skip_for_grayskull


@torch.no_grad()
@skip_for_grayskull("Requires wormhole_b0 to run")
@pytest.mark.parametrize(
"device",
[
{"N150": (1, 1), "N300": (1, 2), "T3K": (1, 8), "TG": (8, 4)}.get(
os.environ.get("device"), len(ttnn.get_device_ids())
)
],
indirect=True,
)
@pytest.mark.parametrize(
"seq_len",
(1152,),
)
@pytest.mark.parametrize(
"batch_size",
(1,),
)
@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True)
def test_multi_modal_inference(seq_len, batch_size, reset_seeds, device):
print("device:", device)
dtype = ttnn.bfloat16
mode = "decode" if seq_len <= 32 else "prefill"

tt_model_args = ModelArgs(
device,
max_batch_size=batch_size,
max_seq_len=128,
)

tt_model_args.n_layers = 1
state_dict = tt_model_args.load_state_dict()

reference_model = tt_model_args.reference_vision_multi_modal()
# first_layer_prefix = "multi_modal_projector."

# partial_state_dict = {
# k[len(first_layer_prefix) :]: v for k, v in state_dict.items() if (k.startswith(first_layer_prefix))
# }

# reference_model.load_state_dict(partial_state_dict)

# create input tensor for multi_modal_projector layer
patches_per_image = 64
num_patches = patches_per_image * patches_per_image
input = torch.randn((batch_size, num_patches, seq_len))
reference_output = reference_model(input)

# DistributedNorm inputs are fractured across devices and interleaved in DRAM (for prefill) and L1 (for decode)
tt_input = ttnn.from_torch(
input,
device=device,
dtype=dtype,
layout=ttnn.TILE_LAYOUT,
mesh_mapper=ttnn.ShardTensor2dMesh(device, dims=(None, -1), mesh_shape=tt_model_args.cluster_shape),
# memory_config=(
# tt_model_args.get_model_config()["DECODE_RESIDUAL_MEMCFG"] if mode == "decode" else ttnn.DRAM_MEMORY_CONFIG
# ),
memory_config=ttnn.DRAM_MEMORY_CONFIG,
)

tt_model = TtGemma3MultiModalProjector(
mesh_device=device,
state_dict=state_dict,
state_dict_prefix="model.multi_modal_projector",
image_size=tt_model_args.vision_chunk_size,
patch_size=tt_model_args.vision_patch_size,
hidden_size=tt_model_args.vision_hidden_dim,
mm_tokens_per_image=tt_model_args.mm_tokens_per_image,
weight_cache_path=tt_model_args.weight_cache_path(dtype),
layer_norm_eps=1e-06, # layer_norm_eps
dtype=dtype,
configuration=tt_model_args,
)
tt_output = tt_model(tt_input)

tt_output_torch = ttnn.to_torch(tt_output).squeeze(0)
pcc_required = 0.9999
passing, pcc_message = comp_pcc(reference_output, tt_output_torch, pcc_required)

logger.info(comp_allclose(reference_output, tt_output_torch))
logger.info(f"PCC: {pcc_message}")
assert passing, f"PCC value is lower than {pcc_required} for some of the outputs. Check Warnings!"
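Note: the PCC gate at the end of the test is essentially a Pearson correlation between the reference and device outputs. Below is a minimal torch sketch of that check; pearson_cc is a hypothetical helper, and the repo's comp_pcc additionally handles dtype and NaN edge cases that this illustration skips.

import torch

def pearson_cc(golden: torch.Tensor, actual: torch.Tensor) -> float:
    # Flatten both tensors and compute the Pearson correlation coefficient in float64.
    g = golden.flatten().to(torch.float64)
    a = actual.flatten().to(torch.float64)
    g = g - g.mean()
    a = a - a.mean()
    return float((g @ a) / (g.norm() * a.norm() + 1e-12))

# Example gate analogous to the assert in the test:
# assert pearson_cc(reference_output, tt_output_torch) >= 0.9999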