flexaihq · MohammedTaherMcW · Aug 8, 2025
diff --git a/.github/workflows/single-card-demo-tests-impl.yaml b/.github/workflows/single-card-demo-tests-impl.yaml
@@ -92,6 +92,8 @@ jobs:
           #          # Moved to t3k tests until OOM on single card runners resolved
                     # { name: "qwen7b", runner-label: "N300", performance: false, cmd: run_qwen7b_func, owner_id: U03PUAKE719}, # Mark O'Connor
                     { name: "qwen25_vl", runner-label: "N300", performance: true, cmd: run_qwen25_vl_func, owner_id: U07RY6B5FLJ},  #Gongyu Wang
+                    # { name: "gemma3_4b", runner-label: "N300", performance: true, cmd: run_gemma3_4b_func, owner_id: },  # TODO Owner ID needs to be updated
+
         ]
     name: ${{ matrix.test-group.name }}-${{ matrix.test-group.runner-label }}-${{ (matrix.test-group.performance && 'perf') || 'func' }}
     env:

diff --git a/models/tt_transformers/PERF.md b/models/tt_transformers/PERF.md
@@ -45,6 +45,8 @@ This configuration uses bfp4 MLP and bfp8 attention weights for all models excep
 | Mistral-7B        | N150        | 95        | 99        | 29.75         | 100.24    |
 | Mistral-7B        | N300        | 95        | 99        | 47.01         | 65.95     |
 | Mistral-7B        | T3K         | 95        | 99        | 67.82         | 53.93     |
+| gemma-3-4b        | N150        | 67.0      | 80        | 28.00         | 81.00     |
+| gemma-3-4b        | N300        | 52.0      | 72.0      | 23.00         | 152       |
 
 
 ## Accuracy
@@ -82,6 +84,8 @@ Llama 3 models test as insensitive to attention precision and so we use bfp8 att
 | Mistral-7B        | N150        | 95        | 99        | 29.75         | 100.24    |
 | Mistral-7B        | N300        | 95        | 99        | 47.01         | 65.95     |
 | Mistral-7B        | T3K         | 95        | 99        | 67.82         | 53.93     |
+| gemma-3-4b        | N150        | 67.0      | 80        | 28.00         | 81.00     |
+| gemma-3-4b        | N300        | 52.0      | 72.0      | 23.00         | 152       |
 
 ##  Long-context (64K Tokens)
 

diff --git a/models/tt_transformers/demo/simple_text_demo.py b/models/tt_transformers/demo/simple_text_demo.py
@@ -945,7 +945,15 @@ def test_demo_text(
     )
 
     # Benchmark targets
-    supported_models = ["Llama-3.2-1B", "Llama-3.2-3B", "Llama-3.1-8B", "Llama-3.2-11B", "Llama-3.1-70B", "Mistral-7B"]
+    supported_models = [
+        "Llama-3.2-1B",
+        "Llama-3.2-3B",
+        "Llama-3.1-8B",
+        "Llama-3.2-11B",
+        "Llama-3.1-70B",
+        "Mistral-7B",
+        "gemma-3-4b",
+    ]
     supported_devices = ["N150", "P100", "P150", "P300", "N300", "P150x4", "T3K", "TG"]
 
     tt_device_name = determine_device_name(mesh_device)  # submesh device should not decide performance target
@@ -994,6 +1002,9 @@ def test_demo_text(
             "N300_Mistral-7B": 38,  # TODO Update target
             "T3K_Mistral-7B": 45,  # TODO Update target
             "TG_Mistral-7B": 45,  # TODO Update target
+            # Gemma-3 targets
+            "N150_gemma-3-4b": 23,
+            "N300_gemma-3-4b": 38,  # TODO Update target
         }
         if model_device_key in dict_target_decode_tok_s_u:
             target_decode_tok_s_u = dict_target_decode_tok_s_u[model_device_key]
@@ -1075,15 +1086,18 @@ def test_demo_text(
                 # "T3K_Qwen2.5-Coder-32B": 180,  # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24754)
                 # "T3K_Qwen2.5-72B": 211,  # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24754)
                 # "T3K_Qwen3-32B": 250, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24754)
+                "N150_gemma-3-4b": 100,  # TODO Update target
             }
             ci_target_decode_tok_s_u = {
                 # N150 targets - higher is better
                 "N150_Llama-3.2-1B": 66,
                 "N150_Llama-3.2-3B": 35,
                 "N150_Llama-3.1-8B": 21,
                 "N150_Mistral-7B": 23,
+                "N150_gemma-3-4b": 23,  # TODO Update target
                 # N300 targets
                 "N300_Qwen2.5-7B": 20,
+                "N300_gemma-3-4b": 20,  # TODO Update target
                 # T3K targets
                 # "T3K_Llama-3.1-70B": 16, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24303)
                 # "T3K_Qwen2.5-72B": 13, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24303)

diff --git a/models/tt_transformers/demo/simple_vision_demo.py b/models/tt_transformers/demo/simple_vision_demo.py
@@ -480,12 +480,14 @@ def test_multimodal_demo_text(
             "N300_Llama-3.2-11B": 23.5,
             "T3K_Llama-3.2-11B": 21.5,
             "T3K_Llama-3.2-90B": 3,
+            "N300_gemma-3-4b": 390,
         }[f"{tt_device_name}_{base_model_name}"]
 
         target_decode_tok_s_u = {
             "N300_Llama-3.2-11B": 21.5,
             "T3K_Llama-3.2-11B": 37,
             "T3K_Llama-3.2-90B": 6,
+            "N300_gemma-3-4b": 24,
         }[f"{tt_device_name}_{base_model_name}"]
 
         target_decode_tok_s = target_decode_tok_s_u * max_batch_size

diff --git a/models/tt_transformers/tests/reference_outputs/gemma-3-4b-it.refpt b/models/tt_transformers/tests/reference_outputs/gemma-3-4b-it.refpt
diff --git a/models/tt_transformers/tests/test_accuracy.py b/models/tt_transformers/tests/test_accuracy.py
@@ -245,14 +245,27 @@ def test_tt_model_acc(
             theta=model_args.rope_theta,
             rope_scaling=model_args.rope_scaling,
         )
+
+        if model_args.rope_local_theta is not None:
+            # If local theta is set, use it to compute the local rope matrices
+            rot_mats_local = get_rot_mats(
+                head_dim=model_args.head_dim,
+                device=mesh_device,
+                seq_len=prefill_lens[0],
+                theta=model_args.rope_local_theta,
+                rope_scaling=None,
+            )
+        else:
+            rot_mats_local = None
+
         prefill_input = model_args.prepare_residual_tensor_prefill(
             pt_prefill_input[batch_id],
         )
 
         tt_out = tt_model(
             prefill_input,
             current_pos=None,
-            rot_mats=rot_mats_prefill,
+            rot_mats=[rot_mats_prefill, rot_mats_local],
             user_id=batch_id,
             mode="prefill",
             page_table=page_table_tt,
@@ -280,7 +293,7 @@ def test_tt_model_acc(
 
     # Get cos/sin matrices for the current position of each user
     rot_mats = tt_model.rope_setup.get_rot_mats(current_pos)
-
+    rot_mats_local = None if tt_model.rope_setup_local is None else tt_model.rope_setup.get_rot_mats(current_pos)
     # Print table header
     if use_reference_file:
         logger.info(f"{'Progress':<15}{'Correct':<8}{'True':<15}{'Actual':<15}{'Top 5 Predictions':<75}")
@@ -310,7 +323,7 @@ def test_tt_model_acc(
         tt_out = tt_model(
             decode_input,
             current_pos_tensor,
-            rot_mats=rot_mats,
+            rot_mats=[rot_mats, rot_mats_local],
             mode="decode",
             page_table=page_table_tt,
         )
@@ -351,7 +364,9 @@ def test_tt_model_acc(
         # Update rot_mats for next iteration
         current_pos += 1
         rot_mats = tt_model.rope_setup.get_rot_mats(current_pos)
-
+        rot_mats_local = (
+            tt_model.rope_setup_local.get_rot_mats(current_pos) if tt_model.rope_setup_local is not None else None
+        )
         # Modify the accuracy checking section when using reference text
         if not use_reference_file:
             # Get probabilities from model output

diff --git a/tests/scripts/single_card/run_single_card_demo_tests.sh b/tests/scripts/single_card/run_single_card_demo_tests.sh
@@ -21,6 +21,14 @@ run_qwen7b_func() {
 
 }
 
+
+run_gemma3_4b_func() {
+   # Run the text-demo CI performance test for Gemma-3 4B (instruction-tuned) on an N300 mesh.
+   HF_MODEL=google/gemma-3-4b-it MESH_DEVICE=N300 pytest -n auto models/tt_transformers/demo/simple_text_demo.py -k performance-ci-1 --timeout 1800
+
+}
+
+
 run_qwen25_vl_func() {
   fail=0
-Original file line number
+Diff line change
@@ Expand Up / @@ -21,6 +21,14 @@ run_qwen7b_func() { @@
     }
+    run_gemma3_4b_func() {
+       # Run the text-demo CI performance test for Gemma-3 4B (instruction-tuned) on an N300 mesh.
+       HF_MODEL=google/gemma-3-4b-it MESH_DEVICE=N300 pytest -n auto models/tt_transformers/demo/simple_text_demo.py -k performance-ci-1 --timeout 1800
+    }
     run_qwen25_vl_func() {
       fail=0
@@ Expand Down @@