Skip to content

Commit cdc8656

Browse files
committed
Mistral-24B: Evaluate performance metrics
1 parent 38608b3 commit cdc8656

File tree

7 files changed

+42
-3
lines changed

7 files changed

+42
-3
lines changed

.github/workflows/t3000-demo-tests-impl.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ jobs:
4040
# This requirements and comment removed when https://github.com/tenstorrent/tt-metal/pull/22608 merges.
4141
{ name: "t3k_qwen3_tests", arch: wormhole_b0, cmd: run_t3000_qwen3_tests, timeout: 60, owner_id: U03HY7MK4BT}, # Mark O'Connor
4242
{ name: "t3k_qwen25_vl_tests", arch: wormhole_b0, cmd: run_t3000_qwen25_vl_tests, timeout: 60, owner_id: U07RY6B5FLJ}, #Gongyu Wang
43+
# { name: "t3k_mistral_24b_tests", arch: wormhole_b0, cmd: run_t3000_mistral_24b_tests, timeout: 60, owner_id: }, # TODO: requires owner ID before this entry can be enabled
4344
]
4445

4546
name: ${{ matrix.test-group.name }}

models/tt_transformers/PERF.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ This configuration uses bfp4 MLP and bfp8 attention weights for all models excep
4545
| Mistral-7B | N150 | 95 | 99 | 29.75 | 100.24 |
4646
| Mistral-7B | N300 | 95 | 99 | 47.01 | 65.95 |
4747
| Mistral-7B | T3K | 95 | 99 | 67.82 | 53.93 |
48+
| Mistral-24B | T3K | 95 | 100 | 37.77 | 131.6 |
4849

4950

5051
## Accuracy
@@ -82,6 +83,7 @@ Llama 3 models test as insensitive to attention precision and so we use bfp8 att
8283
| Mistral-7B | N150 | 95 | 99 | 29.75 | 100.24 |
8384
| Mistral-7B | N300 | 95 | 99 | 47.01 | 65.95 |
8485
| Mistral-7B | T3K | 95 | 99 | 67.82 | 53.93 |
86+
| Mistral-24B | T3K | 95 | 100 | 33.27 | 130.39 |
8587

8688
## Long-context (64K Tokens)
8789

models/tt_transformers/demo/simple_text_demo.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -955,7 +955,15 @@ def test_demo_text(
955955
)
956956

957957
# Benchmark targets
958-
supported_models = ["Llama-3.2-1B", "Llama-3.2-3B", "Llama-3.1-8B", "Llama-3.2-11B", "Llama-3.1-70B", "Mistral-7B"]
958+
supported_models = [
959+
"Llama-3.2-1B",
960+
"Llama-3.2-3B",
961+
"Llama-3.1-8B",
962+
"Llama-3.2-11B",
963+
"Llama-3.1-70B",
964+
"Mistral-7B",
965+
"Mistral-Small-3.1-24B",
966+
]
959967
supported_devices = ["N150", "P100", "P150", "P300", "N300", "P150x4", "P150x8", "T3K", "TG"]
960968

961969
tt_device_name = determine_device_name(mesh_device) # submesh device should not decide performance target
@@ -1004,6 +1012,7 @@ def test_demo_text(
10041012
"N300_Mistral-7B": 38, # TODO Update target
10051013
"T3K_Mistral-7B": 45, # TODO Update target
10061014
"TG_Mistral-7B": 45, # TODO Update target
1015+
"T3K_Mistral-Small-3.1-24B": 33, # TODO Update target
10071016
}
10081017
if model_device_key in dict_target_decode_tok_s_u:
10091018
target_decode_tok_s_u = dict_target_decode_tok_s_u[model_device_key]
@@ -1099,6 +1108,7 @@ def test_demo_text(
10991108
# "T3K_Qwen2.5-72B": 13, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24303)
11001109
"T3K_Qwen2.5-Coder-32B": 21,
11011110
# "T3K_Qwen3-32B": 20, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24303)
1111+
"T3K_Mistral-Small-3.1-24B": 33, # TODO Update target
11021112
}
11031113

11041114
# Only call verify_perf if the model_device_key exists in the targets

models/tt_transformers/demo/simple_vision_demo.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,19 +471,22 @@ def test_multimodal_demo_text(
471471
logger.info("")
472472

473473
logger.info(f"is_ci_env: {is_ci_env}")
474+
is_ci_env = True  # NOTE(review): debug override — unconditionally forces the CI-only profiling branch below; remove before merge
474475
if is_ci_env and max_batch_size == 1 and enable_trace: # Only profiling these parametrizations
475476
tt_device_name = model_args[0].device_name
476477
base_model_name = model_args[0].base_model_name
477478
target_prefill_tok_s = {
478479
"N300_Llama-3.2-11B": 23,
479480
"T3K_Llama-3.2-11B": 20,
480481
"T3K_Llama-3.2-90B": 3,
482+
"T3K_Mistral-Small-3.1-24B": 1254.50,
481483
}[f"{tt_device_name}_{base_model_name}"]
482484

483485
target_decode_tok_s_u = {
484486
"N300_Llama-3.2-11B": 21.5,
485487
"T3K_Llama-3.2-11B": 34.25,
486488
"T3K_Llama-3.2-90B": 6,
489+
"T3K_Mistral-Small-3.1-24B": 28.50,
487490
}[f"{tt_device_name}_{base_model_name}"]
488491

489492
target_decode_tok_s = target_decode_tok_s_u * max_batch_size
Binary file not shown.

models/tt_transformers/tt/model_config.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,7 @@ def __init__(
567567
"Phi-3.5-mini-instruct": {"N150": 128, "N300": 128, "T3K": 128, "TG": 128, "P150x4": 128},
568568
"QwQ-32B": {"N150": None, "N300": None, "T3K": 64, "TG": 128, "P150x4": 128},
569569
"Qwen3-32B": {"N150": None, "N300": None, "T3K": 64, "TG": 128, "P150x4": 128},
570+
"Mistral-Small-3.1-24B": {"N150": None, "N300": None, "T3K": 128, "TG": 128, "P150x4": 128},
570571
}
571572
try:
572573
max_prefill_chunk_size_div1024 = MAX_PREFILL_CHUNK_SIZES_DIV1024[self.base_model_name][self.device_name]
@@ -1379,8 +1380,6 @@ def prepare_residual_tensor_prefill(self, x_bsh, force_replicated=False):
13791380

13801381
def _get_text_prefix(self):
13811382
if self.is_vision():
1382-
if "Mistral-Small-3.1-24B-Instruct-2503" in self.model_name:
1383-
return "language_model."
13841383
return "text_model."
13851384
else:
13861385
return ""

tests/scripts/t3000/run_t3000_demo_tests.sh

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,30 @@ run_t3000_qwen25_vl_tests() {
121121
fi
122122
}
123123

124+
# Run the T3K Mistral-24B vision demo test and exit non-zero on failure.
# Mirrors the structure of the sibling run_t3000_*_tests helpers in this script.
run_t3000_mistral_24b_tests() {
  # Record the start time
  fail=0
  start_time=$(date +%s)

  echo "LOG_METAL: Running run_t3000_mistral_24b_tests"

  # NOTE(review): assigned but never used — sibling runners appear to pass this
  # via WH_ARCH_YAML= on the pytest invocation; confirm whether it should be
  # exported here too.
  wh_arch_yaml=wormhole_b0_80_arch_eth_dispatch.yaml

  # Mistral-24B
  mistral24b=/mnt/MLPerf/tt_dnn-models/mistral/Mistral-24B-Instruct
  mesh_device=T3K

  # Accumulate the exit status arithmetically. The original `fail+=$?` performs
  # string concatenation ("0" -> "00"/"01"), which only works by accident when
  # the values are later compared arithmetically.
  MESH_DEVICE=$mesh_device HF_MODEL=$mistral24b pytest -n auto models/tt_transformers/demo/simple_vision_demo.py -k "batch1-notrace" --timeout 1200; fail=$((fail + $?))
  echo "LOG_METAL: Mistral-24B tests for $mesh_device completed"

  # Record the end time
  end_time=$(date +%s)
  duration=$((end_time - start_time))
  echo "LOG_METAL: run_t3000_mistral_24b_tests $duration seconds to complete"
  if [[ $fail -ne 0 ]]; then
    exit 1
  fi
}
147+
124148
run_t3000_qwen3_tests() {
125149
# Record the start time
126150
fail=0

0 commit comments

Comments
 (0)