diff --git a/.github/workflows/t3000-demo-tests-impl.yaml b/.github/workflows/t3000-demo-tests-impl.yaml index c5b6c5c6548f..b1cf66be4eec 100644 --- a/.github/workflows/t3000-demo-tests-impl.yaml +++ b/.github/workflows/t3000-demo-tests-impl.yaml @@ -40,6 +40,7 @@ jobs: # This requirements and comment removed when https://github.com/tenstorrent/tt-metal/pull/22608 merges. { name: "t3k_qwen3_tests", arch: wormhole_b0, cmd: run_t3000_qwen3_tests, timeout: 60, owner_id: U03HY7MK4BT}, # Mark O'Connor { name: "t3k_qwen25_vl_tests", arch: wormhole_b0, cmd: run_t3000_qwen25_vl_tests, timeout: 60, owner_id: U07RY6B5FLJ}, #Gongyu Wang + # { name: "t3k_mistral_24b_tests", arch: wormhole_b0, cmd: run_t3000_mistral_24b_tests, timeout: 60, owner_id: }, # TODO: Requires owner ID ] name: ${{ matrix.test-group.name }} diff --git a/models/tt_transformers/PERF.md b/models/tt_transformers/PERF.md index 26d1fb377913..b1d51109012a 100644 --- a/models/tt_transformers/PERF.md +++ b/models/tt_transformers/PERF.md @@ -45,6 +45,7 @@ This configuration uses bfp4 MLP and bfp8 attention weights for all models excep | Mistral-7B | N150 | 95 | 99 | 29.75 | 100.24 | | Mistral-7B | N300 | 95 | 99 | 47.01 | 65.95 | | Mistral-7B | T3K | 95 | 99 | 67.82 | 53.93 | +| Mistral-24B | T3K | 95 | 100 | 37.77 | 131.6 | ## Accuracy @@ -82,6 +83,7 @@ Llama 3 models test as insensitive to attention precision and so we use bfp8 att | Mistral-7B | N150 | 95 | 99 | 29.75 | 100.24 | | Mistral-7B | N300 | 95 | 99 | 47.01 | 65.95 | | Mistral-7B | T3K | 95 | 99 | 67.82 | 53.93 | +| Mistral-24B | T3K | 95 | 100 | 33.27 | 130.39 | ## Long-context (64K Tokens) diff --git a/models/tt_transformers/demo/simple_text_demo.py b/models/tt_transformers/demo/simple_text_demo.py index 355cf5ee4302..a5664cfb1f6f 100644 --- a/models/tt_transformers/demo/simple_text_demo.py +++ b/models/tt_transformers/demo/simple_text_demo.py @@ -955,7 +955,15 @@ def test_demo_text( ) # Benchmark targets - supported_models = ["Llama-3.2-1B", 
"Llama-3.2-3B", "Llama-3.1-8B", "Llama-3.2-11B", "Llama-3.1-70B", "Mistral-7B"] + supported_models = [ + "Llama-3.2-1B", + "Llama-3.2-3B", + "Llama-3.1-8B", + "Llama-3.2-11B", + "Llama-3.1-70B", + "Mistral-7B", + "Mistral-Small-3.1-24B", + ] supported_devices = ["N150", "P100", "P150", "P300", "N300", "P150x4", "P150x8", "T3K", "TG"] tt_device_name = determine_device_name(mesh_device) # submesh device should not decide performance target @@ -1004,6 +1012,7 @@ def test_demo_text( "N300_Mistral-7B": 38, # TODO Update target "T3K_Mistral-7B": 45, # TODO Update target "TG_Mistral-7B": 45, # TODO Update target + "T3K_Mistral-Small-3.1-24B": 33, # TODO Update target } if model_device_key in dict_target_decode_tok_s_u: target_decode_tok_s_u = dict_target_decode_tok_s_u[model_device_key] @@ -1099,6 +1108,7 @@ def test_demo_text( # "T3K_Qwen2.5-72B": 13, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24303) "T3K_Qwen2.5-Coder-32B": 21, # "T3K_Qwen3-32B": 20, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24303) + "T3K_Mistral-Small-3.1-24B": 33, # TODO Update target } # Only call verify_perf if the model_device_key exists in the targets diff --git a/models/tt_transformers/demo/simple_vision_demo.py b/models/tt_transformers/demo/simple_vision_demo.py index 951f0b7e13c9..0ad95d1b9422 100644 --- a/models/tt_transformers/demo/simple_vision_demo.py +++ b/models/tt_transformers/demo/simple_vision_demo.py @@ -471,6 +471,7 @@ def test_multimodal_demo_text( logger.info("") logger.info(f"is_ci_env: {is_ci_env}") + is_ci_env = True if is_ci_env and max_batch_size == 1 and enable_trace: # Only profiling these parametrizations tt_device_name = model_args[0].device_name base_model_name = model_args[0].base_model_name @@ -478,12 +479,14 @@ def test_multimodal_demo_text( "N300_Llama-3.2-11B": 23, "T3K_Llama-3.2-11B": 20, "T3K_Llama-3.2-90B": 3, + "T3K_Mistral-Small-3.1-24B": 1254.50, }[f"{tt_device_name}_{base_model_name}"] 
target_decode_tok_s_u = { "N300_Llama-3.2-11B": 21.5, "T3K_Llama-3.2-11B": 34.25, "T3K_Llama-3.2-90B": 6, + "T3K_Mistral-Small-3.1-24B": 28.50, }[f"{tt_device_name}_{base_model_name}"] target_decode_tok_s = target_decode_tok_s_u * max_batch_size diff --git a/models/tt_transformers/tests/reference_outputs/Mistral-Small-3.1-24B-Instruct-2503.refpt b/models/tt_transformers/tests/reference_outputs/Mistral-Small-3.1-24B-Instruct-2503.refpt new file mode 100644 index 000000000000..5a274c8a3f92 Binary files /dev/null and b/models/tt_transformers/tests/reference_outputs/Mistral-Small-3.1-24B-Instruct-2503.refpt differ diff --git a/tests/scripts/t3000/run_t3000_demo_tests.sh b/tests/scripts/t3000/run_t3000_demo_tests.sh index a4f8084339cb..cc007d1bf800 100755 --- a/tests/scripts/t3000/run_t3000_demo_tests.sh +++ b/tests/scripts/t3000/run_t3000_demo_tests.sh @@ -124,6 +124,30 @@ run_t3000_qwen25_vl_tests() { fi } +run_t3000_mistral_24b_tests() { + fail=0 + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_mistral_24b_tests" + + wh_arch_yaml=wormhole_b0_80_arch_eth_dispatch.yaml + + # Mistral-24B + mistral24b=/mnt/MLPerf/tt_dnn-models/mistral/Mistral-24B-Instruct + mesh_device=T3K + + MESH_DEVICE=$mesh_device HF_MODEL=$mistral24b pytest -n auto models/tt_transformers/demo/simple_vision_demo.py -k "batch1-notrace" --timeout 1200; fail+=$? + echo "LOG_METAL: Mistral-24B tests for $mesh_device completed" + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_mistral_24b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi +} + run_t3000_qwen3_tests() { # Record the start time fail=0