Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/single-card-demo-tests-impl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ jobs:
# # Moved to t3k tests until OOM on single card runners resolved
# { name: "qwen7b", runner-label: "N300", performance: false, cmd: run_qwen7b_func, owner_id: U03PUAKE719}, # Mark O'Connor
{ name: "qwen25_vl", runner-label: "N300", performance: true, cmd: run_qwen25_vl_func, owner_id: U07RY6B5FLJ}, #Gongyu Wang
# { name: "gemma3_4b", runner-label: "N300", performance: true, cmd: run_gemma3_4b_func, owner_id: }, # TODO Owner ID needs to be updated

]
name: ${{ matrix.test-group.name }}-${{ matrix.test-group.runner-label }}-${{ (matrix.test-group.performance && 'perf') || 'func' }}
env:
Expand Down
4 changes: 4 additions & 0 deletions models/tt_transformers/PERF.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ This configuration uses bfp4 MLP and bfp8 attention weights for all models excep
| Mistral-7B | N150 | 95 | 99 | 29.75 | 100.24 |
| Mistral-7B | N300 | 95 | 99 | 47.01 | 65.95 |
| Mistral-7B | T3K | 95 | 99 | 67.82 | 53.93 |
| gemma-3-4b | N150 | 67 | 80 | 28.00 | 81.00 |
| gemma-3-4b | N300 | 52 | 72 | 23.00 | 152.00 |


## Accuracy
Expand Down Expand Up @@ -82,6 +84,8 @@ Llama 3 models test as insensitive to attention precision and so we use bfp8 att
| Mistral-7B | N150 | 95 | 99 | 29.75 | 100.24 |
| Mistral-7B | N300 | 95 | 99 | 47.01 | 65.95 |
| Mistral-7B | T3K | 95 | 99 | 67.82 | 53.93 |
| gemma-3-4b | N150 | 67 | 80 | 28.00 | 81.00 |
| gemma-3-4b | N300 | 52 | 72 | 23.00 | 152.00 |

## Long-context (64K Tokens)

Expand Down
16 changes: 15 additions & 1 deletion models/tt_transformers/demo/simple_text_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -945,7 +945,15 @@ def test_demo_text(
)

# Benchmark targets
supported_models = ["Llama-3.2-1B", "Llama-3.2-3B", "Llama-3.1-8B", "Llama-3.2-11B", "Llama-3.1-70B", "Mistral-7B"]
supported_models = [
"Llama-3.2-1B",
"Llama-3.2-3B",
"Llama-3.1-8B",
"Llama-3.2-11B",
"Llama-3.1-70B",
"Mistral-7B",
"gemma-3-4b",
]
supported_devices = ["N150", "P100", "P150", "P300", "N300", "P150x4", "T3K", "TG"]

tt_device_name = determine_device_name(mesh_device) # submesh device should not decide performance target
Expand Down Expand Up @@ -994,6 +1002,9 @@ def test_demo_text(
"N300_Mistral-7B": 38, # TODO Update target
"T3K_Mistral-7B": 45, # TODO Update target
"TG_Mistral-7B": 45, # TODO Update target
#
"N150_gemma-3-4b": 23,
"N300_gemma-3-4b": 38, # TODO Update target
}
if model_device_key in dict_target_decode_tok_s_u:
target_decode_tok_s_u = dict_target_decode_tok_s_u[model_device_key]
Expand Down Expand Up @@ -1075,15 +1086,18 @@ def test_demo_text(
# "T3K_Qwen2.5-Coder-32B": 180, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24754)
# "T3K_Qwen2.5-72B": 211, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24754)
# "T3K_Qwen3-32B": 250, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24754)
"N150_gemma-3-4b": 100, # TODO Update target
}
ci_target_decode_tok_s_u = {
# N150 targets - higher is better
"N150_Llama-3.2-1B": 66,
"N150_Llama-3.2-3B": 35,
"N150_Llama-3.1-8B": 21,
"N150_Mistral-7B": 23,
"N150_gemma-3-4b": 23, # TODO Update target
# N300 targets
"N300_Qwen2.5-7B": 20,
"N300_gemma-3-4b": 20, # TODO Update target
# T3K targets
# "T3K_Llama-3.1-70B": 16, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24303)
# "T3K_Qwen2.5-72B": 13, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24303)
Expand Down
2 changes: 2 additions & 0 deletions models/tt_transformers/demo/simple_vision_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,12 +480,14 @@ def test_multimodal_demo_text(
"N300_Llama-3.2-11B": 23.5,
"T3K_Llama-3.2-11B": 21.5,
"T3K_Llama-3.2-90B": 3,
"N300_gemma-3-4b": 390,
}[f"{tt_device_name}_{base_model_name}"]

target_decode_tok_s_u = {
"N300_Llama-3.2-11B": 21.5,
"T3K_Llama-3.2-11B": 37,
"T3K_Llama-3.2-90B": 6,
"N300_gemma-3-4b": 24,
}[f"{tt_device_name}_{base_model_name}"]

target_decode_tok_s = target_decode_tok_s_u * max_batch_size
Expand Down
Binary file not shown.
23 changes: 19 additions & 4 deletions models/tt_transformers/tests/test_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,14 +245,27 @@ def test_tt_model_acc(
theta=model_args.rope_theta,
rope_scaling=model_args.rope_scaling,
)

if model_args.rope_local_theta is not None:
# If local theta is set, use it to compute the local rope matrices
rot_mats_local = get_rot_mats(
head_dim=model_args.head_dim,
device=mesh_device,
seq_len=prefill_lens[0],
theta=model_args.rope_local_theta,
rope_scaling=None,
)
else:
rot_mats_local = None

prefill_input = model_args.prepare_residual_tensor_prefill(
pt_prefill_input[batch_id],
)

tt_out = tt_model(
prefill_input,
current_pos=None,
rot_mats=rot_mats_prefill,
rot_mats=[rot_mats_prefill, rot_mats_local],
user_id=batch_id,
mode="prefill",
page_table=page_table_tt,
Expand Down Expand Up @@ -280,7 +293,7 @@ def test_tt_model_acc(

# Get cos/sin matrices for the current position of each user
rot_mats = tt_model.rope_setup.get_rot_mats(current_pos)

# Local rotation matrices must come from rope_setup_local; the global rope_setup already feeds rot_mats above.
rot_mats_local = tt_model.rope_setup_local.get_rot_mats(current_pos) if tt_model.rope_setup_local is not None else None
# Print table header
if use_reference_file:
logger.info(f"{'Progress':<15}{'Correct':<8}{'True':<15}{'Actual':<15}{'Top 5 Predictions':<75}")
Expand Down Expand Up @@ -310,7 +323,7 @@ def test_tt_model_acc(
tt_out = tt_model(
decode_input,
current_pos_tensor,
rot_mats=rot_mats,
rot_mats=[rot_mats, rot_mats_local],
mode="decode",
page_table=page_table_tt,
)
Expand Down Expand Up @@ -351,7 +364,9 @@ def test_tt_model_acc(
# Update rot_mats for next iteration
current_pos += 1
rot_mats = tt_model.rope_setup.get_rot_mats(current_pos)

rot_mats_local = (
tt_model.rope_setup_local.get_rot_mats(current_pos) if tt_model.rope_setup_local is not None else None
)
# Modify the accuracy checking section when using reference text
if not use_reference_file:
# Get probabilities from model output
Expand Down
8 changes: 8 additions & 0 deletions tests/scripts/single_card/run_single_card_demo_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@ run_qwen7b_func() {

}


run_gemma3_4b_func() {

  # Run the text demo's "performance-ci-1" test selection against the Gemma-3 4B
  # instruct checkpoint on an N300. HF_MODEL must name the 4B model so that it
  # matches this function's name and the "*_gemma-3-4b" target keys in the demo.
  HF_MODEL=google/gemma-3-4b-it MESH_DEVICE=N300 pytest -n auto models/tt_transformers/demo/simple_text_demo.py -k performance-ci-1 --timeout 1800

}


run_qwen25_vl_func() {
fail=0

Expand Down