From 375e9f57fd0ea54cd15f20c7b8b5c8c39d4c36d3 Mon Sep 17 00:00:00 2001 From: MohammedTaherMcW Date: Sun, 24 Aug 2025 18:52:08 +0000 Subject: [PATCH 1/4] Add Accuracy and Performance metrics for Gemma-3-27b-it --- .../single-card-demo-tests-impl.yaml | 1 + .github/workflows/t3000-demo-tests-impl.yaml | 1 + models/tt_transformers/PERF.md | 7 +- .../tt_transformers/demo/simple_text_demo.py | 22 +- .../demo/simple_vision_demo.py | 6 + models/tt_transformers/tests/test_accuracy.py | 493 ++++++++++++++++++ .../single_card/run_single_card_demo_tests.sh | 6 + tests/scripts/t3000/run_t3000_demo_tests.sh | 24 + 8 files changed, 558 insertions(+), 2 deletions(-) create mode 100644 models/tt_transformers/tests/test_accuracy.py diff --git a/.github/workflows/single-card-demo-tests-impl.yaml b/.github/workflows/single-card-demo-tests-impl.yaml index c0b1dc998f4c..93f49b61c218 100644 --- a/.github/workflows/single-card-demo-tests-impl.yaml +++ b/.github/workflows/single-card-demo-tests-impl.yaml @@ -115,6 +115,7 @@ jobs: { name: "qwen25_vl", runner-label: "N300", performance: true, cmd: run_qwen25_vl_func, owner_id: U07RY6B5FLJ}, #Gongyu Wang { name: "gemma3", runner-label: "N150", performance: true, cmd: run_gemma3_perf, owner_id: U08TJ70UFRT}, # Harry Andrews { name: "gemma3", runner-label: "N300", performance: true, cmd: run_gemma3_perf, owner_id: U08TJ70UFRT}, # Harry Andrews + # { name: "gemma3_4b", runner-label: "N300", performance: true, cmd: run_gemma3_4b_func, owner_id: }, # TODO Owner ID needs to be updated ]] steps: - name: Compute tests diff --git a/.github/workflows/t3000-demo-tests-impl.yaml b/.github/workflows/t3000-demo-tests-impl.yaml index 157db20b3a25..7e02bda0f274 100644 --- a/.github/workflows/t3000-demo-tests-impl.yaml +++ b/.github/workflows/t3000-demo-tests-impl.yaml @@ -41,6 +41,7 @@ jobs: { name: "t3k_qwen3_tests", arch: wormhole_b0, cmd: run_t3000_qwen3_tests, timeout: 60, owner_id: U03HY7MK4BT}, # Mark O'Connor { name: "t3k_qwen25_vl_tests", arch: wormhole_b0, cmd: run_t3000_qwen25_vl_tests, timeout: 30, owner_id: U07RY6B5FLJ}, #Gongyu Wang { name: "t3k_gemma3_tests", arch: wormhole_b0, cmd: run_t3000_gemma3_tests, timeout: 30, owner_id: U08TJ70UFRT}, # Harry Andrews + # { name: "t3k_gemma_3_27b_tests", arch: wormhole_b0, cmd: run_t3000_gemma_3_27b_tests, timeout: 60, owner_id: }, # TODO: Requires owner ID ] name: ${{ matrix.test-group.name }} diff --git a/models/tt_transformers/PERF.md b/models/tt_transformers/PERF.md index fb0f2f60da15..750bbe78257e 100644 --- a/models/tt_transformers/PERF.md +++ b/models/tt_transformers/PERF.md @@ -51,7 +51,9 @@ This configuration uses bfp4 MLP and bfp8 attention weights for all models excep | Phi-3-mini-128k-instruct | N150 | 89 | 99 | 45.0 | 73.32 | | Phi-3-mini-128k-instruct | N300 | 89 | 99 | 60.87 | 114.94 | | Mixtral-8x7B-v0.1 | T3K | 95 | 100 | 67.82 | 53.93 | - +| gemma-3-4b | N150 | 78 | 95 | 34 | 68 | +| gemma-3-4b | N300 | 78 | 95 | 35 | 125 | +| gemma-3-27b | T3K | 90 | 99 | 16 | 331 | ## Accuracy @@ -92,6 +94,9 @@ Llama 3 models test as insensitive to attention precision and so we use bfp8 att | Phi-3-mini-128k-instruct | N150 | 94 | 99 | 40.41 | 82.58 | | Phi-3-mini-128k-instruct | N300 | 94 | 99 | 57.0 | 115.36 | | Mixtral-8x7B-v0.1 | T3K | 95 | 100 | 67.82 | 53.93 | +| gemma-3-4b | N150 | 88 | 98 | 30 | 79 | +| gemma-3-4b | N300 | 86 | 98 | 32 | 135 | +| gemma-3-27b | T3K | 91 | 100 | 15 | 361 | ## Long-context (64K Tokens) diff --git a/models/tt_transformers/demo/simple_text_demo.py 
b/models/tt_transformers/demo/simple_text_demo.py index 3b23f36e1284..d586fb459943 100644 --- a/models/tt_transformers/demo/simple_text_demo.py +++ b/models/tt_transformers/demo/simple_text_demo.py @@ -1048,7 +1048,16 @@ def test_demo_text( ) # Benchmark targets - supported_models = ["Llama-3.2-1B", "Llama-3.2-3B", "Llama-3.1-8B", "Llama-3.2-11B", "Llama-3.1-70B", "Mistral-7B"] + supported_models = [ + "Llama-3.2-1B", + "Llama-3.2-3B", + "Llama-3.1-8B", + "Llama-3.2-11B", + "Llama-3.1-70B", + "Mistral-7B", + "gemma-3-4b", + "gemma-3-27b", + ] supported_devices = ["N150", "P100", "P150", "P300", "N300", "P150x4", "P150x8", "T3K", "TG"] tt_device_name = determine_device_name(mesh_device) # submesh device should not decide performance target @@ -1097,6 +1106,11 @@ def test_demo_text( "N300_Mistral-7B": 38, # TODO Update target "T3K_Mistral-7B": 45, # TODO Update target "TG_Mistral-7B": 45, # TODO Update target + # + "N150_gemma-3-4b": 23, # TODO Update target + "N300_gemma-3-4b": 38, # TODO Update target + # + "T3K_gemma-3-27b": 38, # TODO Update target } if model_device_key in dict_target_decode_tok_s_u: target_decode_tok_s_u = dict_target_decode_tok_s_u[model_device_key] @@ -1189,13 +1203,16 @@ def test_demo_text( "N150_Llama-3.2-3B": 62, "N150_Llama-3.1-8B": 120, "N150_Mistral-7B": 106, + "N150_gemma-3-4b": 65, # TODO Update target # N300 targets "N300_Qwen2.5-7B": 90, + "N300_gemma-3-4b": 125, # TODO Update target # T3K targets "T3K_Llama-3.1-70B": 204, "T3K_Qwen2.5-Coder-32B": 173, # `f10cs08` "T3K_Qwen2.5-72B": 240, "T3K_Qwen3-32B": 166.5, + "T3K_gemma-3-27b": 330, # TODO Update target } ci_target_decode_tok_s_u = { # N150 targets - higher is better @@ -1203,13 +1220,16 @@ def test_demo_text( "N150_Llama-3.2-3B": 35, "N150_Llama-3.1-8B": 21, "N150_Mistral-7B": 23, + "N150_gemma-3-4b": 35, # TODO Update target # N300 targets "N300_Qwen2.5-7B": 22.8, + "N300_gemma-3-4b": 35, # TODO Update target # T3K targets "T3K_Llama-3.1-70B": 15, "T3K_Qwen2.5-72B": 13.25, "T3K_Qwen2.5-Coder-32B": 21, "T3K_Qwen3-32B": 21, + "T3K_gemma-3-27b": 15, # TODO Update target } # Only call verify_perf if the model_device_key exists in the targets diff --git a/models/tt_transformers/demo/simple_vision_demo.py b/models/tt_transformers/demo/simple_vision_demo.py index fb32456c314d..ab03123cd351 100644 --- a/models/tt_transformers/demo/simple_vision_demo.py +++ b/models/tt_transformers/demo/simple_vision_demo.py @@ -499,12 +499,18 @@ def test_multimodal_demo_text( run_config = (tt_device_name, base_model_name, max_batch_size) targets_prefill_tok_s = { + ("N150", "gemma-3-4b", 1): 265, ("N300", "Llama-3.2-11B", 16): 22.4, + ("N300", "gemma-3-4b", 1): 350, ("T3K", "Llama-3.2-90B", 1): 15.3, + ("T3K", "gemma-3-27b", 1): 250, } targets_decode_tok_s_u = { + ("N150", "gemma-3-4b", 1): 20, ("N300", "Llama-3.2-11B", 16): 17, + ("N300", "gemma-3-4b", 1): 20, ("T3K", "Llama-3.2-90B", 1): 4.3, + ("T3K", "gemma-3-27b", 1): 12, } perf_targets = {} diff --git a/models/tt_transformers/tests/test_accuracy.py b/models/tt_transformers/tests/test_accuracy.py new file mode 100644 index 000000000000..2a8bb1a0eef0 --- /dev/null +++ b/models/tt_transformers/tests/test_accuracy.py @@ -0,0 +1,493 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 +import bz2 +import os +from pathlib import Path + +import pytest +import torch +from loguru import logger + +import ttnn +from models.tt_transformers.tt.common import PagedAttentionConfig, preprocess_inputs_prefill +from models.tt_transformers.tt.model import Transformer +from models.tt_transformers.tt.model_config import DecodersPrecision, ModelArgs, parse_decoder_json +from models.tt_transformers.tt.rope import get_rot_mats + + +def get_accuracy_thresholds(model_args, optimizations): + """Parse accuracy thresholds from PERF.md for the given model, optimization mode, and device.""" + # Read PERF.md + perf_file = Path(__file__).parent.parent / "PERF.md" + with open(perf_file, "r") as f: + content = f.read() + + # Split into sections based on optimization mode + sections = content.split("## ") + if callable(optimizations): + optimizations = optimizations(model_args) + first_decoder_conf = optimizations.decoder_optimizations[0] + target_section = next(s for s in sections if s.lower().startswith(f"{first_decoder_conf.__name__}\n")) + + # Parse the table and find the row for our model and device + # Potential lines have the form "| Llama-3.1-8b | T3K | 91 | 99 | 49.8 |" + base_model_name = model_args.base_model_name + device_name = model_args.device_name + correct_line = ( + lambda line: "|" in line + and base_model_name.lower() in line.split("|")[1].strip().lower() + and device_name.lower() in line.split("|")[2].strip().lower() + and not "(DP=".lower() in line.lower() # ignore DP/HP report for now + ) + rows = [ + line.split("|")[1:] # Each row starts with a separator + for line in target_section.split("\n") + if correct_line(line) + ] + if not rows: + raise ValueError( + f"Could not find accuracy data for {base_model_name} on {device_name} in {optimizations.__name__} mode" + ) + + assert ( + len(rows) == 1 + ), f"Found multiple rows for {base_model_name} on {device_name} in {optimizations.__name__} mode in PERF.md" + row = rows[0] + top1_acc = float(row[2].strip()) + top5_acc = float(row[3].strip()) + + # Allow for rounding + return top1_acc - 0.5, top5_acc - 0.5 + + +@torch.no_grad() +@pytest.mark.timeout(1200) +@pytest.mark.parametrize( + "prefill_len, decode_len, max_seq_len", # Max seqlen should be at least prefill_len + decode_len + ((512, 511, 1024),), + # ((131072-8192, 8192-1, 131072),), +) +@pytest.mark.parametrize( + "mesh_device", + [ + {"N150": (1, 1), "N300": (1, 2), "N150x4": (1, 4), "T3K": (1, 8), "TG": (8, 4)}.get( + os.environ.get("MESH_DEVICE"), len(ttnn.get_device_ids()) + ) + ], + indirect=True, +) +@pytest.mark.parametrize( + "optimizations", + [ + lambda model_args: DecodersPrecision.performance(model_args.n_layers, model_args.model_name), + lambda model_args: DecodersPrecision.accuracy(model_args.n_layers, model_args.model_name), + ], + ids=["performance", "accuracy"], +) +@pytest.mark.parametrize( + "paged_attention", + ( + True, + # False + ), + ids=( + "paged_attention", + # "default_attention" + ), +) +@pytest.mark.parametrize( + "page_params", + [{"page_block_size": 32, "page_max_num_blocks": 1024}], +) +@pytest.mark.parametrize( + "batch_size", + (1,), +) +@pytest.mark.parametrize( + "use_reference_file", + [ + pytest.param(True, id="reference_file"), + pytest.param(False, id="reference_text"), + ], +) +@pytest.mark.parametrize("device_params", [{"fabric_config": True}], indirect=True) +def test_tt_model_acc( + prefill_len, + decode_len, + max_seq_len, + batch_size, + paged_attention, + page_params, + optimizations, + 
mesh_device, + use_reference_file, + reset_seeds, + ensure_gc, + is_ci_env, + request, +): + if is_ci_env and not use_reference_file: + pytest.skip("CI test only runs vs reference file") + + dtype = ttnn.bfloat8_b + + json_config_file = request.config.getoption("--decoder_config_file") + if json_config_file: + optimizations = parse_decoder_json(json_config_file) + else: + optimizations = request.config.getoption("--optimizations") or optimizations + + # Load model args and tokenizer + model_args = ModelArgs( + mesh_device, optimizations=optimizations, max_batch_size=batch_size, max_seq_len=max_seq_len, cache_hf=True + ) + logger.info(f"Optimizations: {model_args.optimizations._full_name}") + + tokenizer = model_args.tokenizer + + # Load state_dict for TT model + logger.info("Loading weights...") + state_dict = model_args.load_state_dict() + logger.info("Finished loading weights...") + + # Load the reference data + + if use_reference_file: + # Existing reference file loading logic + reference_data_file = f"models/tt_transformers/tests/reference_outputs/{model_args.model_name}.refpt" + logger.info(f"Loading reference data from {reference_data_file}") + assert os.path.exists(reference_data_file) + reference_data = torch.load(reference_data_file) + reference_tokens = reference_data["reference_tokens"] + top5_tokens = reference_data["top5_tokens"] + else: + # Load and encode the reference text + current_file_path = os.path.dirname(os.path.abspath(__file__)) + prompt_file = os.path.join(current_file_path, "tale-of-two-cities.txt.bz2") + with bz2.open(prompt_file, "rt", encoding="utf-8") as f: + text = f.read() + + # Encode text to tokens + encoded_tokens = model_args.encode_prompt(text, system_prompt_text=None, instruct=False) + total_length = prefill_len + decode_len + 1 + reference_tokens = torch.tensor(encoded_tokens[:total_length]).unsqueeze(0) + top5_tokens = None # Will be computed during inference + + N = prefill_len + decode_len + input_ids = reference_tokens[:, : N + 1] # Shape [1, N+1] + + page_table_tt = None + paged_attention_config = None + + if paged_attention: + paged_attention_config = PagedAttentionConfig( + block_size=page_params["page_block_size"], + max_num_blocks=page_params["page_max_num_blocks"], + ) + # Implied shuffling of blocks + permutation = torch.randperm(paged_attention_config.max_num_blocks) + # Page table which maps virtual blocks to physical + reverse_permutation = torch.argsort(permutation) + page_table = reverse_permutation.reshape( + model_args.max_batch_size, paged_attention_config.max_num_blocks // model_args.max_batch_size + ) + page_table_tt = ttnn.from_torch( + page_table, + device=mesh_device, + dtype=ttnn.int32, + layout=ttnn.ROW_MAJOR_LAYOUT, + mesh_mapper=ttnn.ShardTensor2dMesh( + mesh_device, + dims=(None, -2) if batch_size > 1 else (None, None), + mesh_shape=model_args.cluster_shape, + ), + ) + + # Initialize TT model + tt_model = Transformer( + args=model_args, + mesh_device=mesh_device, + dtype=dtype, + state_dict=state_dict, + weight_cache_path=model_args.weight_cache_path(dtype), + paged_attention_config=paged_attention_config, + ) + # Initialize embedding + embd = model_args.reference_embedding() + state_dict_prefix = model_args.get_state_dict_prefix("", None) + embd.load_state_dict({"emb.weight": state_dict[f"{state_dict_prefix}tok_embeddings.weight"]}) + + # Skip prefill if prefill_len is 0 + if prefill_len > 0: + logger.info(f"Starting prefill...") + batch_id = 0 + input_prompts = [tokenizer.decode(reference_tokens[0, 
:prefill_len].tolist())] + ( + input_tokens_prefill_pt, + encoded_prompts, + decoding_pos, + prefill_lens, + ) = preprocess_inputs_prefill( + input_prompts, + tokenizer, + [model_args], + instruct=False, + max_generated_tokens=decode_len, + max_prefill_len=prefill_len, + ) + pt_prefill_input = [embd(input_tokens_prefill_pt[b]).view(1, prefill_lens[b], -1) for b in range(1)] + + # Pre-compute the rotational embedding matrix and send to device + if hasattr(tt_model, "rope_local_setup"): + # If local theta is set, use it to compute the local rope matrices + rot_mats_local = get_rot_mats( + head_dim=model_args.head_dim, + device=mesh_device, + seq_len=prefill_lens[0], + theta=model_args.rope_theta_local, + rope_scaling=None, + ) + else: + rot_mats_local = None + + rot_mats_prefill = get_rot_mats( + head_dim=model_args.head_dim, + device=mesh_device, + seq_len=prefill_lens[0], + theta=model_args.rope_theta, + rope_scaling=model_args.rope_scaling, + ) + prefill_input = model_args.prepare_residual_tensor_prefill( + pt_prefill_input[batch_id], + ) + + tt_out = tt_model( + prefill_input, + current_pos=None, + rot_mats_global=rot_mats_prefill, + rot_mats_local=rot_mats_local, + user_id=batch_id, + mode="prefill", + page_table=page_table_tt, + get_last_token=((decoding_pos[batch_id] - 1) // 32) * 32, + ) + + # Start decoding + logger.info(f"Starting decode...") + generation_start_pos = prefill_len + generation_length = decode_len + + # Initial positions + decoding_pos = [generation_start_pos] * model_args.max_batch_size + current_pos = torch.tensor([decoding_pos[b] for b in range(model_args.max_batch_size)]) + current_pos_tensor = ttnn.from_torch( + current_pos, + device=mesh_device, + dtype=ttnn.int32, + mesh_mapper=ttnn.ShardTensor2dMesh( + mesh_device, + dims=(None, 0) if (model_args.is_galaxy and batch_size > 1) else (None, None), + mesh_shape=model_args.cluster_shape, + ), + ) + + # Get cos/sin matrices for the current position of each user + rot_mats = tt_model.rope_setup.get_rot_mats(current_pos) + rot_mats_local = ( + tt_model.rope_local_setup.get_rot_mats(current_pos) if hasattr(tt_model, "rope_local_setup") else None + ) + # Print table header + if use_reference_file: + logger.info(f"{'Progress':<15}{'Correct':<8}{'True':<15}{'Actual':<15}{'Top 5 Predictions':<75}") + else: + logger.info(f"{'Progress':<15}{'Correct':<8}{'True':<15}{'Top 5 Predictions':<75}") + logger.info("-" * 113) + + top1_correct = [] + top5_correct = [] + errors = [] # New list to store error information + + for i in range(generation_length): + # Input is reference token at each step + ref_token = input_ids[0, prefill_len + i].item() + # Get the true next token (if available) + true_token = input_ids[0, prefill_len + i + 1].item() if i < generation_length - 1 else None + # Convert to torch tensor + ref_token = torch.tensor([[ref_token]], dtype=torch.int32) # Shape [1,1] + # Get embedding + pt_decode_input = embd(ref_token).view(1, 1, -1) + # Prepare input for TT model + decode_input = model_args.prepare_residual_tensor_decode( + pt_decode_input, + model_args.model_config["DECODE_RESIDUAL_MEMCFG"], + ) + # Run TT model + tt_out = tt_model( + decode_input, + current_pos_tensor, + rot_mats_global=rot_mats, + rot_mats_local=rot_mats_local, + mode="decode", + page_table=page_table_tt, + ) + + if tt_model.args.num_devices > 1: + cluster_axis = 0 if tt_model.args.is_galaxy else None + num_links = tt_model.args.num_all_gather_links if tt_model.args.is_galaxy else 1 + tt_out_gathered = ttnn.experimental.all_gather_async( + 
tt_out, + persistent_output_buffer=None, + dim=3, + multi_device_global_semaphore=tt_model.tt_ccl.get_and_cycle_ag_semaphore_handles(cluster_axis), + num_links=num_links, + memory_config=tt_out.memory_config(), + cluster_axis=cluster_axis, + topology=tt_model.args.ccl_topology() if tt_model.args.is_galaxy else ttnn.Topology.Linear, + barrier_semaphore=tt_model.tt_ccl.get_and_cycle_barrier_semaphore_handle(cluster_axis), + chunks_per_sync=10, + num_workers_per_link=2, + num_buffers_per_channel=2, + ) + + ttnn.deallocate(tt_out) + else: + tt_out_gathered = tt_out + tt_out_rm = ttnn.untilize(tt_out_gathered, use_multicore=True) + ttnn.deallocate(tt_out_gathered) + tt_out_tok = ttnn.argmax( + tt_out_rm, + dim=3, + keepdim=True, + use_multicore=True if model_args.max_batch_size == 1 else False, + ) + if not use_reference_file: + tt_logits = ttnn.to_torch(tt_out_rm, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=1))[0, 0, 0, :] + ttnn.deallocate(tt_out_rm) + + tt_argmax_token = ttnn.to_torch(tt_out_tok, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=1))[ + 0, 0, 0, 0 + ] + + ttnn.plus_one(current_pos_tensor) + + # Update rot_mats for next iteration + current_pos += 1 + rot_mats = tt_model.rope_setup.get_rot_mats(current_pos) + rot_mats_local = ( + tt_model.rope_local_setup.get_rot_mats(current_pos) if hasattr(tt_model, "rope_local_setup") else None + ) + # Modify the accuracy checking section when using reference text + if not use_reference_file: + # Get probabilities from model output + probs = torch.softmax(tt_logits, dim=-1) + _, tt_top5_tokens = torch.topk(probs, k=5, dim=-1) + + # Check against actual next token + true_token = input_ids[0, prefill_len + i + 1].item() + top1_match = tt_argmax_token.item() == true_token + top5_match = true_token in tt_top5_tokens + ref_top5_text = [tokenizer.decode([t]) for t in tt_top5_tokens] + else: + # Existing logic for reference file comparison + ref_top5_tokens = top5_tokens[prefill_len + i] + top1_match = tt_argmax_token.item() == ref_top5_tokens[0].item() + top5_match = tt_argmax_token in ref_top5_tokens + ref_top5_text = [tokenizer.decode([t]) for t in ref_top5_tokens] + + # Check top-1 and top-5 accuracy + top1_correct.append(top1_match) + top5_correct.append(top5_match) + true_match = ( + tt_argmax_token.item() == input_ids[0, prefill_len + i + 1].item() if i < generation_length - 1 else False + ) + + # Store error information vs reference model if top5 is incorrect + if use_reference_file and not top5_match: + context_start = max(0, prefill_len + i - 9) + context_tokens = input_ids[0, context_start : prefill_len + i + 1] + context_text = tokenizer.decode(context_tokens.tolist()) + incorrect_token = tokenizer.decode([tt_argmax_token]) + expected_tokens = [tokenizer.decode([t]) for t in ref_top5_tokens] + errors.append( + { + "position": prefill_len + i, + "context": context_text, + "incorrect": incorrect_token, + "expected": expected_tokens, + "predicted_id": tt_argmax_token.item(), + "expected_ids": ref_top5_tokens.tolist(), + } + ) + + sanitize = lambda x: repr(x)[1:-1] # Use repr() and remove the outer quotes + + # Decode tokens to text + tt_argmax_text = tokenizer.decode([tt_argmax_token]) + true_text = tokenizer.decode([true_token]) if true_token is not None else "N/A" + + # Prepare table row + progress_str = f"{i+1}/{generation_length}" + correct = "x" if top1_match else ("-" if top5_match else ("!" 
if true_match else " ")) + tt_argmax_text = sanitize(tt_argmax_text) + true_text = sanitize(true_text) + ref_top5_str = " ".join(f"{sanitize(t):<14}" for t in ref_top5_text) + + # Print table row + if use_reference_file: + logger.info(f"{progress_str:<15}{correct:<8}{true_text:<15}{tt_argmax_text:<15}{ref_top5_str}") + else: + logger.info(f"{progress_str:<15}{correct:<8}{true_text:<15}{ref_top5_str}") + + # Compute accuracies over every 100 tokens + num_tokens = len(top1_correct) + num_segments = (num_tokens + 99) // 100 + for seg in range(num_segments): + start = seg * 100 + end = min(start + 100, num_tokens) + top1_acc = 100 * sum(top1_correct[start:end]) / (end - start) + top5_acc = 100 * sum(top5_correct[start:end]) / (end - start) + max_width = len(str(decode_len)) + logger.info( + f"Tokens {start:{max_width}d}-{end:{max_width}d}: Top-1 accuracy: {top1_acc:3.0f} %, Top-5 accuracy: {top5_acc:3.0f} %" + ) + + # Report total accuracies + total_top1_acc = 100 * sum(top1_correct) / num_tokens + total_top5_acc = 100 * sum(top5_correct) / num_tokens + logger.info( + f"Total tokens {num_tokens}: Top-1 accuracy: {total_top1_acc:3.1f} %, Top-5 accuracy: {total_top5_acc:3.1f} %" + ) + + # Only show error summary when using reference files + if use_reference_file: + logger.info("\nError Summary (only showing errors where reference top-1 matches true token):") + logger.info("-" * 120) + for error in errors: + if error["position"] + 1 < input_ids.shape[1]: + true_token = input_ids[0, error["position"] + 1].item() + else: + true_token = None + if error["expected_ids"][0] == true_token: + sanitize = lambda x: repr(x)[1:-1] # Use repr() and remove the outer quotes + context = sanitize(error["context"]) + incorrect = sanitize(error["incorrect"]) + expected = " | ".join(sanitize(t) for t in error["expected"]) + true_word = sanitize(tokenizer.decode([true_token])) + logger.info(f"{error['position']}: {context}[{incorrect}] != [{expected}], true: [{true_word}]") + + if use_reference_file: + logger.info(f"Top-1: {total_top1_acc:.0f}% | Top-5: {total_top5_acc:.0f}%") + + if not json_config_file: + # Get accuracy thresholds from PERF.md, unless the configuration is from a json + min_top1_acc, min_top5_acc = get_accuracy_thresholds( + model_args, + optimizations, + ) + assert ( + total_top1_acc >= min_top1_acc + ), f"Top-1 accuracy {total_top1_acc:.1f}% is too low (expected >={min_top1_acc}%)" + assert ( + total_top5_acc >= min_top5_acc + ), f"Top-5 accuracy {total_top5_acc:.1f}% is too low (expected >={min_top5_acc}%)" diff --git a/tests/scripts/single_card/run_single_card_demo_tests.sh b/tests/scripts/single_card/run_single_card_demo_tests.sh index 37401b51ba40..02cb92ea8266 100755 --- a/tests/scripts/single_card/run_single_card_demo_tests.sh +++ b/tests/scripts/single_card/run_single_card_demo_tests.sh @@ -25,6 +25,12 @@ run_qwen7b_func() { } +run_gemma3_4b_func() { + + HF_MODEL=/mnt/MLPerf/tt_dnn-models/google/gemma-3-4b-it MESH_DEVICE=N300 pytest -n auto models/tt_transformers/demo/simple_text_demo.py -k performance-ci-1 --timeout 1800 + +} + run_qwen25_vl_func() { fail=0 diff --git a/tests/scripts/t3000/run_t3000_demo_tests.sh b/tests/scripts/t3000/run_t3000_demo_tests.sh index 6086ae93731c..54f9f6920faf 100755 --- a/tests/scripts/t3000/run_t3000_demo_tests.sh +++ b/tests/scripts/t3000/run_t3000_demo_tests.sh @@ -153,6 +153,30 @@ run_t3000_qwen3_tests() { fi } +run_t3000_gemma_3_27b_tests() { + fail=0 + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_gemma_3_27b_tests" + + 
wh_arch_yaml=wormhole_b0_80_arch_eth_dispatch.yaml + + # Gemma-3-27B + gemma3_27b=/mnt/MLPerf/tt_dnn-models/google/gemma-3-27b-it + mesh_device=T3K + + MESH_DEVICE=$mesh_device HF_MODEL=$gemma3_27b pytest -n auto models/tt_transformers/demo/simple_vision_demo.py -k "batch1-notrace" --timeout 1200; fail+=$? + echo "LOG_METAL: Gemma-3-27B tests for $mesh_device completed" + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_gemma_3_27b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi +} + run_t3000_llama3_vision_tests() { # Record the start time fail=0 From 70969674061734a2ab8381efb5cbd86e5b136525 Mon Sep 17 00:00:00 2001 From: MohammedTaherMcW Date: Fri, 12 Sep 2025 09:54:14 +0000 Subject: [PATCH 2/4] Add Performance and Accuracy metrics for Gemma-3-1b-it --- .../workflows/single-card-demo-tests-impl.yaml | 2 ++ models/tt_transformers/PERF.md | 2 ++ models/tt_transformers/demo/simple_text_demo.py | 6 ++++++ .../tests/generate_reference_hf.py | 0 .../tests/reference_outputs/gemma-3-1b-it.refpt | Bin 0 -> 51061 bytes .../single_card/run_single_card_demo_tests.sh | 8 +++++++- 6 files changed, 17 insertions(+), 1 deletion(-) mode change 100644 => 100755 models/tt_transformers/tests/generate_reference_hf.py create mode 100644 models/tt_transformers/tests/reference_outputs/gemma-3-1b-it.refpt diff --git a/.github/workflows/single-card-demo-tests-impl.yaml b/.github/workflows/single-card-demo-tests-impl.yaml index 93f49b61c218..f429366cc5cc 100644 --- a/.github/workflows/single-card-demo-tests-impl.yaml +++ b/.github/workflows/single-card-demo-tests-impl.yaml @@ -115,8 +115,10 @@ jobs: { name: "qwen25_vl", runner-label: "N300", performance: true, cmd: run_qwen25_vl_func, owner_id: U07RY6B5FLJ}, #Gongyu Wang { name: "gemma3", runner-label: "N150", performance: true, cmd: run_gemma3_perf, owner_id: U08TJ70UFRT}, # Harry Andrews { name: "gemma3", runner-label: "N300", performance: true, cmd: run_gemma3_perf, owner_id: U08TJ70UFRT}, # Harry Andrews + # { name: "gemma3_1b", runner-label: "N150", performance: true, cmd: run_gemma3_1b_func, owner_id: }, # TODO Owner ID needs to be updated # { name: "gemma3_4b", runner-label: "N300", performance: true, cmd: run_gemma3_4b_func, owner_id: }, # TODO Owner ID needs to be updated ]] + steps: - name: Compute tests shell: bash diff --git a/models/tt_transformers/PERF.md b/models/tt_transformers/PERF.md index 750bbe78257e..74edbe0ae664 100644 --- a/models/tt_transformers/PERF.md +++ b/models/tt_transformers/PERF.md @@ -51,6 +51,7 @@ This configuration uses bfp4 MLP and bfp8 attention weights for all models excep | Phi-3-mini-128k-instruct | N150 | 89 | 99 | 45.0 | 73.32 | | Phi-3-mini-128k-instruct | N300 | 89 | 99 | 60.87 | 114.94 | | Mixtral-8x7B-v0.1 | T3K | 95 | 100 | 67.82 | 53.93 | +| gemma-3-1b | N150 |32 |48 | 53.3 |59.9 | | gemma-3-4b | N150 | 78 | 95 | 34 | 68 | | gemma-3-4b | N300 | 78 | 95 | 35 | 125 | | gemma-3-27b | T3K | 90 | 99 | 16 | 331 | @@ -94,6 +95,7 @@ Llama 3 models test as insensitive to attention precision and so we use bfp8 att | Phi-3-mini-128k-instruct | N150 | 94 | 99 | 40.41 | 82.58 | | Phi-3-mini-128k-instruct | N300 | 94 | 99 | 57.0 | 115.36 | | Mixtral-8x7B-v0.1 | T3K | 95 | 100 | 67.82 | 53.93 | +| gemma-3-1b | N150 |32 |48 | 51.0 |62.02 | | gemma-3-4b | N150 | 88 | 98 | 30 | 79 | | gemma-3-4b | N300 | 86 | 98 | 32 | 135 | | gemma-3-27b | T3K | 91 | 100 | 15 | 361 | diff --git a/models/tt_transformers/demo/simple_text_demo.py 
b/models/tt_transformers/demo/simple_text_demo.py
index d586fb459943..56cbf41763f0 100644
--- a/models/tt_transformers/demo/simple_text_demo.py
+++ b/models/tt_transformers/demo/simple_text_demo.py
@@ -1055,6 +1055,7 @@ def test_demo_text(
         "Llama-3.2-11B",
         "Llama-3.1-70B",
         "Mistral-7B",
+        "gemma-3-1b",
         "gemma-3-4b",
         "gemma-3-27b",
     ]
@@ -1107,6 +1108,8 @@ def test_demo_text(
         "T3K_Mistral-7B": 45,  # TODO Update target
         "TG_Mistral-7B": 45,  # TODO Update target
         #
+        "N150_gemma-3-1b": 52,  # TODO Update target
+        #
         "N150_gemma-3-4b": 23,  # TODO Update target
         "N300_gemma-3-4b": 38,  # TODO Update target
         #
@@ -1203,6 +1206,8 @@ def test_demo_text(
         "N150_Llama-3.2-3B": 62,
         "N150_Llama-3.1-8B": 120,
         "N150_Mistral-7B": 106,
+        "N150_Qwen2.5-7B": 60,
+        "N150_gemma-3-1b": 62,  # TODO Update target
         "N150_gemma-3-4b": 65,  # TODO Update target
         # N300 targets
         "N300_Qwen2.5-7B": 90,
@@ -1220,6 +1225,7 @@ def test_demo_text(
         "N150_Llama-3.2-3B": 35,
         "N150_Llama-3.1-8B": 21,
         "N150_Mistral-7B": 23,
+        "N150_gemma-3-1b": 52,  # TODO Update target
         "N150_gemma-3-4b": 35,  # TODO Update target
         # N300 targets
         "N300_Qwen2.5-7B": 22.8,
diff --git a/models/tt_transformers/tests/generate_reference_hf.py b/models/tt_transformers/tests/generate_reference_hf.py
old mode 100644
new mode 100755
diff --git a/models/tt_transformers/tests/reference_outputs/gemma-3-1b-it.refpt b/models/tt_transformers/tests/reference_outputs/gemma-3-1b-it.refpt
new file mode 100644
index 0000000000000000000000000000000000000000..06ed898e6dcb7f6e60d40f4a9908bb4cd26b01d6
GIT binary patch
literal 51061
[51061 bytes of base85-encoded binary data omitted]
z75;^v^1K*-{my>N^Ht<#J%HTI=jcg1B+ei&{t0d^Enmt6$y?8H<4%{?^f@qmiKm>q zXLtclyyaYZPE*nQ)XyoZJ*oUDK9_|a@?sbC!Y=dGEt>0eo>TN);5>MvH~P->`8@MA zc1X2;*yDW950jh{w|$Hgv5tfY&sDyR3^Hvx02QTD<5Bx)Z_;VaCpLyNMIVSpp5B^lm z=fsuo)VKo|#@o?dWxsIT?f3qwO|I%cI$HfdFuf^RDu*7p76*8vFM6Ukc!zq&;S9du zv2vjJzvBB*?FK0i&coI#AM(YXA%EzG{)gY;hatY;jz6OBf)B+1iX^3%{a*O(R1z)h z?s6p&OTnP0BWl!Jl8 zlRh5+AMEgh@5zA&`WGH4d2S!2boyYWv;#-x<((eytnZLJ)Cc+Jid+PK_~pWG3Y_cy zs0PGU{E>AF>lo&5##_iYzHq5 zN)T{_FZAR1$Kl634USJ5hjVs{&v(Z8tQ_U{_SHDp?ed#1QTd$qN_*MAerTjT`0aUK z`L&aj_BTL>{YH882<1mFa4ZYIoY|5uT<2gnaAkgmAN=71U+fr%AAZMtf#2b0k9mG# zKDfs73vmv7@FUhahZ#re%RB#~4|1aidX{8+z$ z1A2o;w%nw2Uc9n~0NUjJ(eHh~yZ2QZzyD~V4tCop*ZrXb{XCbW2Xcq{;6EXLj9b<< z;KzKr?HMs(yb>3|6TIU1!?(QnHL~Ay^7>~*P4T~PfYOsIDgBG@hp^sD_qo$=-G99I zf8Z4QAN~uTtk-VP@Qb*|%DKyV0X^(Of9w<513O?B@B)|e+9Sms-|@Lz@-4yvdtk3P zKG+9&LVFMg@MGqu4?4*oI~Z5&bLFYx|L4jF+XFj1==N+54=)Nl)p2t&gm!Iz!&_m6S#qA zTs$BSupTC^gB$BO@MFKhJ_Fpq2mcFkBtA1w5|8OOxw#zw0iS35$@9P>-`7ZSoNw%P z_{=1=-{H8^$!W>!#0UOxb$~4&r!dH|!MpIp<#3t?3t{v)1bx?D+CHm1B40eqgA| zlTzfT4|;!8-FEEKO!YrHKW3W#XX)!I$G+`6-;K3joaz0<)%j`{+84WG=R7rwz!jXq z6}w`G^4j;=jUwFOe(5)`tsL+i@1t4ohJ4`zfA9+Ji(RoZ_Qjsqmva!t>w=Rc7v-lJ z2ylA4NcD@)k=&FYW1MGsyn$bi@uMAi>cJa7LSOU)@3L?v?s4wKJ|ox9cc+Y&yqqJS z)kFTmID;Sj!8zn#7S5axJmh%trtb|btsx*j^F02%*F)gKIAL8H;)GntgS^OzAB6q~ zF5tnu3J!~|m!RxlS!dv%@D2GP2X=|W1;1lGw%2~b_(yNn58y!>;*e{9Dr;OWay#%T zFD|b-PL=L&<_MpQ9v4Ay0~hFn2RLDmGI4QSVjd(8@|@?RHR8*;FSsyXh`aPB?jj%Z zARqQZK5)TLIKRbD@Ei1JU-*w<8c&<{X&miaE;-=?F315t(_c(1kyI zkpuoAF3fBA4fwEbXPv_NDeKSNGsGWU;_!$&=SM!`3*)K0xD@+6tO1TE%s;H>-WnzU zflr7Jcz{nFF3dyV@{@*3lqB+{-vO!SJa~|T3-N_G&AH*lnZoh>QA!t`ue9$~O7Gb& zh9SMs@4`5aUZMWrU0$5Q@dv+v&b|ly!9C;yzW5{ZggA444Zh5C>~HWl)>YxTHn@c2 zl6XdZrycg#I9a$3&epv27aMk34dK4zSfyWh|1#V5dAFL%Lwcb-c%B*RAL8_%`aO04 zckIJB2;(Gn2>n0K9?Tcmg}A_aD%2al250cA=ye%5f**K>bVB@)I}ShU(JRiMnO~T1 z-d-qsvo717BzsQuy}nys)j)=RXs=M7(4V>gfggkCWaBa0>y%Ud9?lPA#DA9Sxo=q8 z@dSLqYhO#XCyvZNUisSozD_2ji=(cI@NJY6rgH zIagwoK9|KmX~(*XeLc80nW&C0R#yrx+{56$2KU!E2cVvF73xpvB|7Zi;_h$kw0)2d zdG4?sxrgwx_vwtMIlWE)My1c4rF7#NO1aMg-q-^?u?yo7eRp-0y>D~eTKbam><`$7 z@;rLnBdTBbo)lp}8^;g3=1R=cef&J;~L`-oY-G-PbAfNaxazm`Tkll;NB{H z@EiCb2lE?zumkf;SMMXamkEA6p8zk;!NT(~?7}_^`(qE}Le5Y=@F8BYPXG^|!xOKV zZ{zNHaE=9EaDYGjkRuKk);a&GDZH6)nTLn`R^^PBTBj+$tgTY~bHnq>gVXm4x24}N zV4k2J-0)BIh?5`v%8MW4o%)h3KF4+e5A+8o>;R5s*#p1le2)7ioI`Q1WYgco z7{2gOevyR#GzK3Ok0!3BRRHh%aico7fo@I6x2 ze^>clCvk%Me82B=TbB6H?)%(>`Q|8Fy5Dzo>HCi0050GZ;(|T655oM9U9bn|gS2B` zhaLXz4_t-(kR#+9XAjPQh|}0%RGubo;t2L&U4T7)a=ZrraNmMG3Vn~^8v8$X;GPcj zvW+u%gL9tC(GR`xw-9e|4)G-pfHUWf-1FcZl5-^DGkoC_@(1q_XYdW@LvY34IcH=Y zjvc@Se80a`j_>VisXx-Ljn`lG{rqb~dja=yW2NW;Zs>tt%sX-Tfg3o24|s+5EW-6E zd_#W75#kr(2A<#-;&t98=?Sjj20q#LBjPIG*}_l1@pCTxllS^qPn@?|1mHWw^B(r< z{Dk6G$nXElpRq&SJrCkMet;dY3wFQ{LVFM=umkoP>wPEZ8?QSqgy$UK0{+ZvtOKwY z^2N!;cnkd$zYOCkei(OugL5?SfIoO32Yf?Zz~LA8_&MDl-({{~}0?gYyk6OA{ z<=mIb*&@Jr525~8)$<+#=aPR-RXO;@J?Bb&PVJBzeUKYHz!Chw3;e*1d4>HG_Yb&t zz&(V@n-ve>2Yuv#FLK1;2M)ZCLA)Ujp6&HK`}FW!54`cKN`7y9L;r*E!{2gLTzY?* z^BCkt-cT>}0Y7j8KkzKY(eXC!UO^S#FN81nh5X~q??R7u)aR;sl;@OriuTO&9UX6&*Qk%9 zx54W(aGq=W$F7kcoV$WQbXi}cH}>Zol=ntBH{#rt=Q?Gbvx56h>%)6;;W*)Z6}~fl z?ppT^DZ+dIZ}w4s?M9`nS3`Vw-+!C;Wvl}psiJ<2uRSL#4<2#&a()UQ&Idw0I1lCf zO1zigFZNhav}fTZt6GmE-~i&`V z@T*`v!SRN+q6hx;-($OmbeJdL2afn7IDsE|+JR@jaYsJz12^!ZzPN_u1V74=r(b8) zgDZYB!}wo1TzEb2=L>xT|AQWS5!ag7k1rf6`p0_QUQ|ta+9L<}h4iQgPx^ri-}9jz z^Evok>G%&_+SN3UA%4_PIYDyL5Bxs#IHa8RA%484jNdWtihmYk{O=XVVZMj_vmca# zM~iKeqvaUcp}{@Mzw7hXbp^_Ep6t)DmA?P*@lE3QVY=Eu7d%5fh=bsae}?lkaT%OL zI-$RUGxTW>&fo~n)PpPWns)dp_!2Ll@c9~X5S)A1Z`l`v!yB!H%gerRgx}v|yWH>b 
zbGB3ZAs;xR7xIB0ej4H!;>WlKw=ccFf)DuZaeQIC&+|BiPlzA#fFE*#8#peTARIP2 zF2fi6(1&vT75XoDzf#$8;^;Q=*Dbz)-MO{s^FC`G$ALb7m4Eg$-orgl3`$ZUG2>-~%0S1TX5z6A$pavizBLJLDD-_<<9+(T=zd ze&9v?1vlgcw+@4a$JS?*9(IOO=4tSwA9#g0@VpJ&*jLqmN%Y#*k|Oj2FX-Zr;1q`+ zxUsIl|3Z4ih0y=NFT{g&0Qe!-r0S9%`m7J|zri=Fp7r6+zfpdy*Kgp5-qn2$`LdQ-0!!O0OQMl=%=k;KO&H!H;-HJ^qKEfG_(H@B=T#FYSqg_#b`;j^LEi zNOFQ-h%a*EkF0e0!;bM9S3)NiowLCt$x`IV0;1t;X8 zKXkzd{K%K`7n#z1H~7&Xdf5-@5-I&v)oi z+x3jkfo@mDDZlqZ`@eY}e8l}I$A7RNzpI2^W#Ix}{OC3FXIW76 z;B~9z4RIN~M>wGexPZsjo~m!;^$gzy0~h?Eb0@V6aiJbRVBP}{@CtFE9e$H1@)3A| z1Gs?4_q){)y}{u@e+Rd4u;M7V;UDM^J~t+d&W<#tU)f(C@HksISLOHE{;WryvR%Lr ze?mX}gnG(*nhyHXZue~Q88}qw)P7DcRJ!{brL_Oq@27z;^ci2|@v}?3PTJ>n7k8jMQ`mfO2LY>}$xbyo|e>_R`hux|4Y~ymf&%sWyA52P8{XVZhm)S2Sx;*rI;sJgg+9w6|ISyS!%S?|MoE04c${zHFo3GIs>uNW`#*n#%UANT`wIS-?p-)Ev9 z?b=!|{0@6|Hg4eljd8;sjBDud`)BkAr}u1!jN^p^^Yu3KO?5f*{H(QV$L}7lF`jpN zABH`fZ&ACi>M5Ogi_*Lfnh@7|yxlZP<>%P1ns|NqiS0Ah7KXh zxZR$?s(*2zQqBoqb-R&QsJuqLQsM{pxYTknUa%{8g*g74UF zetCk@J&ubxwUkfu`k<=q$GI-!2L8l5aR1Tkqbf&;4)x4K*q!!ozo2&Lfgdp5LcZbe z6(R>XZ|@;`A-_)c%l>Jqf7)`Ar=D`;?BjFA8Mfn`Q`EoC9g0I$T)xHU8OTGNCO#AY z7(jRTE+>#g)V-< zeGB3M^cfE~m@ngo?_q{~2M!m%UiNpMyHxM0^7kwU{4aO=P%iR}i#_)DE<;2ol#h1E z1ztCs-!07VPVaV{K|aR857ra;@Jr~&@dcNUef|{kr9b?h@H(t_b?LLpks6?U%`X+_#JuhLtf@r&iRp-_O$C~{ku04f9m-z_{RHH&UzC0`OeVLWYs@zoU8l1 zeN(2&FW#iIc#P6}9`^d`apmizDFt`*#!t};9MIzr9{1?M`i6J~9r}SE^uUiibfEu{ zabett;}`mjTl$kSF7eOzJwBn|%jZNnU8Mg)$ESSrr9Xa(oT0z+yVUSy{K5zR&mEb_L66b&q04%L-|L1?u_2ZLrOXRgZ-Fy;^d<}a%PVezh}ECJ>2^y+JO_eP#*dZ zIKh{8AwG-~evgE8;9j|D^v_9>V};{1a@8{~RWejhyMHxMo^cY6=NbN?5atE^68!L2 z{FCqQ6Q{u&eD_u}f6I&AS=XUI@qB^(i*bFd_4<2%(HZrL3RbjH9$euU%E|K$#$6xB z{rg+0Kk^KAyo=KpJqIVNALH(R$8GF{9hrZjU)AvkKFITA3(?!){cfu5w#46U`n0#| z(Tfzi!~t-8#CQ=8n8%sdS$7i`phJJ;zs36u_)&hc%dsEzwau6N03UjMv~zp>Hs9?w zT2FqTy};$j!+BZAhkE$2o_xf1=o8R+qmmr&T;spg^BnWyFw4=&{qMLzc;m~|_qUBx|6V>PzG8s#xw%SNU$jb6zV|rE_m=l-@PiNjh&?i4mel4(e-J;e29G+$4|JPc|D|u{?LIgIC4J6e&J@z$vGYU<8Xrx`ZAu_ zH*zkB-rye6Wj{bXC$16Ch-27^c*lN%`^21+ah|rt`{;*_XD6rN&HXCk5qcmm^C0q~ zFVDAl4#+*Lx*pfu*T9chUqXj*wd7U#DSSE4qdmW$ggo~(Rr|LJm8SUl7xVw;pKBb` zAAayjwf?MI*{AS*o9SLJuJAeQRL29x1@xZu{Jqli7wzfK_k+H@T=HDu^&9g!^spy% z@egpmZn^a2eDTde${*)-S8k^A(1jj;1l`~H{@L8?)PCscO5ZqJ>6w30`mpKK4|(wa zkUsf1e`Q~cpW>JJE9bl116*zU+~INEb)X7{O_zMNJU)>R9J$xPdz;HW&kzr2kKfae zxWIWV=Tj}b|LW>_qw{=W88#^-mIgLc@D=l!(jTo?I}3pwe>_$H6s#4XnK=#g*yc;3Q1a*O5t*y9g< z(GxpHwtu$u2-|@Jc!NLsf(!fYP*41rc#9tBN8AMu@EOB;%<}znvg`uAs?C(Y{1m05 z|ETl==*U8VGqBsIS&SiKWN_@w^zeNspEB!ro{2bh9cWo2NLp^++ zt)_D9K)(yEZ#X~vm@J0GLC*IoSpPqF5IyRNTf||`=}-Jgzz{!QUZXtyHq8*74*OLO zohz?bc^EGb_rT2=r}mZKRyu8v+BbK+AwEtpel1T^eO)z=nm?%g6JILDFBo6l+o>Eq zh)0)ueY@mE)o<`RZR)kkuU;w!bJhyK9>#at7}aymUpGm4_;F8h%^4~W`C~`?mGR3s zWu73DBeQMd-)q!FN1<_H(Q6W{PgQ`#(5O`6EwI`nuOU#6iX(`Z6Bz z6H>})$2>%T+6_8VeAovvUi+S>^23J;&>z22eql%X&sE1OoqewAs~LCvk@#_s&q0YJ z$OAs$LR`g(x z4!NOAJNkuwN_~Oxq8;-OcIoMQ?8v@_dg7?VPQ_UD)XMtSy8@PQ8f z(WlsQpfB>G2l4p^$FsGki~jBrN{Po67OKI&UQjvq80UWQ!!GcnU06H+6Vitc@<3;gpBoU@vZZ$E@BK0VW6vDdqX*9=p;uT<4QNlA<~YcD zgZF{Cr@p~*@cfQ`(@hWivcI0z%=5!D2lImt{m~b^m>-ER`Su&`>+#(UzT+{io%le9 zxPiYz2m2G3X#c3=HgJMMQP4|M2Hd&VK{@n7iRulNV_;7iK7 z8hnX|e(gVkFXgl=u-&%Xt;gdEmpk<2sM~$Bb7a-?!krg>?w^w5J{Gm9tC-IlvJ)kqbOoPm&M&ecf61 zpdNa(3+eJZzwGbw?O)7myf3sbS#(pq-yu#la{Jf*B!8%JuF`AlH~e0}WA+=~XZSF< z|A0RxyIlpRyF@NRuKNe5V&~sfar9;hfS-kQLweYk^WE#cpJQIazo3I0;0QhZI1bkj zj0@|+A@-BM8prPE3eW0}XOZLPxz-!|p%?t&OFiYZLof6U+wC##v}e3!OPq-LlzDWH z%h^wluP?=>6bt{4yl_E|@qJWIo^i+fxA-699y!4s`R_JE?r6;04Y+hv0tU--ikiXKu?|;xZhq(5@N7yZzi=evBL6MoRa4$$NI<&g9$ zRWDln!1jNX+)qjT83o9K-Lpht~aRm3J~<*3;lT+v690Gkw1Nf}d|-7ve>ZkppMu1^hmDkL0KQ<9=Q} 

diff --git a/tests/scripts/single_card/run_single_card_demo_tests.sh b/tests/scripts/single_card/run_single_card_demo_tests.sh
index 02cb92ea8266..70914e74aab7 100755
--- a/tests/scripts/single_card/run_single_card_demo_tests.sh
+++ b/tests/scripts/single_card/run_single_card_demo_tests.sh
@@ -27,10 +27,16 @@ run_qwen7b_func() {

 run_gemma3_4b_func() {

-  HF_MODEL=/mnt/MLPerf/tt_dnn-models/google/gemma-3-4b-it MESH_DEVICE=N300 pytest -n auto models/tt_transformers/demo/simple_text_demo.py -k performance-ci-1 --timeout 1800
+  HF_MODEL=/mnt/MLPerf/tt_dnn-models/google/gemma-3-4b-it MESH_DEVICE=N300 pytest -n auto models/tt_transformers/demo/simple_text_demo.py -k performance-ci-1 --timeout 1800
+}
+
+run_gemma3_1b_func() {
+
+  HF_MODEL=/mnt/MLPerf/tt_dnn-models/google/gemma-3-1b-it MESH_DEVICE=N150 pytest -n auto models/tt_transformers/demo/simple_text_demo.py -k performance-ci-1 --timeout 1800
 }

+
 run_qwen25_vl_func() {
   fail=0

From 3a7918710498c2ed164d701c1cb7aa091c8b21da Mon Sep 17 00:00:00 2001
From: MohammedTaherMcW
Date: Wed, 24 Sep 2025 14:59:39 +0000
Subject: [PATCH 3/4] Add Mask support in test_accuracy

---
 models/tt_transformers/PERF.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/models/tt_transformers/PERF.md b/models/tt_transformers/PERF.md
index 74edbe0ae664..f41b5d54dd93 100644
--- a/models/tt_transformers/PERF.md
+++ b/models/tt_transformers/PERF.md
@@ -54,6 +54,9 @@ This configuration uses bfp4 MLP and bfp8 attention weights for all models excep
 | gemma-3-1b | N150 |32 |48 | 53.3 |59.9 |
 | gemma-3-4b | N150 | 78 | 95 | 34 | 68 |
 | gemma-3-4b | N300 | 78 | 95 | 35 | 125 |
+| gemma-3-1b | N150 | 84 | 96 | 53.3 | 59.9 |
+| gemma-3-4b | N150 | 76 | 93 | 34 | 68 |
+| gemma-3-4b | N300 | 77 | 94 | 35 | 125 |
 | gemma-3-27b | T3K | 90 | 99 | 16 | 331 |

 ## Accuracy
@@ -98,6 +101,9 @@ Llama 3 models test as insensitive to attention precision and so we use bfp8 att
 | gemma-3-1b | N150 |32 |48 | 51.0 |62.02 |
 | gemma-3-4b | N150 | 88 | 98 | 30 | 79 |
 | 
gemma-3-4b | N300 | 86 | 98 | 32 | 135 | +| gemma-3-1b | N150 | 93 | 98 | 51.0 | 62.02 | +| gemma-3-4b | N150 | 84 | 97 | 30 | 79 | +| gemma-3-4b | N300 | 85 | 97 | 32 | 135 | | gemma-3-27b | T3K | 91 | 100 | 15 | 361 | ## Long-context (64K Tokens) From 5bf58c04e16986714762f9c0e5169c87793f55c0 Mon Sep 17 00:00:00 2001 From: MohammedTaherMcW Date: Thu, 25 Sep 2025 12:57:53 +0000 Subject: [PATCH 4/4] Updated Perf metrics --- models/tt_transformers/PERF.md | 22 +++++++------------ .../tt_transformers/demo/simple_text_demo.py | 12 +++++----- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/models/tt_transformers/PERF.md b/models/tt_transformers/PERF.md index f41b5d54dd93..8e9934342fc6 100644 --- a/models/tt_transformers/PERF.md +++ b/models/tt_transformers/PERF.md @@ -51,13 +51,10 @@ This configuration uses bfp4 MLP and bfp8 attention weights for all models excep | Phi-3-mini-128k-instruct | N150 | 89 | 99 | 45.0 | 73.32 | | Phi-3-mini-128k-instruct | N300 | 89 | 99 | 60.87 | 114.94 | | Mixtral-8x7B-v0.1 | T3K | 95 | 100 | 67.82 | 53.93 | -| gemma-3-1b | N150 |32 |48 | 53.3 |59.9 | -| gemma-3-4b | N150 | 78 | 95 | 34 | 68 | -| gemma-3-4b | N300 | 78 | 95 | 35 | 125 | -| gemma-3-1b | N150 | 84 | 96 | 53.3 | 59.9 | -| gemma-3-4b | N150 | 76 | 93 | 34 | 68 | -| gemma-3-4b | N300 | 77 | 94 | 35 | 125 | -| gemma-3-27b | T3K | 90 | 99 | 16 | 331 | +| gemma-3-1b | N150 | 83 | 95 | 58.93 | 61.95 | +| gemma-3-4b | N150 | 86 | 97 | 36.19 | 64.96 | +| gemma-3-4b | N300 | 86 | 98 | 37.4 | 120.38 | +| gemma-3-27b | T3K | 91 | 99 | 16.73 | 356.81 | ## Accuracy @@ -98,13 +95,10 @@ Llama 3 models test as insensitive to attention precision and so we use bfp8 att | Phi-3-mini-128k-instruct | N150 | 94 | 99 | 40.41 | 82.58 | | Phi-3-mini-128k-instruct | N300 | 94 | 99 | 57.0 | 115.36 | | Mixtral-8x7B-v0.1 | T3K | 95 | 100 | 67.82 | 53.93 | -| gemma-3-1b | N150 |32 |48 | 51.0 |62.02 | -| gemma-3-4b | N150 | 88 | 98 | 30 | 79 | -| gemma-3-4b | N300 | 86 | 98 | 32 | 135 | -| gemma-3-1b | N150 | 93 | 98 | 51.0 | 62.02 | -| gemma-3-4b | N150 | 84 | 97 | 30 | 79 | -| gemma-3-4b | N300 | 85 | 97 | 32 | 135 | -| gemma-3-27b | T3K | 91 | 100 | 15 | 361 | +| gemma-3-1b | N150 | 92 | 99 | 56.32 | 52.57 | +| gemma-3-4b | N150 | 90 | 100 | 31.65 | 77.72 | +| gemma-3-4b | N300 | 90 | 99 | 34.94 | 142.08 | +| gemma-3-27b | T3K | 96 | 100 | 15.74 | 359.51 | ## Long-context (64K Tokens) diff --git a/models/tt_transformers/demo/simple_text_demo.py b/models/tt_transformers/demo/simple_text_demo.py index 56cbf41763f0..0638781c387e 100644 --- a/models/tt_transformers/demo/simple_text_demo.py +++ b/models/tt_transformers/demo/simple_text_demo.py @@ -1108,12 +1108,12 @@ def test_demo_text( "T3K_Mistral-7B": 45, # TODO Update target "TG_Mistral-7B": 45, # TODO Update target # - "N150_gemma-3-1b": 52, # TODO Update target + "N150_gemma-3-1b": 20, # TODO Update target # - "N150_gemma-3-4b": 23, # TODO Update target - "N300_gemma-3-4b": 38, # TODO Update target + "N150_gemma-3-4b": 11, # TODO Update target + "N300_gemma-3-4b": 12, # TODO Update target # - "T3K_gemma-3-27b": 38, # TODO Update target + "T3K_gemma-3-27b": 10, # TODO Update target } if model_device_key in dict_target_decode_tok_s_u: target_decode_tok_s_u = dict_target_decode_tok_s_u[model_device_key] @@ -1225,8 +1225,8 @@ def test_demo_text( "N150_Llama-3.2-3B": 35, "N150_Llama-3.1-8B": 21, "N150_Mistral-7B": 23, - "N150_gemma-3-1b": 52, # TODO Update target - "N150_gemma-3-4b": 35, # TODO Update target + "N150_gemma-3-1b": 20, # TODO Update target + 
"N150_gemma-3-4b": 11, # TODO Update target # N300 targets "N300_Qwen2.5-7B": 22.8, "N300_gemma-3-4b": 35, # TODO Update target