diff --git a/.github/workflows/single-card-demo-tests-impl.yaml b/.github/workflows/single-card-demo-tests-impl.yaml index 4dfc2d1005ba..7e3b10e6c374 100644 --- a/.github/workflows/single-card-demo-tests-impl.yaml +++ b/.github/workflows/single-card-demo-tests-impl.yaml @@ -92,6 +92,8 @@ jobs: # # Moved to t3k tests until OOM on single card runners resolved # { name: "qwen7b", runner-label: "N300", performance: false, cmd: run_qwen7b_func, owner_id: U03PUAKE719}, # Mark O'Connor { name: "qwen25_vl", runner-label: "N300", performance: true, cmd: run_qwen25_vl_func, owner_id: U07RY6B5FLJ}, #Gongyu Wang + # { name: "gemma3_4b", runner-label: "N300", performance: true, cmd: run_gemma3_4b_func, owner_id: }, # TODO Owner ID needs to be updated + ] name: ${{ matrix.test-group.name }}-${{ matrix.test-group.runner-label }}-${{ (matrix.test-group.performance && 'perf') || 'func' }} env: diff --git a/models/tt_transformers/PERF.md b/models/tt_transformers/PERF.md index 0a44dba88a60..279c62be81c4 100644 --- a/models/tt_transformers/PERF.md +++ b/models/tt_transformers/PERF.md @@ -45,6 +45,8 @@ This configuration uses bfp4 MLP and bfp8 attention weights for all models excep | Mistral-7B | N150 | 95 | 99 | 29.75 | 100.24 | | Mistral-7B | N300 | 95 | 99 | 47.01 | 65.95 | | Mistral-7B | T3K | 95 | 99 | 67.82 | 53.93 | +| gemma-3-4b | N150 | 67.0 | 80 | 28.00 | 81.00 | +| gemma-3-4b | N300 | 52.0 | 72.0 | 23.00 | 152 | ## Accuracy @@ -82,6 +84,8 @@ Llama 3 models test as insensitive to attention precision and so we use bfp8 att | Mistral-7B | N150 | 95 | 99 | 29.75 | 100.24 | | Mistral-7B | N300 | 95 | 99 | 47.01 | 65.95 | | Mistral-7B | T3K | 95 | 99 | 67.82 | 53.93 | +| gemma-3-4b | N150 | 67.0 | 80 | 28.00 | 81.00 | +| gemma-3-4b | N300 | 52.0 | 72.0 | 23.00 | 152 | ## Long-context (64K Tokens) diff --git a/models/tt_transformers/demo/simple_text_demo.py b/models/tt_transformers/demo/simple_text_demo.py index 95c4f013a244..12b35858ad91 100644 --- 
a/models/tt_transformers/demo/simple_text_demo.py +++ b/models/tt_transformers/demo/simple_text_demo.py @@ -945,7 +945,15 @@ def test_demo_text( ) # Benchmark targets - supported_models = ["Llama-3.2-1B", "Llama-3.2-3B", "Llama-3.1-8B", "Llama-3.2-11B", "Llama-3.1-70B", "Mistral-7B"] + supported_models = [ + "Llama-3.2-1B", + "Llama-3.2-3B", + "Llama-3.1-8B", + "Llama-3.2-11B", + "Llama-3.1-70B", + "Mistral-7B", + "gemma-3-4b", + ] supported_devices = ["N150", "P100", "P150", "P300", "N300", "P150x4", "T3K", "TG"] tt_device_name = determine_device_name(mesh_device) # submesh device should not decide performance target @@ -994,6 +1002,9 @@ def test_demo_text( "N300_Mistral-7B": 38, # TODO Update target "T3K_Mistral-7B": 45, # TODO Update target "TG_Mistral-7B": 45, # TODO Update target + # + "N150_gemma-3-4b": 23, + "N300_gemma-3-4b": 38, # TODO Update target } if model_device_key in dict_target_decode_tok_s_u: target_decode_tok_s_u = dict_target_decode_tok_s_u[model_device_key] @@ -1075,6 +1086,7 @@ def test_demo_text( # "T3K_Qwen2.5-Coder-32B": 180, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24754) # "T3K_Qwen2.5-72B": 211, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24754) # "T3K_Qwen3-32B": 250, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24754) + "N150_gemma-3-4b": 100, # TODO Update target } ci_target_decode_tok_s_u = { # N150 targets - higher is better @@ -1082,8 +1094,10 @@ def test_demo_text( "N150_Llama-3.2-3B": 35, "N150_Llama-3.1-8B": 21, "N150_Mistral-7B": 23, + "N150_gemma-3-4b": 23, # TODO Update target # N300 targets "N300_Qwen2.5-7B": 20, + "N300_gemma-3-4b": 20, # TODO Update target # T3K targets # "T3K_Llama-3.1-70B": 16, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24303) # "T3K_Qwen2.5-72B": 13, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24303) diff --git 
a/models/tt_transformers/demo/simple_vision_demo.py b/models/tt_transformers/demo/simple_vision_demo.py index bcc27ce9c474..8c31cddf42c8 100644 --- a/models/tt_transformers/demo/simple_vision_demo.py +++ b/models/tt_transformers/demo/simple_vision_demo.py @@ -480,12 +480,14 @@ def test_multimodal_demo_text( "N300_Llama-3.2-11B": 23.5, "T3K_Llama-3.2-11B": 21.5, "T3K_Llama-3.2-90B": 3, + "N300_gemma-3-4b": 390, }[f"{tt_device_name}_{base_model_name}"] target_decode_tok_s_u = { "N300_Llama-3.2-11B": 21.5, "T3K_Llama-3.2-11B": 37, "T3K_Llama-3.2-90B": 6, + "N300_gemma-3-4b": 24, }[f"{tt_device_name}_{base_model_name}"] target_decode_tok_s = target_decode_tok_s_u * max_batch_size diff --git a/models/tt_transformers/tests/reference_outputs/gemma-3-4b-it.refpt b/models/tt_transformers/tests/reference_outputs/gemma-3-4b-it.refpt new file mode 100644 index 000000000000..fc22f76e3488 Binary files /dev/null and b/models/tt_transformers/tests/reference_outputs/gemma-3-4b-it.refpt differ diff --git a/models/tt_transformers/tests/test_accuracy.py b/models/tt_transformers/tests/test_accuracy.py index 78de89940988..934f53f67399 100644 --- a/models/tt_transformers/tests/test_accuracy.py +++ b/models/tt_transformers/tests/test_accuracy.py @@ -245,6 +245,19 @@ def test_tt_model_acc( theta=model_args.rope_theta, rope_scaling=model_args.rope_scaling, ) + + if model_args.rope_local_theta is not None: + # If local theta is set, use it to compute the local rope matrices + rot_mats_local = get_rot_mats( + head_dim=model_args.head_dim, + device=mesh_device, + seq_len=prefill_lens[0], + theta=model_args.rope_local_theta, + rope_scaling=None, + ) + else: + rot_mats_local = None + prefill_input = model_args.prepare_residual_tensor_prefill( pt_prefill_input[batch_id], ) @@ -252,7 +265,7 @@ def test_tt_model_acc( tt_out = tt_model( prefill_input, current_pos=None, - rot_mats=rot_mats_prefill, + rot_mats=[rot_mats_prefill, rot_mats_local], user_id=batch_id, mode="prefill", 
page_table=page_table_tt, @@ -280,7 +293,7 @@ def test_tt_model_acc( # Get cos/sin matrices for the current position of each user rot_mats = tt_model.rope_setup.get_rot_mats(current_pos) - + rot_mats_local = None if tt_model.rope_setup_local is None else tt_model.rope_setup_local.get_rot_mats(current_pos) # Print table header if use_reference_file: logger.info(f"{'Progress':<15}{'Correct':<8}{'True':<15}{'Actual':<15}{'Top 5 Predictions':<75}") @@ -310,7 +323,7 @@ def test_tt_model_acc( tt_out = tt_model( decode_input, current_pos_tensor, - rot_mats=rot_mats, + rot_mats=[rot_mats, rot_mats_local], mode="decode", page_table=page_table_tt, ) @@ -351,7 +364,9 @@ def test_tt_model_acc( # Update rot_mats for next iteration current_pos += 1 rot_mats = tt_model.rope_setup.get_rot_mats(current_pos) - + rot_mats_local = ( + tt_model.rope_setup_local.get_rot_mats(current_pos) if tt_model.rope_setup_local is not None else None + ) # Modify the accuracy checking section when using reference text if not use_reference_file: # Get probabilities from model output diff --git a/tests/scripts/single_card/run_single_card_demo_tests.sh b/tests/scripts/single_card/run_single_card_demo_tests.sh index 59f6ee8b82df..3bb997dd46f3 100755 --- a/tests/scripts/single_card/run_single_card_demo_tests.sh +++ b/tests/scripts/single_card/run_single_card_demo_tests.sh @@ -21,6 +21,14 @@ run_qwen7b_func() { } + +run_gemma3_4b_func() { + + HF_MODEL=google/gemma-3-4b-it MESH_DEVICE=N300 pytest -n auto models/tt_transformers/demo/simple_text_demo.py -k performance-ci-1 --timeout 1800 + +} + + run_qwen25_vl_func() { fail=0