diff --git a/.github/workflows/single-card-demo-tests-impl.yaml b/.github/workflows/single-card-demo-tests-impl.yaml index 92c115ab9c96..c5782e2947b7 100644 --- a/.github/workflows/single-card-demo-tests-impl.yaml +++ b/.github/workflows/single-card-demo-tests-impl.yaml @@ -88,6 +88,7 @@ jobs: # # Moved to t3k tests until OOM on single card runners resolved # { name: "qwen7b", runner-label: "N300", performance: false, cmd: run_qwen7b_func, owner_id: U03PUAKE719}, # Mark O'Connor { name: "qwen25_vl", runner-label: "N300", performance: false, cmd: run_qwen25_vl_func, owner_id: U07RY6B5FLJ}, #Gongyu Wang + # { name: "gemma3_1b", runner-label: "N150", performance: false, cmd: run_gemma3_1b_func, owner_id:}, # TODO Owner ID needs to be updated ] name: ${{ matrix.test-group.name }}-${{ matrix.test-group.runner-label }}-${{ (matrix.test-group.performance && 'perf') || 'func' }} env: diff --git a/models/tt_transformers/PERF.md b/models/tt_transformers/PERF.md index 737ef759fdbb..e301d5b3a7b3 100644 --- a/models/tt_transformers/PERF.md +++ b/models/tt_transformers/PERF.md @@ -45,6 +45,7 @@ This configuration uses bfp4 MLP and bfp8 attention weights for all models excep | Mistral-7B | N150 | 95 | 99 | 29.75 | 100.24 | | Mistral-7B | N300 | 95 | 99 | 47.01 | 65.95 | | Mistral-7B | T3K | 95 | 99 | 67.82 | 53.93 | +| gemma-3-1b | N150 | 30 | 40 | 40.00 | 40.00 | ## Accuracy @@ -82,6 +83,7 @@ Llama 3 models test as insensitive to attention precision and so we use bfp8 att | Mistral-7B | N150 | 95 | 99 | 29.75 | 100.24 | | Mistral-7B | N300 | 95 | 99 | 47.01 | 65.95 | | Mistral-7B | T3K | 95 | 99 | 67.82 | 53.93 | +| gemma-3-1b | N150 | 30 | 40 | 40.00 | 40.00 | ## Long-context (64K Tokens) diff --git a/models/tt_transformers/demo/simple_text_demo.py b/models/tt_transformers/demo/simple_text_demo.py index c58504228ef4..b8326b2861c9 100644 --- a/models/tt_transformers/demo/simple_text_demo.py +++ b/models/tt_transformers/demo/simple_text_demo.py @@ -945,7 +945,15 @@ def 
test_demo_text( ) # Benchmark targets - supported_models = ["Llama-3.2-1B", "Llama-3.2-3B", "Llama-3.1-8B", "Llama-3.2-11B", "Llama-3.1-70B", "Mistral-7B"] + supported_models = [ + "Llama-3.2-1B", + "Llama-3.2-3B", + "Llama-3.1-8B", + "Llama-3.2-11B", + "Llama-3.1-70B", + "Mistral-7B", + "gemma-3-1b", + ] supported_devices = ["N150", "P100", "P150", "P300", "N300", "P150x4", "T3K", "TG"] tt_device_name = determine_device_name(mesh_device) # submesh device should not decide performance target @@ -994,6 +1002,8 @@ def test_demo_text( "N300_Mistral-7B": 38, # TODO Update target "T3K_Mistral-7B": 45, # TODO Update target "TG_Mistral-7B": 45, # TODO Update target + # + "N150_gemma-3-1b": 25, } if model_device_key in dict_target_decode_tok_s_u: target_decode_tok_s_u = dict_target_decode_tok_s_u[model_device_key] @@ -1075,6 +1085,7 @@ def test_demo_text( # "T3K_Qwen2.5-Coder-32B": 180, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24754) # "T3K_Qwen2.5-72B": 211, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24754) # "T3K_Qwen3-32B": 250, # too much variability in CI (https://github.com/tenstorrent/tt-metal/issues/24754) + "N150_gemma-3-1b": 100, } ci_target_decode_tok_s_u = { # N150 targets - higher is better @@ -1082,6 +1093,7 @@ def test_demo_text( "N150_Llama-3.2-3B": 35, "N150_Llama-3.1-8B": 21, "N150_Mistral-7B": 23, + "N150_gemma-3-1b": 25, # N300 targets "N300_Qwen2.5-7B": 20, # T3K targets diff --git a/models/tt_transformers/tests/reference_outputs/gemma-3-1b-it.refpt b/models/tt_transformers/tests/reference_outputs/gemma-3-1b-it.refpt new file mode 100644 index 000000000000..d57fab334f4b Binary files /dev/null and b/models/tt_transformers/tests/reference_outputs/gemma-3-1b-it.refpt differ diff --git a/models/tt_transformers/tests/test_accuracy.py b/models/tt_transformers/tests/test_accuracy.py index 78de89940988..2515dab97864 100644 --- a/models/tt_transformers/tests/test_accuracy.py +++ 
b/models/tt_transformers/tests/test_accuracy.py @@ -245,6 +245,19 @@ def test_tt_model_acc( theta=model_args.rope_theta, rope_scaling=model_args.rope_scaling, ) + + if model_args.rope_local_theta is not None: + rope_setup_prefill_local = get_prefill_rot_mat( + model_args.head_dim, + mesh_device, + prefill_lens[0], + model_args.rope_local_theta, + model_args.rope_scaling_factor, + model_args.orig_context_len, + ) + else: + rope_setup_prefill_local = None + prefill_input = model_args.prepare_residual_tensor_prefill( pt_prefill_input[batch_id], ) @@ -252,7 +265,7 @@ def test_tt_model_acc( tt_out = tt_model( prefill_input, current_pos=None, - rot_mats=rot_mats_prefill, + rot_mats=[rot_mats_prefill, rope_setup_prefill_local], user_id=batch_id, mode="prefill", page_table=page_table_tt, @@ -280,6 +293,7 @@ def test_tt_model_acc( # Get cos/sin matrices for the current position of each user rot_mats = tt_model.rope_setup.get_rot_mats(current_pos) + rot_mats_local = None if tt_model.rope_setup_local is None else tt_model.rope_setup_local.get_rot_mats(current_pos) # Print table header if use_reference_file: @@ -310,7 +324,7 @@ def test_tt_model_acc( tt_out = tt_model( decode_input, current_pos_tensor, - rot_mats=rot_mats, + rot_mats=[rot_mats, rot_mats_local], mode="decode", page_table=page_table_tt, ) @@ -351,6 +365,9 @@ def test_tt_model_acc( # Update rot_mats for next iteration current_pos += 1 rot_mats = tt_model.rope_setup.get_rot_mats(current_pos) + rot_mats_local = ( + None if tt_model.rope_setup_local is None else tt_model.rope_setup_local.get_rot_mats(current_pos) + ) # Modify the accuracy checking section when using reference text if not use_reference_file: diff --git a/tests/scripts/single_card/run_single_card_demo_tests.sh b/tests/scripts/single_card/run_single_card_demo_tests.sh index a058614bb532..e75b77457c2f 100755 --- a/tests/scripts/single_card/run_single_card_demo_tests.sh +++ b/tests/scripts/single_card/run_single_card_demo_tests.sh @@ -49,6 +49,13 @@ 
run_qwen25_vl_func() {
   fi
 }
 
+
+run_gemma3_1b_func(){
+  # Gemma-3 1B text demo (HF_MODEL=google/gemma-3-1b-it) with MESH_DEVICE=N150: runs only the simple_text_demo.py tests selected by -k performance-ci-1.
+  MESH_DEVICE=N150 HF_MODEL=google/gemma-3-1b-it WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/tt_transformers/demo/simple_text_demo.py -k performance-ci-1 --timeout 1800
+  # NOTE(review): unlike run_segformer_func below, the pytest exit status is not added to `fail` -- confirm this is intended.
+}
+
 run_segformer_func() {
   #Segformer Segmentation Demo
   WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest --disable-warnings models/demos/segformer/demo/demo_for_semantic_segmentation.py --timeout 600; fail+=$?