9 changes: 7 additions & 2 deletions models/common/rmsnorm.py
@@ -85,7 +85,7 @@ def __init__(
torch_weight,
device=device,
dtype=weight_dtype,
layout=ttnn.ROW_MAJOR_LAYOUT,
layout=ttnn.TILE_LAYOUT,
memory_config=weight_memory_config,
cache_file_name=cache_name,
mesh_mapper=ttnn.ReplicateTensorToMesh(device) if is_mesh_device else None,
@@ -96,7 +96,7 @@ def __init__(
torch_weight,
device=device,
dtype=weight_dtype,
layout=ttnn.ROW_MAJOR_LAYOUT,
layout=ttnn.TILE_LAYOUT,
memory_config=weight_memory_config,
cache_file_name=cache_name,
mesh_mapper=ttnn.ShardTensor2dMesh(device, dims=(None, 2), mesh_shape=list(device.shape))
@@ -128,6 +128,11 @@ def forward(self, x: ttnn.Tensor, mode, in_sharded=False, out_sharded=False) ->
else:
assert not out_sharded, "Non-sharded version of RMSNorm cannot output a sharded tensor"

if x.shape[-1] % weight.shape[-1] == 0:
# Reshape weight only if x's last dimension is divisible by weight's last dimension,
# to avoid padding errors in RMSNorm when dimensions are not aligned
weight = ttnn.reshape(weight, [1, 1, 1, -1])

x = norm(
x,
epsilon=self.eps,
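Note: for reference, a minimal torch-only sketch of the conditional weight reshape introduced above. This is illustrative only; rmsnorm_reference is a hypothetical helper, not the ttnn implementation, and it assumes the weight length matches x's last dimension so the broadcast succeeds.

import torch

def rmsnorm_reference(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Mirror the guard from the diff: only reshape the weight for broadcasting
    # when x's last dimension is divisible by the weight's last dimension.
    if x.shape[-1] % weight.shape[-1] == 0:
        weight = weight.reshape(1, 1, 1, -1)
    # RMSNorm: divide x by the RMS of its last dimension, then apply the weight.
    rms = torch.sqrt(torch.mean(x * x, dim=-1, keepdim=True) + eps)
    return (x / rms) * weight

# Example: a [1, 1, 32, 4096] activation with a 4096-element weight vector.
x = torch.randn(1, 1, 32, 4096)
w = torch.ones(4096)
y = rmsnorm_reference(x, w)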
93 changes: 70 additions & 23 deletions models/tt_transformers/demo/simple_vision_demo.py
@@ -27,7 +27,9 @@
import ttnn
from models.demos.utils.llm_demo_utils import create_benchmark_data, verify_perf
from models.perf.benchmarking_utils import BenchmarkProfiler
from models.tt_transformers.tt.common import hf_multimodal_encode
from models.tt_transformers.tt.generator import Generator
from models.tt_transformers.tt.model_config import CheckpointType


def get_batch_sampler(temperature, top_p, tokenizer):
@@ -61,6 +63,7 @@ def create_multimodal_model(
checkpoint=None,
):
from models.tt_transformers.tt.model_config import ModelArgs
from models.tt_transformers.tt.multimodal.gemma.gemma_e2e_model import TtGemmaModel
from models.tt_transformers.tt.multimodal.llama_vision_model import CrossAttentionTransformer

tt_model_args = ModelArgs(mesh_device, max_batch_size=max_batch_size)
@@ -76,14 +79,26 @@

if checkpoint is None:
checkpoint = tt_model_args.load_state_dict()
model = CrossAttentionTransformer(
mesh_device,
state_dict=checkpoint,
weight_cache_path=tt_model_args.weight_cache_path(dtype),
dtype=dtype,
configuration=tt_model_args,
use_paged_kv_cache=use_paged_kv_cache,
)
print(f"Loaded checkpoint for {tt_model_args.base_model_name} with {checkpoint.keys()} keys")

if tt_model_args.base_model_name == "gemma-3-4b":
model = TtGemmaModel(
mesh_device=mesh_device,
state_dict=checkpoint,
weight_cache_path=tt_model_args.weight_cache_path(ttnn.bfloat8_b),
dtype=ttnn.bfloat8_b,
args=tt_model_args,
use_paged_kv_cache=use_paged_kv_cache,
)
else:
model = CrossAttentionTransformer(
mesh_device,
state_dict=checkpoint,
weight_cache_path=tt_model_args.weight_cache_path(dtype),
dtype=dtype,
configuration=tt_model_args,
use_paged_kv_cache=use_paged_kv_cache,
)
return tt_model_args, model, checkpoint


@@ -128,7 +143,7 @@ def prepare_generator_args(
)
@pytest.mark.parametrize(
"test_type,max_seq_len",
(("normal", 512),),
(("normal", 2048),),
ids=["normal"],
)
@pytest.mark.parametrize(
@@ -148,7 +163,9 @@ def prepare_generator_args(
# 4,
],
)
@pytest.mark.parametrize("device_params", [{"trace_region_size": 14951424, "num_command_queues": 2}], indirect=True)
@pytest.mark.parametrize(
"device_params", [{"trace_region_size": 14951424, "num_command_queues": 2, "l1_small_size": 24576}], indirect=True
)
def test_multimodal_demo_text(
mesh_device,
warmup_iters,
@@ -172,9 +189,6 @@ def test_multimodal_demo_text(
profiler = BenchmarkProfiler()
profiler.start("run")

ckpt_dir = os.environ["LLAMA_DIR"]
tokenizer_path = str(Path(ckpt_dir) / "tokenizer.model")

num_devices = mesh_device.get_num_devices() if isinstance(mesh_device, ttnn.MeshDevice) else 1
max_batch_size *= data_parallel # input batch_size is interpreted as size per DP group

@@ -185,11 +199,26 @@ def test_multimodal_demo_text(
max_batch_size=max_batch_size,
max_seq_len=max_seq_len,
)

HF_MODEL = model_args[0].checkpoint_type == CheckpointType.HuggingFace

if not HF_MODEL:
ckpt_dir = os.environ["LLAMA_DIR"]
tokenizer_path = str(Path(ckpt_dir) / "tokenizer.model")

tokenizer = Tokenizer(model_path=tokenizer_path)
formatter = ChatFormat(tokenizer)
else:
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(model_args[0].CKPT_DIR)

generator = Generator(model, model_args, mesh_device)
tokenizer = Tokenizer(model_path=tokenizer_path)
formatter = ChatFormat(tokenizer)

xattn_caches = [model.setup_cache(model_args[i].max_batch_size) for i, model in enumerate(generator.model)]
xattn_caches = [
model.setup_cache(model_args[i].max_batch_size) if not HF_MODEL else None
for i, model in enumerate(generator.model)
]

# Create random images for trace capture with specific dimensions
trace_img_560x560 = create_random_image(560, 560)
@@ -250,10 +279,12 @@ def test_multimodal_demo_text(
total_users = len(dialogs)
num_batches = total_users // max_batch_size

sampler = get_batch_sampler(temperature, top_p, tokenizer)
sampler = get_batch_sampler(temperature, top_p, model_args[0].tokenizer)
_num_prefill_tokens = 0
_num_decode_tokens = 0

prompt_encoder = hf_multimodal_encode if HF_MODEL else formatter.encode_dialog_prompt

for iter_num in range(warmup_iters + 1):
logger.info(f"Iteration {iter_num}")
current_dialogs = trace_dialogs + dialogs
@@ -263,9 +294,14 @@ def test_multimodal_demo_text(
for msg in dialog:
print(f"{msg.role.capitalize()}: {msg.content}\n")
batch_model_input = [
formatter.encode_dialog_prompt(dialog, tool_prompt_format=False) for dialog in batch_dialogs
prompt_encoder(dialog, processor) if HF_MODEL else prompt_encoder(dialog, tool_prompt_format=False)
for dialog in batch_dialogs
]

if HF_MODEL:
# Use the processor's tokenizer instead of model_args tokenizer to ensure consistency
tokenizer = processor.tokenizer

# Do initial prefill
vision_images = [
model_input.vision.images if model_input.vision else None for model_input in batch_model_input
@@ -278,7 +314,8 @@ def test_multimodal_demo_text(
total_lens = prefill_lens + max_gen_len

# Create padded tokens tensor for batch
pad_id = tokenizer.pad_id
stop_tokens = model_args[0].tokenizer.stop_tokens
pad_id = tokenizer.pad_token_id if HF_MODEL else tokenizer.pad_id
bsz = len(prompt_tokens)
tokens = torch.full((bsz, max(total_lens)), pad_id, dtype=torch.long)

@@ -358,19 +395,29 @@ def test_multimodal_demo_text(
profiler.end(f"compile_decode", iteration=batch_idx)

# Disable checking for eot until I have more robust code for batch > 1
# if text in ["<|eot_id|>", "<|eom_id|>"]:
# break
if HF_MODEL:
if next_tokens in stop_tokens:
break
else:
# Disable checking for eot until I have more robust code for batch > 1
pass
# if text in ["<|eot_id|>", "<|eom_id|>"]:
# break
_num_decode_tokens += (
gen_idx * max_batch_size
) # gen_idx is (num_tokens - 1) to avoid counting compile iter

# Log full text output for each user in batch
vision_tokens = [tokenizer.special_tokens["<|image|>"], 128256]
if HF_MODEL:
# For HF models, get vision tokens from the processor if they exist
vision_tokens = []
else:
vision_tokens = [tokenizer.special_tokens["<|image|>"], 128256]

for user_id in range(max_batch_size):
# Remove <|image|> tokens since they break the tokenizer
tokens_out = [
t if t not in vision_tokens else tokenizer.pad_id
t if t not in vision_tokens else pad_id
for t in tokens[user_id].tolist()[: position_id[user_id] + 2]
]
text = tokenizer.decode(tokens_out)
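Note: the prompt-encoding branch above can be summarized with a small dispatcher. The sketch below is illustrative only; encode_batch is a hypothetical helper, not part of the demo, and it reuses the names and call signatures shown in the diff.

from models.tt_transformers.tt.common import hf_multimodal_encode

def encode_batch(batch_dialogs, hf_model, processor=None, formatter=None):
    # HuggingFace checkpoints go through the processor-based encoder added in this PR;
    # Meta checkpoints keep the original Llama ChatFormat path.
    if hf_model:
        return [hf_multimodal_encode(dialog, processor) for dialog in batch_dialogs]
    return [formatter.encode_dialog_prompt(dialog, tool_prompt_format=False) for dialog in batch_dialogs]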
100 changes: 100 additions & 0 deletions models/tt_transformers/tests/multimodal/gemma/test_mmp.py
@@ -0,0 +1,100 @@
# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

import os

import pytest
import torch
from loguru import logger

import ttnn
from models.tt_transformers.tt.model_config import ModelArgs
from models.tt_transformers.tt.multimodal.gemma.multi_modal_projector import TtGemma3MultiModalProjector
from models.utility_functions import comp_allclose, comp_pcc, skip_for_grayskull


@torch.no_grad()
@skip_for_grayskull("Requires wormhole_b0 to run")
@pytest.mark.parametrize(
"device",
[
{"N150": (1, 1), "N300": (1, 2), "T3K": (1, 8), "TG": (8, 4)}.get(
os.environ.get("device"), len(ttnn.get_device_ids())
)
],
indirect=True,
)
@pytest.mark.parametrize(
"seq_len",
(1152,),
)
@pytest.mark.parametrize(
"batch_size",
(1,),
)
@pytest.mark.parametrize("device_params", [{"l1_small_size": 24576}], indirect=True)
def test_multi_modal_inference(seq_len, batch_size, reset_seeds, device):
print("device:", device)
dtype = ttnn.bfloat16
mode = "decode" if seq_len <= 32 else "prefill"

tt_model_args = ModelArgs(
device,
max_batch_size=batch_size,
max_seq_len=128,
)

tt_model_args.n_layers = 1
state_dict = tt_model_args.load_state_dict()

reference_model = tt_model_args.reference_vision_multi_modal()
# first_layer_prefix = "multi_modal_projector."

# partial_state_dict = {
# k[len(first_layer_prefix) :]: v for k, v in state_dict.items() if (k.startswith(first_layer_prefix))
# }

# reference_model.load_state_dict(partial_state_dict)

# create input tensor for multi_modal_projector layer
patches_per_image = 64
num_patches = patches_per_image * patches_per_image
input = torch.randn((batch_size, num_patches, seq_len))
reference_output = reference_model(input)

# DistributedNorm inputs are fractured across devices and interleaved in DRAM (for prefill) and L1 (for decode)
tt_input = ttnn.from_torch(
input,
device=device,
dtype=dtype,
layout=ttnn.TILE_LAYOUT,
mesh_mapper=ttnn.ShardTensor2dMesh(device, dims=(None, -1), mesh_shape=tt_model_args.cluster_shape),
# memory_config=(
# tt_model_args.get_model_config()["DECODE_RESIDUAL_MEMCFG"] if mode == "decode" else ttnn.DRAM_MEMORY_CONFIG
# ),
memory_config=ttnn.DRAM_MEMORY_CONFIG,
)

tt_model = TtGemma3MultiModalProjector(
mesh_device=device,
state_dict=state_dict,
state_dict_prefix="model.multi_modal_projector",
image_size=tt_model_args.vision_chunk_size,
patch_size=tt_model_args.vision_patch_size,
hidden_size=tt_model_args.vision_hidden_dim,
mm_tokens_per_image=tt_model_args.mm_tokens_per_image,
weight_cache_path=tt_model_args.weight_cache_path(dtype),
layer_norm_eps=1e-06, # layer_norm_eps
dtype=dtype,
configuration=tt_model_args,
)
tt_output = tt_model(tt_input)

tt_output_torch = ttnn.to_torch(tt_output).squeeze(0)
pcc_required = 0.9999
passing, pcc_message = comp_pcc(reference_output, tt_output_torch, pcc_required)

logger.info(comp_allclose(reference_output, tt_output_torch))
logger.info(f"PCC: {pcc_message}")
assert passing, f"PCC value is lower than {pcc_required} for some of the outputs. Check Warnings!"
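Note: the PCC gate at the end of the test is essentially a Pearson correlation between the reference and device outputs. Below is a minimal torch sketch of that check; pearson_cc is a hypothetical helper, and the repo's comp_pcc additionally handles dtype and NaN edge cases that this illustration skips.

import torch

def pearson_cc(golden: torch.Tensor, actual: torch.Tensor) -> float:
    # Flatten both tensors and compute the Pearson correlation coefficient in float64.
    g = golden.flatten().to(torch.float64)
    a = actual.flatten().to(torch.float64)
    g = g - g.mean()
    a = a - a.mean()
    return float((g @ a) / (g.norm() * a.norm() + 1e-12))

# Example gate analogous to the assert in the test:
# assert pearson_cc(reference_output, tt_output_torch) >= 0.9999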