@@ -102,6 +102,53 @@ def __init__(
             args,
             TG=args.is_galaxy,
         )
+        if f"layers.{layer_num}.pre_feedforward_layernorm.weight" in self.state_dict:
+            self.pre_ff_norm = DistributedNorm(  # pre_feedforward_layernorm
+                RMSNorm(
+                    device=mesh_device,
+                    dim=args.dim,
+                    eps=args.norm_eps,
+                    state_dict=state_dict,
+                    add_unit_offset=self.args.rms_norm_add_unit_offset,
+                    state_dict_prefix=args.get_state_dict_prefix("", layer_num),
+                    weight_cache_path=None if args.dummy_weights else weight_cache_path,
+                    weight_dtype=ttnn.bfloat16,
+                    weight_key="pre_feedforward_layernorm",
+                    is_distributed=self.args.is_distributed_norm,
+                    sharded_program_config=self.model_config["SHARDED_NORM_MLP_PRGM_CFG"],
+                    sharded_output_config=self.model_config["SHARDED_MLP_INPUT_MEMCFG"],
+                    ccl_topology=self.args.ccl_topology(),
+                ),
+                args,
+                TG=args.is_galaxy,
+            )
+        else:
+            # If pre_feedforward_layernorm is not in the state_dict, we do not use it
+            self.pre_ff_norm = None
+
+        if f"layers.{layer_num}.post_feedforward_layernorm.weight" in self.state_dict:
+            self.post_ff_norm = DistributedNorm(  # post_feedforward_layernorm
+                RMSNorm(
+                    device=mesh_device,
+                    dim=args.dim,
+                    eps=args.norm_eps,
+                    add_unit_offset=self.args.rms_norm_add_unit_offset,
+                    state_dict=state_dict,
+                    state_dict_prefix=args.get_state_dict_prefix("", layer_num),
+                    weight_cache_path=None if args.dummy_weights else weight_cache_path,
+                    weight_dtype=ttnn.bfloat16,
+                    weight_key="post_feedforward_layernorm",
+                    is_distributed=self.args.is_distributed_norm,
+                    sharded_program_config=self.model_config["SHARDED_NORM_MLP_PRGM_CFG"],
+                    sharded_output_config=self.model_config["SHARDED_MLP_INPUT_MEMCFG"],
+                    ccl_topology=self.args.ccl_topology(),
+                ),
+                args,
+                TG=args.is_galaxy,
+            )
+        else:
+            # If post_feedforward_layernorm is not in the state_dict, we do not use it
+            self.post_ff_norm = None

     def forward(
         self,
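Note on the hunk above: the layer only instantiates `pre_feedforward_layernorm` / `post_feedforward_layernorm` when the checkpoint actually ships those weights, so Llama-style checkpoints keep the original two-norm layout while Gemma-style "sandwich-norm" checkpoints get the extra norms. A minimal host-side sketch of that gating, using a plain PyTorch RMSNorm stand-in instead of the `DistributedNorm`/`RMSNorm` wrappers above (`SimpleRMSNorm`, `build_optional_ff_norms`, and the `state_dict` layout are illustrative assumptions, not the actual device code):

```python
import torch
from torch import nn


class SimpleRMSNorm(nn.Module):
    """Plain RMSNorm stand-in for the ttnn-backed norm used in the diff."""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        rms = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        return x * rms * self.weight


def build_optional_ff_norms(state_dict: dict, layer_num: int, dim: int):
    """Instantiate pre/post feedforward norms only if the checkpoint has them."""
    pre_key = f"layers.{layer_num}.pre_feedforward_layernorm.weight"
    post_key = f"layers.{layer_num}.post_feedforward_layernorm.weight"

    pre_ff_norm = SimpleRMSNorm(dim) if pre_key in state_dict else None
    post_ff_norm = SimpleRMSNorm(dim) if post_key in state_dict else None
    return pre_ff_norm, post_ff_norm
```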
@@ -116,6 +163,7 @@ def forward(
         kv_cache=None,
     ) -> ttnn.Tensor:
         TG = self.args.is_galaxy
+        residual = x
         # x is fractured across devices and interleaved in DRAM (for prefill) and sharded in L1 (for decode)
         skip_mem_cfg = self.model_config["DECODE_RESIDUAL_MEMCFG"] if mode == "decode" else ttnn.DRAM_MEMORY_CONFIG
         assert (
@@ -124,36 +172,53 @@ def forward(
         # Norms take fractured inputs and output replicated across devices
         attn_in = self.attention_norm(x, mode)
         # Attention takes replicated inputs and produces fractured outputs
+        if self.attention.is_sliding:
+            position_embeddings = rot_mats[1]
+        else:
+            position_embeddings = rot_mats[0]
+
         attn_out = self.attention.forward(
             attn_in,
             current_pos,
-            rot_mats,
+            position_embeddings,
             user_id,
             mode,
             page_table=page_table,
             chunk_page_table=chunk_page_table,
             chunk_start_idx=chunk_start_idx,
             kv_cache=kv_cache,
         )
-        # Here x and attn_out are both fractured across devices
-        h = ttnn.add(x, attn_out, memory_config=skip_mem_cfg, dtype=ttnn.bfloat16 if TG else None)
-        ttnn.deallocate(attn_out)
+        if self.pre_ff_norm is None:
+            attn_out = ttnn.add(x, attn_out, memory_config=skip_mem_cfg, dtype=ttnn.bfloat16 if TG else None)
+
+            residual = attn_out
+
+        hidden_states = self.ff_norm(attn_out, mode)
+        if self.pre_ff_norm is not None:
+            hidden_states = ttnn.add(hidden_states, residual, memory_config=skip_mem_cfg, dtype=ttnn.bfloat16)
+
+            residual = hidden_states
+
+            hidden_states = self.pre_ff_norm(hidden_states, mode)
+
         if mode == "prefill":
             x.deallocate(True)

-        # Norms take fractured inputs and output replicated across devices
-        ff_in = self.ff_norm(h, mode)
+        # ttnn.deallocate(attn_out)
+
         if TG and mode == "decode":
-            ff_in = ttnn.to_memory_config(ff_in, memory_config=self.model_config["MLP_ACT_MEMCFG"])
+            hidden_states = ttnn.to_memory_config(hidden_states, memory_config=self.model_config["MLP_ACT_MEMCFG"])
         # MLP takes replicated inputs and produces fractured outputs
-        ff_out = self.feed_forward.forward(ff_in, mode)
-        # ff_out and h are both fractured across devices
+        hidden_states = self.feed_forward.forward(hidden_states, mode)
         activation_dtype = self.model_config["DECODERS_OPTIMIZATIONS"].get_tensor_dtype(
             decoder_id=self.layer_num, tensor=TensorGroup.ACTIVATION
         )
+        if self.post_ff_norm is not None:
+            hidden_states = self.post_ff_norm(hidden_states, mode)
+
         out = ttnn.add(
-            h,
-            ff_out,
+            residual,
+            hidden_states,
             memory_config=skip_mem_cfg,
             dtype=self.args.ccl_dtype
             if TG and not self.args.is_distributed_norm(mode)
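Taken together, the forward-pass hunks change two things: sliding-window attention layers now pick up the local rotary tables (`rot_mats[1]`) while full-attention layers keep the global ones (`rot_mats[0]`), and the residual add moves depending on whether the extra norms exist. Without `pre_ff_norm` the residual is added right after attention as before; with it, `ff_norm` acts on the attention output alone, the residual is added afterwards, and `pre_ff_norm`/`post_ff_norm` bracket the MLP. A simplified control-flow sketch, with `attention`, `ff_norm`, `feed_forward`, etc. standing in for the ttnn modules and memory configs omitted (an illustration of the dataflow, not the device implementation):

```python
def decoder_forward_sketch(layer, x, rot_mats, mode):
    """Control-flow sketch of the residual stream after this change."""
    residual = x

    # Sliding-window layers use the local RoPE tables, full-attention layers the global ones.
    position_embeddings = rot_mats[1] if layer.attention.is_sliding else rot_mats[0]

    attn_out = layer.attention(layer.attention_norm(x, mode), position_embeddings, mode)

    if layer.pre_ff_norm is None:
        # Llama-style: residual add right after attention, single post-attention norm.
        attn_out = x + attn_out
        residual = attn_out

    hidden_states = layer.ff_norm(attn_out, mode)

    if layer.pre_ff_norm is not None:
        # Gemma-style sandwich: ff_norm normalizes attn_out alone, then the residual
        # is added, and pre_ff_norm re-normalizes before the MLP.
        hidden_states = hidden_states + residual
        residual = hidden_states
        hidden_states = layer.pre_ff_norm(hidden_states, mode)

    hidden_states = layer.feed_forward(hidden_states, mode)

    if layer.post_ff_norm is not None:
        hidden_states = layer.post_ff_norm(hidden_states, mode)

    return residual + hidden_states
```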