Commit c079a67

Rebase Gemma-3-4b-it
1 parent 304ad10 commit c079a67

21 files changed: +138 -71 lines changed

models/experimental/gemma3_4b/tests/test_attention.py

Lines changed: 3 additions & 4 deletions
@@ -92,8 +92,7 @@ def test_attention_inference(
         model_args.head_dim,
         model_args.max_seq_len,
         model_args.rope_theta,
-        model_args.rope_scaling_factor,
-        model_args.orig_context_len,
+        model_args.rope_scaling,
     )

     transformation_mats = rope_setup.get_both_trans_mats()
@@ -141,8 +140,8 @@ def test_attention_inference(
         model_args.head_dim,
         model_args.max_seq_len * 2,
         model_args.rope_theta,
-        model_args.rope_scaling_factor,
-        model_args.orig_context_len,
+        model_args.rope_scaling.factor if model_args.rope_scaling else None,
+        model_args.rope_scaling.original_max_position_embeddings if model_args.rope_scaling else None,
     )
     freqs_cis = torch.complex(cos, sin)

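For reference, the rope_scaling argument threaded through the two hunks above replaces the former rope_scaling_factor / orig_context_len pair with a single config object. A minimal sketch of what such an object could look like, assuming a simple dataclass exposing the two attributes the test reads (factor and original_max_position_embeddings); this is illustrative, not the repository's actual definition:

from dataclasses import dataclass
from typing import Optional


# Hypothetical stand-in for the rope_scaling object passed above; the real
# type comes from the model args in tt_transformers.
@dataclass
class RopeScaling:
    factor: float
    original_max_position_embeddings: int


def rope_scaling_args(rope_scaling: Optional[RopeScaling]):
    # Mirrors the call-site pattern in the second hunk: fall back to None
    # when no RoPE scaling is configured.
    factor = rope_scaling.factor if rope_scaling else None
    orig_context_len = rope_scaling.original_max_position_embeddings if rope_scaling else None
    return factor, orig_context_len


# Example: a scaled setup with an 8192-token original context window.
print(rope_scaling_args(RopeScaling(factor=8.0, original_max_position_embeddings=8192)))  # (8.0, 8192)
print(rope_scaling_args(None))  # (None, None)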
models/experimental/gemma3_4b/tests/test_decoder.py

Lines changed: 1 addition & 2 deletions
@@ -82,8 +82,7 @@ def test_decoder_inference(
         model_args.head_dim,
         model_args.max_seq_len,
         model_args.rope_theta,
-        model_args.rope_scaling_factor,
-        model_args.orig_context_len,
+        model_args.rope_scaling,
     )
     transformation_mats = rope_setup.get_both_trans_mats()

models/experimental/gemma3_4b/tests/test_mlp.py

Lines changed: 4 additions & 1 deletion
@@ -47,10 +47,13 @@ def test_mlp_inference(seq_len, batch_size, reset_seeds, device):
     state_dict = tt_model_args.load_state_dict()

     # # Ref model needs partial state dict, but our models use full state dict keys as cached weight names
-    first_layer_prefix = "layers.0.feed_forward"
+    # first_layer_prefix = "layers.0.feed_forward"
+    first_layer_prefix = tt_model_args.get_state_dict_prefix("MLP", 0)
+
     partial_state_dict = {
         k[len(first_layer_prefix) + 1 :]: v for k, v in state_dict.items() if (k.startswith(first_layer_prefix))
     }
+
     reference_model = tt_model_args.reference_mlp()  # Gemma3 MLP
     reference_model.load_state_dict(partial_state_dict)

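The test above strips a per-layer prefix from the full checkpoint so the reference MLP can load only its own weights. A minimal standalone sketch of that prefix-stripping pattern; the keys here are made up for illustration, and the real prefix comes from tt_model_args.get_state_dict_prefix("MLP", 0):

# Toy state dict; keys mimic the full-checkpoint naming the comment above refers to.
state_dict = {
    "layers.0.feed_forward.w1.weight": "w1_layer0",
    "layers.0.feed_forward.w2.weight": "w2_layer0",
    "layers.1.feed_forward.w1.weight": "w1_layer1",  # different layer, filtered out
}
first_layer_prefix = "layers.0.feed_forward"  # placeholder for get_state_dict_prefix("MLP", 0)

# Keep only keys under the prefix and drop "<prefix>." so the reference module
# can load them with its own (unprefixed) parameter names.
partial_state_dict = {
    k[len(first_layer_prefix) + 1 :]: v for k, v in state_dict.items() if k.startswith(first_layer_prefix)
}
assert partial_state_dict == {"w1.weight": "w1_layer0", "w2.weight": "w2_layer0"}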
models/experimental/gemma3_4b/tests/vision_tests/test_end2end.py

Lines changed: 13 additions & 13 deletions
@@ -169,18 +169,18 @@ def setup_vision_prompts_and_tokenizer(model_args, instruct):
         }
     ]

-    # messages = [
-    #     {
-    #         "role": "user",
-    #         "content": [
-    #             {
-    #                 "type": "image",
-    #                 "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
-    #             },
-    #             {"type": "text", "text": "Describe this image in detail."},
-    #         ],
-    #     }
-    # ]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
+                },
+                {"type": "text", "text": "Describe this image in detail."},
+            ],
+        }
+    ]

     tokenizer = model_args.tokenizer
     return messages, tokenizer
@@ -211,7 +211,7 @@ def process_real_vision_inputs(messages, model_args):
     ).to(dtype=torch.bfloat16)

     input_ids = encoded["input_ids"]
-    pixel_values = None
+    pixel_values = encoded["pixel_values"]
     attention_mask = encoded["attention_mask"]

     # logger.info(f"Processed vision inputs - input_ids: {input_ids.shape}, pixel_values: {pixel_values.shape}")

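The second hunk above now takes pixel_values from the processor output instead of hard-coding None. A minimal sketch of that flow, assuming a Hugging Face AutoProcessor for a Gemma 3 checkpoint (the checkpoint id and the exact template arguments are assumptions for illustration; the test builds its processor from model_args):

from transformers import AutoProcessor

# Assumed checkpoint id, used only for this sketch.
processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
            {"type": "text", "text": "Describe this image in detail."},
        ],
    }
]

encoded = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
)

input_ids = encoded["input_ids"]          # token ids, including image placeholder tokens
pixel_values = encoded["pixel_values"]    # preprocessed image tensor (previously hard-coded to None)
attention_mask = encoded["attention_mask"]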
models/experimental/gemma3_4b/tt/attention.py

Lines changed: 2 additions & 0 deletions
@@ -1,4 +1,6 @@
 """
+source: models/tt_transformers/tt/attention.py
+
 This is the attention implementation of the Gemma-3-4b-it

 We have re-used the Attention implementation of the TT-Transformers with few modifications.

models/experimental/gemma3_4b/tt/decoder.py

Lines changed: 2 additions & 0 deletions
@@ -1,4 +1,6 @@
 """
+source: models/tt_transformers/tt/decoder.py
+
 This is the Decoder block for the gemma 3-4b-it model
 We couldn't use the existing implementation in TT-Transformers because the usage of submodules is different

models/experimental/gemma3_4b/tt/gemma3_generator.py

Lines changed: 2 additions & 0 deletions
@@ -1,4 +1,6 @@
 """
+source: models/tt_transformers/tt/generator.py
+
 This is the Replica version of the Generator class for the Gemma Model.
 This adds support for kwargs that contains the procesed inputs and the vision submodule of the model.

models/experimental/gemma3_4b/tt/gemma_conv2d_patch.py

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 """
+source: models/tt_transformers/tt/multimodal/llama_conv2d_patch.py
 This is the Conv2dPath of Gemma-3-4b-it
 We have reused the exisiting Conv2dPath of TtLlamaConv2dPath with few modifications.
 We have added a check for weight to convert 4D to 2D

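The added check mentioned in the Conv2dPatch docstring (converting a 4D weight to 2D) corresponds to running the patch embedding as a matmul over flattened patches. A minimal sketch of the idea; shapes and names are illustrative, not the TT implementation:

import torch

# A ViT-style patch-embedding conv weight: (out_channels, in_channels, patch_h, patch_w).
conv_weight = torch.randn(1152, 3, 14, 14)

# If the weight arrives in 4D, flatten it to 2D so the patch embedding can be
# computed as a single linear projection of flattened patches.
if conv_weight.ndim == 4:
    linear_weight = conv_weight.reshape(conv_weight.shape[0], -1)  # (1152, 3 * 14 * 14)

patches = torch.randn(4096, 3 * 14 * 14)        # flattened image patches
patch_embeddings = patches @ linear_weight.T    # (4096, 1152)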
models/experimental/gemma3_4b/tt/gemma_image_attention.py

Lines changed: 2 additions & 0 deletions
@@ -1,4 +1,6 @@
 """
+source: models/tt_transformers/tt/multimodal/llama_image_attention.py
+
 This is the ImageAttention block for Gemma-3-4b-it
 We have reused the TTLlamaImageAttention with some modification.
 We have made the linears (Q,K,V) to be executed separately and added bias support for O_projection, along with few

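The ImageAttention note above mentions splitting the fused QKV projection into separate Q, K, V linears and adding bias to the output projection. A PyTorch-flavoured sketch of that structural difference; dimensions and module names are illustrative, not the TT implementation:

import torch
import torch.nn as nn

hidden_dim = 1152  # illustrative vision hidden size

# Fused variant: a single matmul produces Q, K and V together.
wqkv = nn.Linear(hidden_dim, 3 * hidden_dim, bias=True)

# Split variant, as described above: separate Q/K/V linears plus a biased O projection.
wq = nn.Linear(hidden_dim, hidden_dim, bias=True)
wk = nn.Linear(hidden_dim, hidden_dim, bias=True)
wv = nn.Linear(hidden_dim, hidden_dim, bias=True)
wo = nn.Linear(hidden_dim, hidden_dim, bias=True)  # output projection with bias support

x = torch.randn(1, 4096, hidden_dim)
q, k, v = wq(x), wk(x), wv(x)  # executed separately instead of chunking wqkv(x)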