Commit 1994cf9

Add vLLM support for Gemma-3-4b-it
1 parent: 4c4201b

File tree (4 files changed: +141 -10 lines):

- models/tt_transformers/tt/generator.py
- models/tt_transformers/tt/generator_vllm.py
- models/tt_transformers/tt/multimodal/gemma/gemma_e2e_model.py
- models/tt_transformers/tt/multimodal/gemma/gemma_image_mlp.py

models/tt_transformers/tt/generator.py

Lines changed: 13 additions & 1 deletion
```diff
@@ -97,7 +97,7 @@ def prefill_forward_text(
             model_kv_cache = kv_cache[model_id] if kv_cache is not None else None

             # Check if 'pixel_values' exists and index it safely
-            if "pixel_values" in local_kwargs:
+            if "pixel_values" in local_kwargs and local_kwargs["pixel_values"] is not None:
                 local_kwargs["pixel_values"] = local_kwargs["pixel_values"][idx]
             if "image_grid_thw" in local_kwargs:
                 local_kwargs["image_grid_thw"] = local_kwargs["image_grid_thw"][idx]
@@ -413,6 +413,7 @@ def _prefill_forward_single_user(
         kv_cache=None,
         cross_page_table=None,
         model_id=-1,
+        **kwargs,
     ):
         """
         Performs vision encode step then text prefill.
@@ -434,6 +435,7 @@ def _prefill_forward_single_user(
             batch_masks=[vision_mask],
             total_len=total_len,
             prefill_len=prefill_len,
+            **kwargs,
         )

         if cross_page_table is not None:
@@ -467,6 +469,8 @@ def _prefill_forward_single_user(
             page_table=page_table,
             cross_page_table=cross_page_table,
             text_only_inference=text_only_inference,
+            vision_tokens=vision_tokens,
+            **kwargs,
         )

         tt_logits = self.model[model_id].ttnn_prefill_forward(
@@ -565,6 +569,7 @@ def prefill_forward_llama_vision(
         kv_cache=None,
         cross_page_table=None,
         empty_slots=None,
+        **kwargs,
     ):
         """
         Batched version of _prefill_forward_single_user for vision model.
@@ -600,6 +605,11 @@ def prefill_forward_llama_vision(
             model_kv_cache = kv_cache[model_id] if kv_cache is not None else None
             model_xattn_cache = xattn_caches[model_id] if xattn_caches is not None else None

+            # prefill_seq_len = get_padded_prefill_len(seq_len)
+            # tokens = torch.cat(
+            #     [tokens[idx : idx + 1, :seq_len], torch.zeros(1, prefill_seq_len - seq_len).long()], dim=-1
+            # )
+
             (
                 model_xattn_cache,
                 prefill_cross_attention_masks,
@@ -619,6 +629,8 @@ def prefill_forward_llama_vision(
                 kv_cache=model_kv_cache,
                 cross_page_table=user_cross_page_table,
                 model_id=model_id,
+                image_grid_thw=kwargs["image_grid_thw"][idx] if kwargs.get("image_grid_thw") else None,
+                input_ids=kwargs["input_ids"][idx] if kwargs.get("input_ids") else None,
            )

            if xattn_caches is not None:
```
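Why the guard in the first hunk matters: a text-only request can reach `prefill_forward_text` with a `pixel_values` key that is present but set to `None`, and indexing `None[idx]` raises a `TypeError`. A minimal sketch of the per-user slicing, with `local_kwargs` and `idx` as hypothetical stand-ins rather than the repo's exact loop:

```python
# Hypothetical per-user kwargs for a text-only request: the key exists
# but holds None, which is exactly the case the new guard handles.
local_kwargs = {"pixel_values": None, "image_grid_thw": None}
idx = 0

# The old check crashed on None[idx]; the new one passes None through.
if "pixel_values" in local_kwargs and local_kwargs["pixel_values"] is not None:
    local_kwargs["pixel_values"] = local_kwargs["pixel_values"][idx]
```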

models/tt_transformers/tt/generator_vllm.py

Lines changed: 109 additions & 0 deletions
```diff
@@ -373,3 +373,112 @@ def decode_forward(self, *args, **kwargs):

     def allocate_kv_cache(self, *args, **kwargs):
         return allocate_vllm_kv_cache(*args, **kwargs, dp_model=self.model, tt_cache_path=self.cache_path)
+
+
+def input_processor_for_gemma(ctx: InputContext, inputs: Union[DecoderOnlyInputs, EncoderDecoderInputs]):
+    input_processor = ctx.get_hf_processor()
+    if "prompt" in inputs:
+        prompt_text = inputs["prompt"]
+    else:
+        assert "prompt_token_ids" in inputs, "prompt_token_ids must be available in server mode"
+        prompt_text = input_processor.decode(inputs["prompt_token_ids"], skip_special_tokens=False)
+
+    if "multi_modal_data" in inputs and "image" in inputs["multi_modal_data"]:
+        images = inputs["multi_modal_data"]["image"]
+    else:
+        images = None
+
+    processed_inputs = input_processor(
+        text=prompt_text,
+        images=images,
+        return_tensors="pt",
+    )
+
+    assert processed_inputs.input_ids.shape[0] == 1, "Only one image is processed at a time by vLLM"
+    return {
+        "type": inputs["type"],
+        "prompt_token_ids": processed_inputs.input_ids[0].tolist(),
+        "prompt": prompt_text,
+        "multi_modal_data": {"image": processed_inputs},  # [INFO] add processed_inputs
+    }
+
+
+from types import SimpleNamespace
+
+
+class CustomNamespace(SimpleNamespace):
+    def __contains__(self, key):
+        return key in self.__dict__
+
+
+@INPUT_REGISTRY.register_input_processor(input_processor_for_gemma)
+class Gemma3ForConditionalGeneration(Generator, SupportsMultiModal):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.GEMMA_IMAGE_TOKEN_ID = 262144
+        self.max_gen_len = self.model_args[0].max_seq_len - 1  # TODO: double check what this should be
+
+    @classmethod
+    def initialize_vllm_model(
+        cls, hf_config, mesh_device, max_batch_size, max_seq_len=131072, n_layers=None, tt_data_parallel=1
+    ):
+        submesh_devices = create_submeshes(mesh_device, tt_data_parallel)
+
+        model_args = []
+        model = []
+        state_dict = None
+
+        for submesh in submesh_devices:
+            model_args_i, model_i, state_dict = create_multimodal_model(
+                mesh_device=submesh,
+                max_batch_size=max_batch_size // tt_data_parallel,
+                max_seq_len=max_seq_len,
+                use_paged_kv_cache=True,
+                checkpoint=state_dict,
+            )
+            model_args.append(model_args_i)
+            model.append(model_i)
+
+        return cls(model, model_args, mesh_device)
+
+    @property
+    def cache_path(self):
+        return self.model_args[0].model_cache_path
+
+    def prefill_forward(self, *args, **kwargs):
+        self.tokenizer = self.model_args[0].tokenizer
+        pad_token_id = self.tokenizer.pad_token_id
+
+        tokens = kwargs["tokens"]
+        prompt_lens = kwargs["prompt_lens"]
+        inputs = CustomNamespace()
+        inputs.input_ids = tokens
+        data = kwargs.get("images", None)  # This contains the entire data list, not just the pixel values
+        for i in range(tokens.shape[0]):  # for each user, fix their padding
+            tokens[i][prompt_lens[i] :] = pad_token_id
+        pixel_values = None
+
+        if hasattr(data[0], "pixel_values"):
+            # If inputs is a list of objects with .pixel_values, concatenate them
+            pixel_values = torch.concat([im.pixel_values for im in data if hasattr(im, "pixel_values")], dim=0)
+
+        page_table = kwargs.get("page_table", None)
+        kv_cache = kwargs.get("kv_cache", None)
+        vision_images = pixel_values
+
+        vision_images = [vision_images] if vision_images is not None else None
+
+        return super().prefill_forward_text(
+            tokens=inputs.input_ids,
+            page_table=page_table,
+            kv_cache=kv_cache,
+            prompt_lens=prompt_lens,
+            pixel_values=vision_images,
+        )
+
+    def allocate_kv_cache(self, *args, **kwargs):
+        return allocate_vllm_kv_cache(*args, **kwargs, dp_model=self.model, tt_cache_path=self.cache_path)
+
+    def decode_forward(self, *args, **kwargs):
+        return super().decode_forward_text(*args, **kwargs)
```
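One detail worth illustrating: `SimpleNamespace` does not support the `in` operator, which is why `CustomNamespace` overrides `__contains__`, so the namespace can be probed the same way as the dict-style `inputs` that vLLM otherwise passes around. A standalone sketch of the behavior (plain Python, no vLLM required):

```python
from types import SimpleNamespace


class CustomNamespace(SimpleNamespace):
    def __contains__(self, key):
        return key in self.__dict__


ns = CustomNamespace()
ns.input_ids = [[1, 2, 3]]

print("input_ids" in ns)     # True: membership is checked against __dict__
print("pixel_values" in ns)  # False: attribute was never set
# By contrast, `"input_ids" in SimpleNamespace()` raises TypeError,
# because SimpleNamespace defines neither __contains__ nor __iter__.
```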

models/tt_transformers/tt/multimodal/gemma/gemma_e2e_model.py

Lines changed: 12 additions & 9 deletions
```diff
@@ -73,15 +73,16 @@ def prepare_inputs_prefill(self, pt_tokens, start_pos=0, page_table=None, chunk_

         vision_output = self.compute_vision_token(**kwargs)
         tokens_embd = ttnn.to_torch(tokens_embd, mesh_composer=ttnn.ConcatMeshToTensor(self.mesh_device, dim=-1))
-        comp_vision_output = ttnn.to_torch(
-            vision_output, mesh_composer=ttnn.ConcatMeshToTensor(self.mesh_device, dim=0)
-        )[: vision_output.shape[0], :]
+        if vision_output is not None:
+            comp_vision_output = ttnn.to_torch(
+                vision_output, mesh_composer=ttnn.ConcatMeshToTensor(self.mesh_device, dim=0)
+            )[: vision_output.shape[0], :]

-        image_features = comp_vision_output.squeeze(0)
-        special_image_mask = (pt_tokens == self.args.image_token_index).unsqueeze(-1)
-        special_image_mask = special_image_mask.expand_as(tokens_embd)
-        image_features = image_features.to(tokens_embd.device, tokens_embd.dtype)
-        tokens_embd = tokens_embd.masked_scatter(special_image_mask, image_features)
+            image_features = comp_vision_output.squeeze(0)
+            special_image_mask = (pt_tokens == self.args.image_token_index).unsqueeze(-1)
+            special_image_mask = special_image_mask.expand_as(tokens_embd)
+            image_features = image_features.to(tokens_embd.device, tokens_embd.dtype)
+            tokens_embd = tokens_embd.masked_scatter(special_image_mask, image_features)

         tokens_embd = self.args.prepare_residual_tensor_prefill(
             tokens_embd,
@@ -127,6 +128,8 @@ def prepare_inputs_prefill(self, pt_tokens, start_pos=0, page_table=None, chunk_

         return tokens_embd, [tt_rot_mats_prefill_global, tt_rot_mats_prefill_local], tt_page_table, tt_chunk_page_table

-    def compute_vision_token(self, pixel_values):
+    def compute_vision_token(self, pixel_values=None):
+        if pixel_values is None:
+            return None
         vision_output = self.vision_model(pixel_values)
         return vision_output
```
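For reference, the masked-scatter splice guarded by the new `if vision_output is not None:` branch follows the usual Hugging Face multimodal recipe: embed every token, then overwrite the rows at image-token positions with vision features. A self-contained PyTorch toy (shapes and the token id are made up for illustration; the real code uses `self.args.image_token_index`):

```python
import torch

IMAGE_TOKEN_ID = 7                        # hypothetical image token id
pt_tokens = torch.tensor([[1, 7, 7, 2]])  # (batch, seq)
tokens_embd = torch.zeros(1, 4, 8)        # (batch, seq, hidden) text embeddings
image_features = torch.ones(2, 8)         # one feature row per image token

# Mask the image-token positions, broadcast the mask over the hidden dim,
# and scatter the vision rows into the embedding tensor in order.
mask = (pt_tokens == IMAGE_TOKEN_ID).unsqueeze(-1).expand_as(tokens_embd)
tokens_embd = tokens_embd.masked_scatter(mask, image_features.to(tokens_embd.dtype))

assert tokens_embd[0, 1].eq(1).all() and tokens_embd[0, 3].eq(0).all()
```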

models/tt_transformers/tt/multimodal/gemma/gemma_image_mlp.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -117,4 +117,11 @@ def forward(self, x: ttnn.Tensor) -> ttnn.Tensor:
         pre_bias_output = c_proj_out

         output = ttnn.add(pre_bias_output, self.c_proj_bias)
+
+        ttnn.deallocate(c_fc_out)
+        ttnn.deallocate(c_proj_out)
+        ttnn.deallocate(pre_bias_output)
+        # Deallocate input tensor to free memory
+        ttnn.deallocate(x_in)
+        # Reshape output back to original shape
         return output
```