Commit a2848ee

Migrate Gemma-3-4B-IT to TT-Transformers
1 parent: 3fcf34b

49 files changed (+776, -5274 lines)

models/common/rmsnorm.py

Lines changed: 7 additions & 2 deletions
@@ -85,7 +85,7 @@ def __init__(
                 torch_weight,
                 device=device,
                 dtype=weight_dtype,
-                layout=ttnn.ROW_MAJOR_LAYOUT,
+                layout=ttnn.TILE_LAYOUT,
                 memory_config=weight_memory_config,
                 cache_file_name=cache_name,
                 mesh_mapper=ttnn.ReplicateTensorToMesh(device) if is_mesh_device else None,
@@ -96,7 +96,7 @@ def __init__(
                 torch_weight,
                 device=device,
                 dtype=weight_dtype,
-                layout=ttnn.ROW_MAJOR_LAYOUT,
+                layout=ttnn.TILE_LAYOUT,
                 memory_config=weight_memory_config,
                 cache_file_name=cache_name,
                 mesh_mapper=ttnn.ShardTensor2dMesh(device, dims=(None, 2), mesh_shape=list(device.shape))
@@ -128,6 +128,11 @@ def forward(self, x: ttnn.Tensor, mode, in_sharded=False, out_sharded=False) ->
         else:
             assert not out_sharded, "Non-sharded version of RMSNorm cannot output a sharded tensor"

+        if x.shape[-1] % weight.shape[-1] == 0:
+            # Reshape weight only if x's last dimension is divisible by weight's last dimension,
+            # to avoid padding errors in RMSNorm when dimensions are not aligned
+            weight = ttnn.reshape(weight, [1, 1, 1, -1])
+
         x = norm(
             x,
             epsilon=self.eps,
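
The forward() change above conditionally flattens the RMSNorm gamma weight so it broadcasts along the activation's last dimension. As a rough point of reference (a minimal torch sketch, not the ttnn implementation; rmsnorm_reference and the 4D activation shape are illustrative assumptions), the same broadcast looks like this:

import torch

def rmsnorm_reference(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Illustrative only: x is assumed to be 4D, e.g. (1, 1, seq_len, hidden_dim),
    # with one scale per hidden channel in weight.
    rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps)
    # Flattening weight to [1, 1, 1, -1] mirrors the conditional reshape in the
    # diff above: the scale then broadcasts over x's last dimension directly.
    return x * rms * weight.reshape(1, 1, 1, -1)

# Example usage with hypothetical dimensions:
# y = rmsnorm_reference(torch.randn(1, 1, 32, 2560), torch.ones(2560))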

models/experimental/gemma3_4b/tests/test_attention.py

Lines changed: 0 additions & 279 deletions
This file was deleted.
