From 1c7590abb4b7a0eec06f407a2c5e804bede2ef93 Mon Sep 17 00:00:00 2001
From: Cyril Vallez
Date: Thu, 27 Nov 2025 10:30:25 +0100
Subject: [PATCH 1/3] gemma3

---
 src/transformers/models/gemma3/configuration_gemma3.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py
index eedca6a49624..ca7dfe5fb3e8 100644
--- a/src/transformers/models/gemma3/configuration_gemma3.py
+++ b/src/transformers/models/gemma3/configuration_gemma3.py
@@ -117,10 +117,10 @@ class Gemma3TextConfig(PreTrainedConfig):
     model_type = "gemma3_text"
     keys_to_ignore_at_inference = ["past_key_values"]
     base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise",
-        "layers.*.self_attn.k_proj": "colwise",
-        "layers.*.self_attn.v_proj": "colwise",
-        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.self_attn.q_proj": "colwise_rep",
+        "layers.*.self_attn.k_proj": "colwise_rep",
+        "layers.*.self_attn.v_proj": "colwise_rep",
+        "layers.*.self_attn.o_proj": "rowwise_rep",
         "layers.*.mlp.gate_proj": "colwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",

From 5023f0e7443e5771e599c92e8f4a02b69ffd15f3 Mon Sep 17 00:00:00 2001
From: Cyril Vallez
Date: Thu, 27 Nov 2025 10:36:41 +0100
Subject: [PATCH 2/3] qwen3 and modulars

---
 src/transformers/models/gemma3/configuration_gemma3.py | 8 ++++----
 src/transformers/models/gemma3/modular_gemma3.py       | 9 +++++++++
 src/transformers/models/qwen3/configuration_qwen3.py   | 8 ++++----
 .../qwen3_omni_moe/configuration_qwen3_omni_moe.py     | 8 ++++----
 4 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py
index ca7dfe5fb3e8..9c6ca945aec7 100644
--- a/src/transformers/models/gemma3/configuration_gemma3.py
+++ b/src/transformers/models/gemma3/configuration_gemma3.py
@@ -117,10 +117,10 @@ class Gemma3TextConfig(PreTrainedConfig):
     model_type = "gemma3_text"
     keys_to_ignore_at_inference = ["past_key_values"]
     base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise_rep",
-        "layers.*.self_attn.k_proj": "colwise_rep",
-        "layers.*.self_attn.v_proj": "colwise_rep",
-        "layers.*.self_attn.o_proj": "rowwise_rep",
+        "layers.*.self_attn.q_proj": "colwise_rep",  # we need to replicate here due to the added norm
+        "layers.*.self_attn.k_proj": "colwise_rep",  # we need to replicate here due to the added norm
+        "layers.*.self_attn.v_proj": "colwise_rep",  # we need to replicate here due to the added norm
+        "layers.*.self_attn.o_proj": "rowwise_rep",  # we need to replicate here due to the added norm
         "layers.*.mlp.gate_proj": "colwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",
diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py
index 31f25550df03..2b106d3fe908 100644
--- a/src/transformers/models/gemma3/modular_gemma3.py
+++ b/src/transformers/models/gemma3/modular_gemma3.py
@@ -146,6 +146,15 @@ class Gemma3TextConfig(Gemma2Config, PreTrainedConfig):
     """

     model_type = "gemma3_text"
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise_rep",  # we need to replicate here due to the added norm
+        "layers.*.self_attn.k_proj": "colwise_rep",  # we need to replicate here due to the added norm
+        "layers.*.self_attn.v_proj": "colwise_rep",  # we need to replicate here due to the added norm
+        "layers.*.self_attn.o_proj": "rowwise_rep",  # we need to replicate here due to the added norm
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }

     def __init__(
         self,
diff --git a/src/transformers/models/qwen3/configuration_qwen3.py b/src/transformers/models/qwen3/configuration_qwen3.py
index 3b4ef21bd386..5dfde825d39a 100644
--- a/src/transformers/models/qwen3/configuration_qwen3.py
+++ b/src/transformers/models/qwen3/configuration_qwen3.py
@@ -105,10 +105,10 @@ class Qwen3Config(PreTrainedConfig):

     # Default tensor parallel plan for base model `Qwen3`
     base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise",
-        "layers.*.self_attn.k_proj": "colwise",
-        "layers.*.self_attn.v_proj": "colwise",
-        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.self_attn.q_proj": "colwise_rep",  # we need to replicate here due to the added norm
+        "layers.*.self_attn.k_proj": "colwise_rep",  # we need to replicate here due to the added norm
+        "layers.*.self_attn.v_proj": "colwise_rep",  # we need to replicate here due to the added norm
+        "layers.*.self_attn.o_proj": "rowwise_rep",  # we need to replicate here due to the added norm
         "layers.*.mlp.gate_proj": "colwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",
diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py
index 6d49a021e3de..df4fa92f6436 100644
--- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py
+++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py
@@ -532,10 +532,10 @@ class Qwen3OmniMoeTalkerCodePredictorConfig(PreTrainedConfig):

     # Default tensor parallel plan for base model `Qwen3OmniMoeTalkerCodePredictor`
     base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise",
-        "layers.*.self_attn.k_proj": "colwise",
-        "layers.*.self_attn.v_proj": "colwise",
-        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.self_attn.q_proj": "colwise_rep",  # we need to replicate here due to the added norm
+        "layers.*.self_attn.k_proj": "colwise_rep",  # we need to replicate here due to the added norm
+        "layers.*.self_attn.v_proj": "colwise_rep",  # we need to replicate here due to the added norm
+        "layers.*.self_attn.o_proj": "rowwise_rep",  # we need to replicate here due to the added norm
         "layers.*.mlp.gate_proj": "colwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",

From 6874739d7a6e155a5d19bf8d946358c79450488b Mon Sep 17 00:00:00 2001
From: vasqu
Date: Thu, 27 Nov 2025 12:05:06 +0100
Subject: [PATCH 3/3] fix tp plans

---
 src/transformers/models/apertus/configuration_apertus.py | 8 ++++----
 src/transformers/models/apertus/modular_apertus.py       | 8 ++++----
 src/transformers/models/doge/configuration_doge.py       | 5 -----
 src/transformers/models/doge/modular_doge.py             | 5 -----
 src/transformers/models/gemma3/configuration_gemma3.py   | 8 ++++----
 src/transformers/models/gemma3/modular_gemma3.py         | 9 ---------
 .../models/nanochat/configuration_nanochat.py            | 8 ++++----
 src/transformers/models/qwen3/configuration_qwen3.py     | 8 ++++----
 .../qwen3_omni_moe/configuration_qwen3_omni_moe.py       | 8 ++++----
 9 files changed, 24 insertions(+), 43 deletions(-)

diff --git a/src/transformers/models/apertus/configuration_apertus.py b/src/transformers/models/apertus/configuration_apertus.py
index 6588095e8521..22f7d51e1f88 100644
--- a/src/transformers/models/apertus/configuration_apertus.py
+++ b/src/transformers/models/apertus/configuration_apertus.py
@@ -100,10 +100,10 @@ class ApertusConfig(PreTrainedConfig):
     model_type = "apertus"
     keys_to_ignore_at_inference = ["past_key_values"]
     base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
-        "layers.*.self_attn.k_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
-        "layers.*.self_attn.v_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
-        "layers.*.self_attn.o_proj": "rowwise_rep",  # we need to replicate here due to the added norm on q and k
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",
     }
diff --git a/src/transformers/models/apertus/modular_apertus.py b/src/transformers/models/apertus/modular_apertus.py
index a60daa2f8194..fa605b6519d1 100644
--- a/src/transformers/models/apertus/modular_apertus.py
+++ b/src/transformers/models/apertus/modular_apertus.py
@@ -117,10 +117,10 @@ class ApertusConfig(LlamaConfig):
     model_type = "apertus"

     base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
-        "layers.*.self_attn.k_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
-        "layers.*.self_attn.v_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
-        "layers.*.self_attn.o_proj": "rowwise_rep",  # we need to replicate here due to the added norm on q and k
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",
     }
diff --git a/src/transformers/models/doge/configuration_doge.py b/src/transformers/models/doge/configuration_doge.py
index 27e7b8404225..77a7209d528f 100644
--- a/src/transformers/models/doge/configuration_doge.py
+++ b/src/transformers/models/doge/configuration_doge.py
@@ -117,11 +117,6 @@ class DogeConfig(PreTrainedConfig):
         "layers.*.self_attn.v_proj": "colwise",
         "layers.*.self_attn.dt_proj": "rowwise",
         "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.input_layernorm.weight": "sequence_parallel",
-        "layers.*.input_residual": "sequence_parallel",
-        "layers.*.post_attention_layernorm.weight": "sequence_parallel",
-        "layers.*.post_attention_residual": "sequence_parallel",
-        "norm.weight": "sequence_parallel",
         "layers.*.mlp.gate_proj": "colwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",
diff --git a/src/transformers/models/doge/modular_doge.py b/src/transformers/models/doge/modular_doge.py
index eacea60cf442..bf10d15c003e 100644
--- a/src/transformers/models/doge/modular_doge.py
+++ b/src/transformers/models/doge/modular_doge.py
@@ -146,11 +146,6 @@ class DogeConfig(PreTrainedConfig):
         "layers.*.self_attn.v_proj": "colwise",
         "layers.*.self_attn.dt_proj": "rowwise",
         "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.input_layernorm.weight": "sequence_parallel",
-        "layers.*.input_residual": "sequence_parallel",
-        "layers.*.post_attention_layernorm.weight": "sequence_parallel",
-        "layers.*.post_attention_residual": "sequence_parallel",
-        "norm.weight": "sequence_parallel",
         "layers.*.mlp.gate_proj": "colwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",
diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py
index 9c6ca945aec7..eedca6a49624 100644
--- a/src/transformers/models/gemma3/configuration_gemma3.py
+++ b/src/transformers/models/gemma3/configuration_gemma3.py
@@ -117,10 +117,10 @@ class Gemma3TextConfig(PreTrainedConfig):
     model_type = "gemma3_text"
     keys_to_ignore_at_inference = ["past_key_values"]
     base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise_rep",  # we need to replicate here due to the added norm
-        "layers.*.self_attn.k_proj": "colwise_rep",  # we need to replicate here due to the added norm
-        "layers.*.self_attn.v_proj": "colwise_rep",  # we need to replicate here due to the added norm
-        "layers.*.self_attn.o_proj": "rowwise_rep",  # we need to replicate here due to the added norm
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
         "layers.*.mlp.gate_proj": "colwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",
diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py
index 2b106d3fe908..31f25550df03 100644
--- a/src/transformers/models/gemma3/modular_gemma3.py
+++ b/src/transformers/models/gemma3/modular_gemma3.py
@@ -146,15 +146,6 @@ class Gemma3TextConfig(Gemma2Config, PreTrainedConfig):
     """

     model_type = "gemma3_text"
-    base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise_rep",  # we need to replicate here due to the added norm
-        "layers.*.self_attn.k_proj": "colwise_rep",  # we need to replicate here due to the added norm
-        "layers.*.self_attn.v_proj": "colwise_rep",  # we need to replicate here due to the added norm
-        "layers.*.self_attn.o_proj": "rowwise_rep",  # we need to replicate here due to the added norm
-        "layers.*.mlp.gate_proj": "colwise",
-        "layers.*.mlp.up_proj": "colwise",
-        "layers.*.mlp.down_proj": "rowwise",
-    }

     def __init__(
         self,
diff --git a/src/transformers/models/nanochat/configuration_nanochat.py b/src/transformers/models/nanochat/configuration_nanochat.py
index 998b08b31959..ba8a0f124e95 100644
--- a/src/transformers/models/nanochat/configuration_nanochat.py
+++ b/src/transformers/models/nanochat/configuration_nanochat.py
@@ -94,10 +94,10 @@ class NanoChatConfig(PretrainedConfig):
     keys_to_ignore_at_inference = ["past_key_values"]

     base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise_rep",
-        "layers.*.self_attn.k_proj": "colwise_rep",
-        "layers.*.self_attn.v_proj": "colwise_rep",
-        "layers.*.self_attn.o_proj": "rowwise_rep",
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
         "layers.*.mlp.fc1": "colwise",
         "layers.*.mlp.fc2": "rowwise",
     }
diff --git a/src/transformers/models/qwen3/configuration_qwen3.py b/src/transformers/models/qwen3/configuration_qwen3.py
index 5dfde825d39a..3b4ef21bd386 100644
--- a/src/transformers/models/qwen3/configuration_qwen3.py
+++ b/src/transformers/models/qwen3/configuration_qwen3.py
@@ -105,10 +105,10 @@ class Qwen3Config(PreTrainedConfig):

     # Default tensor parallel plan for base model `Qwen3`
     base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise_rep",  # we need to replicate here due to the added norm
-        "layers.*.self_attn.k_proj": "colwise_rep",  # we need to replicate here due to the added norm
-        "layers.*.self_attn.v_proj": "colwise_rep",  # we need to replicate here due to the added norm
-        "layers.*.self_attn.o_proj": "rowwise_rep",  # we need to replicate here due to the added norm
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
         "layers.*.mlp.gate_proj": "colwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",
diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py
index df4fa92f6436..6d49a021e3de 100644
--- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py
+++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py
@@ -532,10 +532,10 @@ class Qwen3OmniMoeTalkerCodePredictorConfig(PreTrainedConfig):

     # Default tensor parallel plan for base model `Qwen3OmniMoeTalkerCodePredictor`
     base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise_rep",  # we need to replicate here due to the added norm
-        "layers.*.self_attn.k_proj": "colwise_rep",  # we need to replicate here due to the added norm
-        "layers.*.self_attn.v_proj": "colwise_rep",  # we need to replicate here due to the added norm
-        "layers.*.self_attn.o_proj": "rowwise_rep",  # we need to replicate here due to the added norm
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
         "layers.*.mlp.gate_proj": "colwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",
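
Usage note (illustrative only, not part of the patch series): the `base_model_tp_plan` defaults touched above are only consumed when a model is loaded with tensor parallelism requested. A minimal sketch, assuming a transformers release that supports `tp_plan="auto"` and using `Qwen/Qwen3-0.6B` purely as an example checkpoint (both are assumptions, not taken from these patches); launch under torchrun, e.g. `torchrun --nproc-per-node 2 tp_demo.py`:

    # Sketch only: shows where the default TP plan lives and how it is requested.
    from transformers import AutoConfig, AutoModelForCausalLM

    config = AutoConfig.from_pretrained("Qwen/Qwen3-0.6B")  # example checkpoint (assumed)
    # With PATCH 3/3 applied, the attention projections use the plain sharded
    # styles again instead of the replicated "_rep" variants.
    print(config.base_model_tp_plan["layers.*.self_attn.q_proj"])  # -> "colwise"

    # Requesting TP at load time lets the library pick up `base_model_tp_plan`
    # and shard the projections across the ranks started by torchrun.
    model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B", tp_plan="auto")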