diff --git a/src/transformers/models/apertus/configuration_apertus.py b/src/transformers/models/apertus/configuration_apertus.py
index 6588095e8521..22f7d51e1f88 100644
--- a/src/transformers/models/apertus/configuration_apertus.py
+++ b/src/transformers/models/apertus/configuration_apertus.py
@@ -100,10 +100,10 @@ class ApertusConfig(PreTrainedConfig):
     model_type = "apertus"
     keys_to_ignore_at_inference = ["past_key_values"]
     base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
-        "layers.*.self_attn.k_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
-        "layers.*.self_attn.v_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
-        "layers.*.self_attn.o_proj": "rowwise_rep",  # we need to replicate here due to the added norm on q and k
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",
     }
diff --git a/src/transformers/models/apertus/modular_apertus.py b/src/transformers/models/apertus/modular_apertus.py
index a60daa2f8194..fa605b6519d1 100644
--- a/src/transformers/models/apertus/modular_apertus.py
+++ b/src/transformers/models/apertus/modular_apertus.py
@@ -117,10 +117,10 @@ class ApertusConfig(LlamaConfig):
 
     model_type = "apertus"
     base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
-        "layers.*.self_attn.k_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
-        "layers.*.self_attn.v_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
-        "layers.*.self_attn.o_proj": "rowwise_rep",  # we need to replicate here due to the added norm on q and k
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",
     }
diff --git a/src/transformers/models/doge/configuration_doge.py b/src/transformers/models/doge/configuration_doge.py
index 27e7b8404225..77a7209d528f 100644
--- a/src/transformers/models/doge/configuration_doge.py
+++ b/src/transformers/models/doge/configuration_doge.py
@@ -117,11 +117,6 @@ class DogeConfig(PreTrainedConfig):
         "layers.*.self_attn.v_proj": "colwise",
         "layers.*.self_attn.dt_proj": "rowwise",
         "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.input_layernorm.weight": "sequence_parallel",
-        "layers.*.input_residual": "sequence_parallel",
-        "layers.*.post_attention_layernorm.weight": "sequence_parallel",
-        "layers.*.post_attention_residual": "sequence_parallel",
-        "norm.weight": "sequence_parallel",
         "layers.*.mlp.gate_proj": "colwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",
diff --git a/src/transformers/models/doge/modular_doge.py b/src/transformers/models/doge/modular_doge.py
index eacea60cf442..bf10d15c003e 100644
--- a/src/transformers/models/doge/modular_doge.py
+++ b/src/transformers/models/doge/modular_doge.py
@@ -146,11 +146,6 @@ class DogeConfig(PreTrainedConfig):
         "layers.*.self_attn.v_proj": "colwise",
         "layers.*.self_attn.dt_proj": "rowwise",
         "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.input_layernorm.weight": "sequence_parallel",
-        "layers.*.input_residual": "sequence_parallel",
-        "layers.*.post_attention_layernorm.weight": "sequence_parallel",
-        "layers.*.post_attention_residual": "sequence_parallel",
-        "norm.weight": "sequence_parallel",
         "layers.*.mlp.gate_proj": "colwise",
         "layers.*.mlp.up_proj": "colwise",
         "layers.*.mlp.down_proj": "rowwise",
diff --git a/src/transformers/models/nanochat/configuration_nanochat.py b/src/transformers/models/nanochat/configuration_nanochat.py
index 998b08b31959..ba8a0f124e95 100644
--- a/src/transformers/models/nanochat/configuration_nanochat.py
+++ b/src/transformers/models/nanochat/configuration_nanochat.py
@@ -94,10 +94,10 @@ class NanoChatConfig(PretrainedConfig):
 
     keys_to_ignore_at_inference = ["past_key_values"]
     base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise_rep",
-        "layers.*.self_attn.k_proj": "colwise_rep",
-        "layers.*.self_attn.v_proj": "colwise_rep",
-        "layers.*.self_attn.o_proj": "rowwise_rep",
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
         "layers.*.mlp.fc1": "colwise",
         "layers.*.mlp.fc2": "rowwise",
     }