4 changes: 4 additions & 0 deletions src/transformers/configuration_utils.py
@@ -751,6 +751,10 @@ def from_dict(

config = cls(**config_dict)

# default tie_word_embeddings to False if None, see https://github.com/huggingface/transformers/issues/42313
if hasattr(config, "tie_word_embeddings") and config.tie_word_embeddings is None:
config.tie_word_embeddings = False

# Update config with kwargs if needed
if "num_labels" in kwargs and "id2label" in kwargs:
num_labels = kwargs["num_labels"]
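To illustrate the intended behavior of the from_dict change above, a minimal sketch (LlamaConfig is used here only as an example of a config class exposing tie_word_embeddings; it is not part of this PR):

```python
from transformers import LlamaConfig

# A serialized config that leaves tie_word_embeddings unset (None) used to be ambiguous;
# with this change, from_dict() resolves it to False instead of leaving it as None.
config = LlamaConfig.from_dict({"tie_word_embeddings": None})
print(config.tie_word_embeddings)  # expected: False with this PR applied
```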
1 change: 1 addition & 0 deletions src/transformers/conversion_mapping.py
@@ -116,6 +116,7 @@ def _build_checkpoint_conversion_mapping():
mapping["qwen3_next"] = mapping["qwen2_moe"].copy()
mapping["qwen3_vl_moe"] = mapping["qwen2_moe"].copy()
mapping["hunyuan_v1_moe"] = mapping["qwen2_moe"].copy()
mapping["olmoe"] = mapping["qwen2_moe"].copy()
mapping["minimax"] = mapping["mixtral"].copy()

return mapping
1 change: 0 additions & 1 deletion src/transformers/core_model_loading.py
@@ -504,7 +504,6 @@ def set_param_for_module(
missing_keys.discard(target_name)
if ref is not None and ref.shape != param_value.shape and hf_quantizer is None:
mismatch_keys.add((target_name, param_value.shape, ref.shape))
module_obj.param_name._is_hf_initialized = False # Needs to be initialized
Comment on lines 506 to -507
Member:
Yes, it was completely masked by log_to_misc, along with other issues we currently have on some models. We will need to update the logger to avoid silencing important issues!

else:
# super important otherwise _init_weight will re-init the param
param_value._is_hf_initialized = True
6 changes: 6 additions & 0 deletions src/transformers/integrations/hub_kernels.py
@@ -290,6 +290,12 @@ def lazy_load_kernel(kernel_name: str, mapping: dict[str, ModuleType | None] = _
mapping[kernel_name] = kernel
except FileNotFoundError:
mapping[kernel_name] = None
except AssertionError as error:
logger.warning_once(
f"Failed to load the '{kernel_name}' kernel from '{repo_id}' because the current environment does not "
f"support the required backend: {error}"
)
mapping[kernel_name] = None

else:
# Try to import is_{kernel_name}_available from ..utils
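The addition above follows the usual "treat an unsupported backend as an unavailable kernel" pattern: an AssertionError raised while importing the kernel is downgraded to a warning, and the kernel is recorded as None so callers fall back to the default implementation. A self-contained sketch of that pattern (a hypothetical loader function, not the hub_kernels API):

```python
import logging
from types import ModuleType
from typing import Callable, Optional

logger = logging.getLogger(__name__)


def try_load_kernel(kernel_name: str, loader: Callable[[str], ModuleType]) -> Optional[ModuleType]:
    """Return the loaded kernel module, or None if it is missing or unsupported."""
    try:
        return loader(kernel_name)
    except FileNotFoundError:
        # The kernel artifact simply is not there.
        return None
    except AssertionError as error:
        # Some kernels assert on the backend (e.g. a specific GPU architecture) at
        # import time; downgrade that to a warning and fall back to the default path.
        logger.warning(
            "Failed to load the '%s' kernel because the current environment does not "
            "support the required backend: %s",
            kernel_name,
            error,
        )
        return None
```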
21 changes: 18 additions & 3 deletions src/transformers/modeling_utils.py
@@ -2254,8 +2254,17 @@ def get_expanded_tied_weights_keys(self, all_submodels: bool = False) -> dict:
return expanded_tied_weights

tied_mapping = self._tied_weights_keys
text_config = self.config.get_text_config(decoder=True)
if not hasattr(text_config, "tie_word_embeddings"):
logger.warning(
f"Text config {text_config.__class__.__name__} does not have 'tie_word_embeddings' attribute. "
"This may cause issues with weight tying."
)
tie_word_embeddings = getattr(text_config, "tie_word_embeddings", None)
Comment on lines +2257 to +2263
Member:
I don't think all multimodals rely on the text_config here unfortunately, do they? That is, what we want is for each model to decide on its own whether it should tie its own weights. So we don't really want to check the text_config...

Contributor Author:
hmm, it's the opposite of what we discussed above. In that case, what do you suggest?

Member:
Well, the issue is that if you have a submodel such as self.text_model = AutoModel._from_config(text_config), you cannot know its weights in advance, as it can be any model. So that model is the only one responsible for its own weights.
We cannot delegate this to the top-most model, as that makes it way too hard and not scalable in general.
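To make the delegation idea concrete, a schematic sketch with hypothetical classes (this is not how transformers implements it; it only illustrates "each submodel ties its own weights from its own config"):

```python
from types import SimpleNamespace

import torch.nn as nn


class TextSubModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_tokens = nn.Embedding(32, 8)
        self.lm_head = nn.Linear(8, 32, bias=False)

    def tie_weights(self):
        # The submodel alone decides, from its own config, whether to tie.
        if getattr(self.config, "tie_word_embeddings", False):
            self.lm_head.weight = self.embed_tokens.weight


class CompositeModel(nn.Module):
    def __init__(self, text_config):
        super().__init__()
        # The concrete text model could be anything (the AutoModel._from_config case),
        # so the parent cannot hardcode the submodel's tied keys at class level.
        self.text_model = TextSubModel(text_config)

    def tie_weights(self):
        # Delegate: each child handles its own tying.
        for module in self.children():
            if hasattr(module, "tie_weights"):
                module.tie_weights()


model = CompositeModel(SimpleNamespace(tie_word_embeddings=True))
model.tie_weights()
assert model.text_model.lm_head.weight is model.text_model.embed_tokens.weight
```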

tie_encoder_decoder = getattr(self.config, "tie_encoder_decoder", False)
should_tie = tie_encoder_decoder if tie_word_embeddings is None else tie_word_embeddings
Member:
So we basically disregard tie_encoder_decoder completely? Because with your change in configuration_utils, tie_word_embeddings will NEVER be None anymore, if I understand correctly.

Contributor Author:
Yeah, I'm reverting this unfortunately; I would indeed like to get rid of tie_encoder_decoder.

# If the config does not specify any tying, return empty dict
if not self.config.tie_word_embeddings and not self.config.tie_encoder_decoder:
if not should_tie:
return {}
# If None, return empty dict
elif tied_mapping is None:
@@ -3178,7 +3187,11 @@ def save_pretrained(
shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1}

# Recursively descend to find tied weight keys
_tied_weights_keys = set(_get_tied_weight_keys(self))
tied_keys_attr = getattr(self, "all_tied_weights_keys", None)
if tied_keys_attr is not None:
_tied_weights_keys = set(tied_keys_attr.keys())
else:
_tied_weights_keys = set(_get_tied_weight_keys(self))
Comment on lines +3190 to +3194
Contributor Author:
This is the main change. IMO it's why tie_word_embeddings is not authoritative right now: it is entirely skipped if _tied_weights_keys is filled.

Member:
Yeah, I think we fill it in in many cases just to pass some tests, even if the released models don't tie weights.

Member:
Also, I am a bit lost here. Do we now try to tie weights with the _tied_weights_keys attribute even when config.tie_word_embeddings = False?

Contributor Author:
Yes, it's what I observed. I tried to get a reproducer here, @zucchini-nlp @ArthurZucker:

from transformers import PaliGemmaConfig, PaliGemmaForConditionalGeneration
from transformers.models.gemma.configuration_gemma import GemmaConfig
from transformers.models.siglip.configuration_siglip import SiglipVisionConfig

vision_config = SiglipVisionConfig(
    hidden_size=32,
    intermediate_size=64,
    num_hidden_layers=1,
    num_attention_heads=2,
    image_size=32,
    patch_size=4,
    vocab_size=64,
    projection_dim=32,
).to_dict()

text_config = GemmaConfig(
    vocab_size=64,
    hidden_size=32,
    intermediate_size=64,
    num_hidden_layers=1,
    num_attention_heads=4,
    num_key_value_heads=4,
    tie_word_embeddings=False,
).to_dict()

config = PaliGemmaConfig(vision_config=vision_config, text_config=text_config)

print("Config tie flags:")
print(f"  config.tie_word_embeddings              -> {config.tie_word_embeddings}")
print(f"  config.text_config.tie_word_embeddings  -> {config.text_config.tie_word_embeddings}")

model = PaliGemmaForConditionalGeneration(config)
lm_head_weight = model.lm_head.weight
input_embed_weight = model.model.language_model.embed_tokens.weight
tied = lm_head_weight.data_ptr() == input_embed_weight.data_ptr()

print("\nModel tying details:")
print(f"  class _tied_weights_keys      -> {model._tied_weights_keys}")
print(f"  model.all_tied_weights_keys   -> {model.all_tied_weights_keys}")
print(f"  lm_head shares embedding tensor? -> {tied}")

This will show tied weights. However, if you go to modeling_paligemma and change

    _tied_weights_keys = {}  # {"lm_head.weight": "model.language_model.embed_tokens.weight"}

then it will work and the weights will not be tied. I think the solution is to use all_tied_weights_keys instead of _get_tied_weight_keys here; it's what I understand from the logic flow.

For instance, this breaks model generation for mistral3, but for several other models as well; this is why I wanted the run-slow upgrade I mentioned.

There might be a simpler fix; I'm not seeing it right now.

Member:
The fact here is that I think some older models were tying weights using _tie_weights (the one with the underscore), which was ALWAYS run, independently of the configs. So they could have tied weights even though the config technically says not to tie. In save_pretrained, we check the pointers of the weight tensors, so we know the tied weights from there and we cannot be wrong about it. But then, if we use the good-citizen all_tied_weights_keys, which correctly looks at the config, some of those tied weights will not find their source, as they are not SUPPOSED to be tied (but they were anyway).

So technically, looking at all potential _tied_weights_keys here is not wrong, as we check pointers anyway, and this avoids this kind of issue.
But indeed, in the future we want to always know which weights are tied and simply rely on the internal list (or, even better, recompute it with get_expanded_tied_weights_keys(all_submodels=True)).
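As a small illustration of the pointer check mentioned above (a minimal standalone sketch, not the save_pretrained code itself):

```python
from collections import defaultdict

import torch.nn as nn

model = nn.Module()
model.embed_tokens = nn.Embedding(10, 4)
model.lm_head = nn.Linear(4, 10, bias=False)
model.lm_head.weight = model.embed_tokens.weight  # tied, regardless of any config flag

# Group parameter names by underlying storage pointer: any group with more than one
# name is a set of tied weights, whatever the config or the class attribute claims.
ptrs = defaultdict(list)
for name, tensor in model.state_dict(keep_vars=True).items():
    ptrs[tensor.data_ptr()].append(name)

shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1}
print(shared_ptrs)  # {<ptr>: ['embed_tokens.weight', 'lm_head.weight']}
```

This is why looking at all potential _tied_weights_keys in save_pretrained is safe: whatever the keys claim, only tensors that actually share storage end up in shared_ptrs.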

Contributor Author:
> So technically, looking at all potential _tied_weights_keys here is not wrong, as we check pointers anyway, and this avoids this kind of issue.

Hmm, OK, but then why are models like mistral3 broken by the current setup? Ah, because you mean we did it in save_pretrained, but load_pretrained has to deal with what exists now?

Not super sure of the way forward there, can you elaborate a bit?

error_names = []
to_delete_names = set()
for names in shared_ptrs.values():
@@ -4410,7 +4423,9 @@ def _move_missing_keys_from_meta_to_cpu(
# The tied weight keys are in the "missing" usually, but they should not be moved (they will be tied anyway)
# This is especially important because if they are moved, they will lose the `_is_hf_initialized` flag, and they
# will be re-initialized for nothing (which can be quite long)
for key in missing_keys - self.all_tied_weights_keys.keys():
tied_keys_attr = getattr(self, "all_tied_weights_keys", {}) or {}
tied_keys = set(tied_keys_attr.keys())
for key in missing_keys - tied_keys:
Comment on lines -4413 to +4428
Member:
Why do we have this? all_tied_weights_keys is guaranteed to be a dict already at this point from post_init

Contributor Author:
Ah yes, indeed, it could be just:

tied_keys = set(self.all_tied_weights_keys.keys())
for key in missing_keys - tied_keys:
    ...  # do stuff

will update

Member:
No need to cast it again as a set; the keys view already supports set operations!
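For reference on that last point, dict views are set-like, so the set difference in the diff works without an explicit cast (plain Python, nothing transformers-specific):

```python
missing_keys = {"lm_head.weight", "model.embed_tokens.weight", "model.norm.weight"}
all_tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}

# dict.keys() returns a view that implements the set interface, so '-' works directly.
print(missing_keys - all_tied_weights_keys.keys())
# {'model.embed_tokens.weight', 'model.norm.weight'} (order may vary)
```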

param = model_state_dict[key]
# Buffers are not initialized on the meta device, so we still need this check to avoid overwriting them
if param.device == torch.device("meta"):
1 change: 1 addition & 0 deletions src/transformers/models/fsmt/configuration_fsmt.py
@@ -194,6 +194,7 @@ def __init__(
bos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
num_hidden_layers=encoder_layers,
tie_word_embeddings=tie_word_embeddings,
)
if "decoder" in common_kwargs:
del common_kwargs["decoder"]
@@ -1069,7 +1069,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(

@auto_docstring
class KyutaiSpeechToTextForConditionalGeneration(KyutaiSpeechToTextPreTrainedModel, GenerationMixin):
_tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
_tied_weights_keys = {"lm_head.weight": "model.embed_tokens.embed_tokens.weight"}
_tp_plan = {"lm_head": "colwise_rep"}
_pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
_keep_in_fp32_modules_strict = ["codec_model"]
6 changes: 6 additions & 0 deletions src/transformers/models/musicgen/configuration_musicgen.py
@@ -221,5 +221,11 @@ def __init__(self, text_encoder, audio_encoder, decoder, **kwargs):
def sampling_rate(self):
return self.audio_encoder.sampling_rate

# Override: default to the decoder config here, since the base get_text_config crashes for this composite config (not 100% sure of that one)
def get_text_config(self, decoder=None, encoder=None):
if decoder is None and encoder is None:
decoder = True
return super().get_text_config(decoder=decoder, encoder=encoder)


__all__ = ["MusicgenConfig", "MusicgenDecoderConfig"]
@@ -234,5 +234,11 @@ def __init__(
def sampling_rate(self):
return self.audio_encoder.sampling_rate

# Override: default to the decoder config here, since the base get_text_config crashes for this composite config (not 100% sure of that one)
def get_text_config(self, decoder=None, encoder=None):
if decoder is None and encoder is None:
decoder = True
return super().get_text_config(decoder=decoder, encoder=encoder)


__all__ = ["MusicgenMelodyConfig", "MusicgenMelodyDecoderConfig"]
3 changes: 3 additions & 0 deletions tests/models/fsmt/test_modeling_fsmt.py
@@ -125,6 +125,7 @@ def get_config(self):
eos_token_id=self.eos_token_id,
bos_token_id=self.bos_token_id,
pad_token_id=self.pad_token_id,
tie_word_embeddings=True,
)

def prepare_config_and_inputs_for_common(self):
@@ -254,6 +255,7 @@ def test_ensure_weights_are_shared(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs()

config.tie_word_embeddings = True
config.decoder.tie_word_embeddings = True
model = FSMTForConditionalGeneration(config)

# FSMT shares three weights.
@@ -270,6 +272,7 @@ def test_ensure_weights_are_shared(self):
)

config.tie_word_embeddings = False
config.decoder.tie_word_embeddings = False
model = FSMTForConditionalGeneration(config)

# FSMT shares three weights.
43 changes: 43 additions & 0 deletions tests/test_modeling_common.py
@@ -2100,6 +2100,49 @@ def test_tied_weights_keys(self):
f"Missing `_tied_weights_keys` for {model_class}: add all of {tied_params} except one.",
)

def test_tie_word_embeddings_is_authoritative(self):
original_config, _ = self.model_tester.prepare_config_and_inputs_for_common()

for model_class in self.all_model_classes:
tied_config = copy.deepcopy(original_config)
tied_config.get_text_config().tie_word_embeddings = True

untied_config = copy.deepcopy(original_config)
untied_config.get_text_config().tie_word_embeddings = False

model_tied = model_class(tied_config)
model_untied = model_class(untied_config)

if not hasattr(model_tied, "_tied_weights_keys") or not model_tied._tied_weights_keys:
continue
Comment on lines +2116 to +2117
Contributor Author (@molbap, Nov 28, 2025):
Note from huddle: we should recurse through all submodels here to get the tied keys (see the sketch at the end of this diff).


tied_keys = model_tied._tied_weights_keys
state_dict_tied = model_tied.state_dict()
state_dict_untied = model_untied.state_dict()

for target_key, source_key in tied_keys.items():
if target_key not in state_dict_tied or source_key not in state_dict_tied:
continue
if target_key not in state_dict_untied or source_key not in state_dict_untied:
continue

target_tied_ptr = id_tensor_storage(state_dict_tied[target_key])
source_tied_ptr = id_tensor_storage(state_dict_tied[source_key])
target_untied_ptr = id_tensor_storage(state_dict_untied[target_key])
source_untied_ptr = id_tensor_storage(state_dict_untied[source_key])

self.assertEqual(
target_tied_ptr,
source_tied_ptr,
f"{model_class}: With tie_word_embeddings=True, '{target_key}' should share storage with '{source_key}'",
)
self.assertNotEqual(
target_untied_ptr,
source_untied_ptr,
f"{model_class}: With tie_word_embeddings=False, '{target_key}' should NOT share storage with '{source_key}'. "
f"Config tie_word_embeddings must be authoritative over class-level _tied_weights_keys.",
)

def test_model_weights_reload_no_missing_tied_weights(self):
for model_class in self.all_model_classes:
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
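Following up on the huddle note in the new test ("should recurse through all submodels here to get the tied keys"), a minimal sketch of what that recursion could look like (a hypothetical helper, assuming every submodule's _tied_weights_keys maps target names to source names relative to that submodule):

```python
import torch.nn as nn


def collect_tied_weights_keys(model: nn.Module) -> dict:
    """Walk all submodules and gather their _tied_weights_keys, re-prefixed to the root."""
    tied = {}
    for prefix, module in model.named_modules():
        mapping = getattr(module, "_tied_weights_keys", None) or {}
        for target, source in mapping.items():
            tied[f"{prefix}.{target}" if prefix else target] = (
                f"{prefix}.{source}" if prefix else source
            )
    return tied
```

In the test above, such a helper would replace the direct read of model_tied._tied_weights_keys, so tied keys declared by nested submodels are exercised as well.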