
Commit ec3f555

Cyrilvallez and 3outeille authored and committed
[loading/saving] Reverse all loading operations when saving (#42396)
* first shot
* default to reversing
* oupso
* oupsi 2
* oupsi 3
* fix renamed kwargs
* fix timm_wrapper
* remove fix_state_dict methods
* can do it all the time, with __init__ as well
* doc
* oupsi
* fix
* create helper
* fix annoying annotation issue
* small fix
* small fixes
* alright commit all that already
* oupsi
* the fix
* update quantizers
* this works
* the hardcoded regex got me hard....
* style
* the final one
* cleanup a bit
* better
* style
* oupsi readded it
* do it inside the ops instead - no need for full names anymore
* reverse quantizers and simplify signatures
* small thingy
* add no_grad decorator
* utils to rename keys
* oupssii again
* add test
* simplify nicely
1 parent 14b7ac0 commit ec3f555
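
In short, the change makes saving replay each load-time weight conversion in the opposite direction, so a checkpoint round-trips back to its original key names. A minimal standalone sketch of that idea (illustrative names only, not the transformers implementation):

```python
# Toy sketch: renamings applied while loading are replayed in reverse when
# saving. The helper and tables below are illustrative, not the real API.
LOAD_RENAMES = {
    "LayerNorm.gamma": "LayerNorm.weight",
    "LayerNorm.beta": "LayerNorm.bias",
}
SAVE_RENAMES = {v: k for k, v in LOAD_RENAMES.items()}  # reversed direction

def rename_key(key: str, table: dict[str, str]) -> str:
    # Replace any known suffix; leave other keys untouched.
    for src, tgt in table.items():
        if key.endswith(src):
            return key[: -len(src)] + tgt
    return key

loaded = rename_key("encoder.LayerNorm.gamma", LOAD_RENAMES)  # -> ...LayerNorm.weight
saved = rename_key(loaded, SAVE_RENAMES)                      # -> ...LayerNorm.gamma
assert saved == "encoder.LayerNorm.gamma"
```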

File tree

10 files changed: +643, -316 lines

src/transformers/conversion_mapping.py: 105 additions, 22 deletions
```diff
@@ -13,7 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 from copy import deepcopy
+from typing import TYPE_CHECKING
 
 from .core_model_loading import Concatenate, MergeModulelist, WeightConverter, WeightRenaming
 from .utils import is_torch_available
```
```diff
@@ -23,16 +26,21 @@
     import torch
 
 
+if TYPE_CHECKING:
+    from .modeling_utils import PreTrainedModel
+    from .quantizers import HfQuantizer
+
+
 def _build_checkpoint_conversion_mapping():
     mapping = {
         "mixtral": [
             WeightRenaming(".block_sparse_moe.gate", ".mlp.gate"),
             WeightConverter(
-                source_keys=[
+                source_patterns=[
                     "block_sparse_moe.experts.*.w1.weight",
                     "block_sparse_moe.experts.*.w3.weight",
                 ],  # you give me a list of 2 keys, I collect a list of a list of tensors
-                target_keys="mlp.experts.gate_up_proj",  # target key gets the list of two tensors
+                target_patterns="mlp.experts.gate_up_proj",  # target key gets the list of two tensors
                 operations=[
                     MergeModulelist(
                         dim=0
```
```diff
@@ -41,10 +49,10 @@ def _build_checkpoint_conversion_mapping():
                 ],  # we want the loading to add this shard operation here. Though we can't shard after concats and merge, needs to be first
             ),
             WeightConverter(
-                source_keys=[
+                source_patterns=[
                     "block_sparse_moe.experts.*.w2.weight",
                 ],
-                target_keys="mlp.experts.down_proj",  # target key gets the list of two tensors
+                target_patterns="mlp.experts.down_proj",  # target key gets the list of two tensors
                 operations=[
                     MergeModulelist(
                         dim=0
```
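
For intuition, this is roughly the tensor math the Mixtral mapping performs at load time, and the inverse that saving now has to apply: MergeModulelist(dim=0) stacks the per-expert weights, then Concatenate(dim=1) fuses w1/w3 into one gate_up_proj tensor. A sketch with plain torch ops; the shapes and the chunk-based inversion are my own illustration, not code from the commit:

```python
import torch

num_experts, hidden, inter = 4, 8, 16
w1 = [torch.randn(inter, hidden) for _ in range(num_experts)]  # per-expert w1 (gate)
w3 = [torch.randn(inter, hidden) for _ in range(num_experts)]  # per-expert w3 (up)

# Load direction: MergeModulelist(dim=0) stacks each expert list...
stacked_w1 = torch.stack(w1, dim=0)  # (num_experts, inter, hidden)
stacked_w3 = torch.stack(w3, dim=0)
# ...then Concatenate(dim=1) fuses them into one gate_up_proj tensor.
gate_up = torch.cat([stacked_w1, stacked_w3], dim=1)  # (num_experts, 2 * inter, hidden)

# Save direction (what this PR adds): run the inverse ops in reverse order.
back_w1, back_w3 = gate_up.chunk(2, dim=1)
assert torch.equal(back_w1, stacked_w1) and torch.equal(back_w3, stacked_w3)
```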
```diff
@@ -54,50 +62,58 @@ def _build_checkpoint_conversion_mapping():
         ],
         "qwen2_moe": [
             WeightConverter(
-                source_keys=[
+                source_patterns=[
                     "mlp.experts.*.gate_proj.weight",
                     "mlp.experts.*.up_proj.weight",
                 ],
-                target_keys="mlp.experts.gate_up_proj",
+                target_patterns="mlp.experts.gate_up_proj",
                 operations=[MergeModulelist(dim=0), Concatenate(dim=1)],
             ),
             WeightConverter(
-                source_keys=["mlp.experts.*.down_proj.weight"],
-                target_keys="mlp.experts.down_proj",
+                source_patterns=["mlp.experts.*.down_proj.weight"],
+                target_patterns="mlp.experts.down_proj",
                 operations=[MergeModulelist(dim=0)],
             ),
         ],
+        "timm_wrapper": [
+            # Simply add the prefix `timm_model`
+            # TODO: would probably be much cleaner with an `add_prefix` argument in WeightRenaming
+            WeightRenaming(
+                source_patterns=r"(.+)",
+                target_patterns=r"timm_model.\1",
+            )
+        ],
         "legacy": [
             WeightRenaming(
-                source_keys="LayerNorm.gamma",
-                target_keys="LayerNorm.weight",
+                source_patterns="LayerNorm.gamma",
+                target_patterns="LayerNorm.weight",
             ),
             WeightRenaming(
-                source_keys="LayerNorm.beta",
-                target_keys="LayerNorm.bias",
+                source_patterns="LayerNorm.beta",
+                target_patterns="LayerNorm.bias",
             ),
         ],
     }
     if hasattr(torch.nn.utils.parametrizations, "weight_norm"):
         mapping["legacy"] += [
             WeightRenaming(
-                source_keys="weight_g",
-                target_keys="parametrizations.weight.original0",
+                source_patterns="weight_g",
+                target_patterns="parametrizations.weight.original0",
             ),
             WeightRenaming(
-                source_keys="weight_v",
-                target_keys="parametrizations.weight.original1",
+                source_patterns="weight_v",
+                target_patterns="parametrizations.weight.original1",
             ),
         ]
     else:
         mapping["legacy"] += [
             WeightRenaming(
-                source_keys="parametrizations.weight.original0",
-                target_keys="weight_g",
+                source_patterns="parametrizations.weight.original0",
+                target_patterns="weight_g",
             ),
             WeightRenaming(
-                source_keys="parametrizations.weight.original1",
-                target_keys="weight_v",
+                source_patterns="parametrizations.weight.original1",
+                target_patterns="weight_v",
             ),
         ]
 
```
```diff
@@ -127,5 +143,72 @@ def _build_checkpoint_conversion_mapping():
 def get_checkpoint_conversion_mapping(model_type):
     global _checkpoint_conversion_mapping_cache
     _checkpoint_conversion_mapping_cache = _build_checkpoint_conversion_mapping()
-    globals()["_checkpoint_conversion_mapping"] = _checkpoint_conversion_mapping_cache
-    return deepcopy(_checkpoint_conversion_mapping_cache.get(model_type, None))
+    return deepcopy(_checkpoint_conversion_mapping_cache.get(model_type))
+
+
+# DO NOT MODIFY, KEPT FOR BC ONLY
+VLMS = [
+    "aria",
+    "ayavision",
+    "colpali",
+    "emu3",
+    "fuyu",
+    "gotocr2",
+    "gemma3",
+    "internvl",
+    "llava",  # all llava-prefixed models fall under this check
+    "mistral3",
+    "mllama",
+    "paligemma",
+    "shieldgemma2",
+    "qwen2vl",
+    "qwen2_5_vl",
+    "videollava",
+    "vipllava",
+    "sam3_video",
+    "sam3",
+    "sam3_tracker",
+    "sam3_tracker_video",
+]
+
+
+def get_model_conversion_mapping(
+    model: PreTrainedModel,
+    key_mapping: dict[str, str] | None = None,
+    hf_quantizer: HfQuantizer | None = None,
+    add_legacy: bool = True,
+) -> list[WeightConverter | WeightRenaming]:
+    """
+    For a given `model`, obtain the weight conversion mapping, if any is registered, either from the simple
+    renamings in the `_checkpoint_conversion_mapping` class attribute or from the general WeightConverter mapping.
+    """
+    weight_conversions = []
+
+    # Load models with key mapping
+    if key_mapping is not None:
+        weight_conversions = [WeightRenaming(source_patterns=k, target_patterns=v) for k, v in key_mapping.items()]
+    elif any(
+        allowed_name in class_name.__name__.lower()
+        for class_name in model.__class__.__mro__[:-1]
+        for allowed_name in VLMS
+    ):
+        weight_conversions = [
+            WeightRenaming(source_patterns=k, target_patterns=v)
+            for k, v in model._checkpoint_conversion_mapping.items()
+        ]
+
+    # TODO: should be checked recursively on submodels!!
+    model_type = getattr(model.config, "model_type", None)
+    if model_type is not None:
+        model_specific_conversions = get_checkpoint_conversion_mapping(model_type)
+        if model_specific_conversions is not None:
+            weight_conversions.extend(model_specific_conversions)
+
+    if add_legacy:
+        weight_conversions.extend(get_checkpoint_conversion_mapping("legacy"))
+
+    # Add the ones from the quantizer as well if provided
+    if hf_quantizer is not None:
+        weight_conversions.extend(hf_quantizer.get_weight_conversions())
+
+    return weight_conversions
```
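
A hedged sketch of how the new helper might be called; the checkpoint name and the `key_mapping` value are placeholders, not taken from the commit:

```python
from transformers import AutoModelForCausalLM
from transformers.conversion_mapping import get_model_conversion_mapping

# Any model works; a Mixtral checkpoint exercises the MoE conversions above.
model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-v0.1")
conversions = get_model_conversion_mapping(
    model,
    key_mapping=None,   # or e.g. {"^backbone": "model"} to inject ad-hoc renames
    hf_quantizer=None,  # a quantizer would contribute its own conversions
    add_legacy=True,    # include the LayerNorm gamma/beta and weight-norm renames
)
for conv in conversions:
    print(type(conv).__name__)  # WeightRenaming / WeightConverter entries
```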
