From 1a6bf6626add0de1fca9c067d66393629013a9c1 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Wed, 18 Jun 2025 20:12:29 +0000
Subject: [PATCH 1/8] AWQ CohereForCausalLM mappings

Signed-off-by: Brian Dellabetta
---
 src/llmcompressor/modifiers/awq/mappings.py | 24 +++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/src/llmcompressor/modifiers/awq/mappings.py b/src/llmcompressor/modifiers/awq/mappings.py
index 6390445c8..41a4d8b5a 100644
--- a/src/llmcompressor/modifiers/awq/mappings.py
+++ b/src/llmcompressor/modifiers/awq/mappings.py
@@ -74,7 +74,31 @@ class AWQMapping:
     ),
 ]
 
+
+# The Cohere architecture is similar to the default, with one fundamental
+# difference: the MLP block is executed in parallel to attention. The tensor
+# goes through input_layernorm and from there directly into both the
+# attention module and the MLP module.
+_cohere_mappings = [
+    AWQMapping(
+        "re:.*input_layernorm$",
+        [
+            "re:.*self_attn.q_proj$",
+            "re:.*self_attn.k_proj$",
+            "re:.*self_attn.v_proj$",
+            "re:.*mlp.gate_proj$",
+            "re:.*mlp.up_proj$",
+        ],
+    ),
+    AWQMapping("re:.*v_proj$", ["re:.*o_proj$"]),
+    AWQMapping(
+        "re:.*up_proj$",
+        ["re:.*down_proj$"],
+    ),
+]
+
 AWQ_MAPPING_REGISTRY: Dict[str, list[AWQMapping]] = {
+    "CohereForCausalLM": _cohere_mappings,
     "LlamaForCausalLM": _default_mappings,
     "MistralForCausalLM": _default_mappings,
     "Phi3ForCausalLM": _phi_mappings,

From 6af62173ddacf0d301da07fec475ef6a11984699 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Mon, 23 Jun 2025 14:48:01 +0000
Subject: [PATCH 2/8] add Cohere2ForCausalLM

Signed-off-by: Brian Dellabetta
---
 src/llmcompressor/modifiers/awq/mappings.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llmcompressor/modifiers/awq/mappings.py b/src/llmcompressor/modifiers/awq/mappings.py
index 41a4d8b5a..b7afffd3f 100644
--- a/src/llmcompressor/modifiers/awq/mappings.py
+++ b/src/llmcompressor/modifiers/awq/mappings.py
@@ -99,6 +99,7 @@ class AWQMapping:
 
 AWQ_MAPPING_REGISTRY: Dict[str, list[AWQMapping]] = {
     "CohereForCausalLM": _cohere_mappings,
+    "Cohere2ForCausalLM": _cohere_mappings,
     "LlamaForCausalLM": _default_mappings,
     "MistralForCausalLM": _default_mappings,
     "Phi3ForCausalLM": _phi_mappings,
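The registry is keyed by the model's architecture string (config.architectures[0]), so Command-R-style checkpoints resolve _cohere_mappings without any user-supplied mapping configuration. A minimal sketch of a recipe that exercises these entries; the AWQModifier arguments follow llm-compressor's AWQ examples, and the scheme/ignore values are illustrative:

    # Sketch: AWQ on a Cohere-architecture model. The modifier resolves its
    # smooth/balance mappings from AWQ_MAPPING_REGISTRY by architecture name,
    # so no explicit mappings= argument is needed.
    from llmcompressor.modifiers.awq import AWQModifier

    recipe = AWQModifier(
        targets=["Linear"],   # quantize Linear modules
        scheme="W4A16_ASYM",  # illustrative 4-bit weight-only scheme
        ignore=["lm_head"],   # keep the output head unquantized
    )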
From 94eac30207e2bee95ffdf6a9e9be9ad313711dc5 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Mon, 23 Jun 2025 19:28:23 +0000
Subject: [PATCH 3/8] gemma2/gemma3 mappings

Signed-off-by: Brian Dellabetta
---
 src/llmcompressor/modifiers/awq/mappings.py | 22 +++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/src/llmcompressor/modifiers/awq/mappings.py b/src/llmcompressor/modifiers/awq/mappings.py
index b7afffd3f..1fae42e0d 100644
--- a/src/llmcompressor/modifiers/awq/mappings.py
+++ b/src/llmcompressor/modifiers/awq/mappings.py
@@ -74,6 +74,25 @@ class AWQMapping:
     ),
 ]
 
+# Gemma includes a pre_feedforward_layernorm between
+# post_attention_layernorm and the MLP gate/up proj layers;
+# use it instead of post_attention_layernorm in the 3rd mapping:
+_gemma_mappings = [
+    AWQMapping(
+        "re:.*input_layernorm$",
+        ["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"],
+    ),
+    AWQMapping("re:.*v_proj$", ["re:.*o_proj$"]),
+    AWQMapping(
+        "re:.*pre_feedforward_layernorm$",
+        ["re:.*gate_proj$", "re:.*up_proj$"],
+    ),
+    AWQMapping(
+        "re:.*up_proj$",
+        ["re:.*down_proj$"],
+    ),
+]
+
 
 # The Cohere architecture is similar to the default, with one fundamental
 # difference: the MLP block is executed in parallel to attention. The tensor
@@ -100,6 +119,9 @@ class AWQMapping:
 AWQ_MAPPING_REGISTRY: Dict[str, list[AWQMapping]] = {
     "CohereForCausalLM": _cohere_mappings,
     "Cohere2ForCausalLM": _cohere_mappings,
+    "Gemma2ForCausalLM": _gemma_mappings,
+    "Gemma3ForCausalLM": _gemma_mappings,
+    "Gemma3ForConditionalGeneration": _gemma_mappings,
     "LlamaForCausalLM": _default_mappings,
     "MistralForCausalLM": _default_mappings,
     "Phi3ForCausalLM": _phi_mappings,

From e8739ea34979fe5f66d78acbec09379f1a72d57b Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Wed, 25 Jun 2025 17:36:22 +0000
Subject: [PATCH 4/8] Mistral3ForConditionalGeneration mapping

Signed-off-by: Brian Dellabetta
---
 src/llmcompressor/modifiers/awq/mappings.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llmcompressor/modifiers/awq/mappings.py b/src/llmcompressor/modifiers/awq/mappings.py
index 1fae42e0d..693406ec3 100644
--- a/src/llmcompressor/modifiers/awq/mappings.py
+++ b/src/llmcompressor/modifiers/awq/mappings.py
@@ -123,6 +123,7 @@ class AWQMapping:
     "Gemma3ForCausalLM": _gemma_mappings,
     "Gemma3ForConditionalGeneration": _gemma_mappings,
     "LlamaForCausalLM": _default_mappings,
+    "Mistral3ForConditionalGeneration": _default_mappings,
     "MistralForCausalLM": _default_mappings,
     "Phi3ForCausalLM": _phi_mappings,
     "Phi3VForCausalLM": _phi_mappings,

From 76a464d738810922980d639a7bfb9c427eb596d7 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Wed, 25 Jun 2025 21:19:08 +0000
Subject: [PATCH 5/8] exclude observer modules

Signed-off-by: Brian Dellabetta
---
 src/llmcompressor/modifiers/awq/base.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py
index 51e8cf8b9..9e0cc20f0 100644
--- a/src/llmcompressor/modifiers/awq/base.py
+++ b/src/llmcompressor/modifiers/awq/base.py
@@ -332,6 +332,9 @@ def _set_resolved_mappings(self, model: Module) -> None:
                 balance_regex,
                 smooth_parent,
             ).items():
+                if balance_suffix.endswith("observer"):
+                    continue
+
                 balance_name = f"{smooth_parent_name}.{balance_suffix}"
 
                 # exclude v_proj->o_proj mappings whose shapes are incompatible
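The mapping sets added above can be sanity-checked straight from the registry. A hypothetical check, assuming AWQMapping exposes smooth_layer and balance_layers fields (base.py's use of mapping.smooth_layer suggests the former; the latter is an assumption):

    from llmcompressor.modifiers.awq.mappings import AWQ_MAPPING_REGISTRY

    # Gemma's third mapping smooths from pre_feedforward_layernorm, not
    # post_attention_layernorm:
    gemma = AWQ_MAPPING_REGISTRY["Gemma2ForCausalLM"]
    assert gemma[2].smooth_layer == "re:.*pre_feedforward_layernorm$"

    # Cohere balances q/k/v and gate/up against input_layernorm in a single
    # mapping, since attention and MLP run in parallel:
    cohere = AWQ_MAPPING_REGISTRY["CohereForCausalLM"]
    assert "re:.*mlp.gate_proj$" in cohere[0].balance_layers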
exclude_internal_modules=True, ).items(): - if balance_suffix.endswith("observer"): - continue - balance_name = f"{smooth_parent_name}.{balance_suffix}" # exclude v_proj->o_proj mappings whose shapes are incompatible diff --git a/src/llmcompressor/utils/pytorch/module.py b/src/llmcompressor/utils/pytorch/module.py index 835493fa3..e0e37d78e 100644 --- a/src/llmcompressor/utils/pytorch/module.py +++ b/src/llmcompressor/utils/pytorch/module.py @@ -9,7 +9,6 @@ import torch from compressed_tensors.quantization.utils import is_module_quantized -from packaging import version from torch.nn import Linear, Module, Parameter from torch.nn.modules.conv import _ConvNd from transformers import PreTrainedModel @@ -64,10 +63,6 @@ "get_layer_by_name", ] - -_PARSED_TORCH_VERSION = version.parse(torch.__version__) - - ALL_TARGET = "__ALL__" ALL_PRUNABLE_TARGET = "__ALL_PRUNABLE__" ALL_QUANTIZABLE_TARGET = "__ALL_QUANTIZABLE__" @@ -164,8 +159,47 @@ def match_layers_params( return resolved -def get_layers(targets: Union[str, List[str]], module: Module) -> Dict[str, Module]: - return match_layers_params(targets, module) +def is_internal_module(name: str) -> bool: + """ + llm-compressor adds additional modules to a model, like observers + and transforms, as part of its operation. + Return whether module is internally instantiated by llm-compressor, + based on its name. + + :param name: name of module + :return: True if name indicates a module instantiated + """ + return name.endswith(("_observer", "_transform", "perm")) + + +def get_layers( + targets: Union[str, List[str]], + module: Module, + exclude_internal_modules: bool = False, +) -> Dict[str, Module]: + """ + Get layers (also known as submodules) of module based on targets + + :param targets: names or regexes to search for + Can be regex, e.g. "re:.*input_layernorm$" to find all layers + in module whose names end in string "input_layernorm" + :param module: Parent module in which to search for targets + :param exclude_internal_modules: If True, don't include internal + modules added by llm-compressor, e.g. Observers and Transforms. 
From f366925977a995fd91b991f842d555bdbfad5ba7 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Thu, 26 Jun 2025 18:48:44 +0000
Subject: [PATCH 7/8] switch is_internal_module check to use instance rather
 than name

Signed-off-by: Brian Dellabetta
---
 src/llmcompressor/utils/pytorch/module.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/llmcompressor/utils/pytorch/module.py b/src/llmcompressor/utils/pytorch/module.py
index e0e37d78e..d013092b3 100644
--- a/src/llmcompressor/utils/pytorch/module.py
+++ b/src/llmcompressor/utils/pytorch/module.py
@@ -8,11 +8,13 @@
 from typing import Dict, List, Optional, Tuple, Union
 
 import torch
+from compressed_tensors.transform import TransformBase
 from compressed_tensors.quantization.utils import is_module_quantized
 from torch.nn import Linear, Module, Parameter
 from torch.nn.modules.conv import _ConvNd
 from transformers import PreTrainedModel
 
+from llmcompressor.observers import Observer
 from llmcompressor.core import ModelParameterizedLayer
 from llmcompressor.utils.fsdp.context import (
     fix_fsdp_module_name,
@@ -159,17 +161,16 @@ def match_layers_params(
     return resolved
 
 
-def is_internal_module(name: str) -> bool:
+def is_internal_module(module: Module) -> bool:
     """
     llm-compressor adds additional modules to a model, like observers
-    and transforms, as part of its operation.
-    Return whether a module is internally instantiated by llm-compressor,
-    based on its name.
+    and transforms, as part of its normal operation.
 
-    :param name: name of module
-    :return: True if the name indicates an internally instantiated module
+    :param module: module to check
+    :return: True if the module was internally instantiated by
+        llm-compressor, otherwise False
     """
-    return name.endswith(("_observer", "_transform", "perm"))
+    return isinstance(module, (TransformBase, Observer))
 
 
 def get_layers(
@@ -188,15 +189,15 @@ def get_layers(
         modules added by llm-compressor, e.g. Observers and Transforms.
Defaults to False to maintain backward compatibility - :return: dict of layer name -> layer module of all layers in module + :return: dict of {layer name -> module} of all layers in module that match targets """ layer_dict = match_layers_params(targets, module) if exclude_internal_modules: layer_dict = { - layer_name: layer - for layer_name, layer in layer_dict.items() - if not is_internal_module(layer_name) + name: layer + for name, layer in layer_dict.items() + if not is_internal_module(layer) } return layer_dict From 808b69c943bce3a33089f6f22d6c9eb1b4cb3e61 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Thu, 26 Jun 2025 18:51:34 +0000 Subject: [PATCH 8/8] stylefixes Signed-off-by: Brian Dellabetta --- src/llmcompressor/utils/pytorch/module.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/utils/pytorch/module.py b/src/llmcompressor/utils/pytorch/module.py index d013092b3..c923af413 100644 --- a/src/llmcompressor/utils/pytorch/module.py +++ b/src/llmcompressor/utils/pytorch/module.py @@ -8,14 +8,14 @@ from typing import Dict, List, Optional, Tuple, Union import torch -from compressed_tensors.transform import TransformBase from compressed_tensors.quantization.utils import is_module_quantized +from compressed_tensors.transform import TransformBase from torch.nn import Linear, Module, Parameter from torch.nn.modules.conv import _ConvNd from transformers import PreTrainedModel -from llmcompressor.observers import Observer from llmcompressor.core import ModelParameterizedLayer +from llmcompressor.observers import Observer from llmcompressor.utils.fsdp.context import ( fix_fsdp_module_name, summon_full_params_context,