From 1a6bf6626add0de1fca9c067d66393629013a9c1 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Wed, 18 Jun 2025 20:12:29 +0000
Subject: [PATCH 1/8] AWQ CohereForCausalLM mappings

Signed-off-by: Brian Dellabetta
---
 src/llmcompressor/modifiers/awq/mappings.py | 24 +++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/src/llmcompressor/modifiers/awq/mappings.py b/src/llmcompressor/modifiers/awq/mappings.py
index 6390445c8..41a4d8b5a 100644
--- a/src/llmcompressor/modifiers/awq/mappings.py
+++ b/src/llmcompressor/modifiers/awq/mappings.py
@@ -74,7 +74,31 @@ class AWQMapping:
     ),
 ]
 
+
+# The Cohere architecture is similar to the default, with one fundamental
+# difference: the MLP block is executed in parallel to attention. The tensor
+# goes through input_layernorm and from there directly into both the
+# attention module and the MLP module.
+_cohere_mappings = [
+    AWQMapping(
+        "re:.*input_layernorm$",
+        [
+            "re:.*self_attn.q_proj$",
+            "re:.*self_attn.k_proj$",
+            "re:.*self_attn.v_proj$",
+            "re:.*mlp.gate_proj$",
+            "re:.*mlp.up_proj$",
+        ],
+    ),
+    AWQMapping("re:.*v_proj$", ["re:.*o_proj$"]),
+    AWQMapping(
+        "re:.*up_proj$",
+        ["re:.*down_proj$"],
+    ),
+]
+
 AWQ_MAPPING_REGISTRY: Dict[str, list[AWQMapping]] = {
+    "CohereForCausalLM": _cohere_mappings,
     "LlamaForCausalLM": _default_mappings,
     "MistralForCausalLM": _default_mappings,
     "Phi3ForCausalLM": _phi_mappings,

From 6af62173ddacf0d301da07fec475ef6a11984699 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Mon, 23 Jun 2025 14:48:01 +0000
Subject: [PATCH 2/8] add Cohere2ForCausalLM

Signed-off-by: Brian Dellabetta
---
 src/llmcompressor/modifiers/awq/mappings.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llmcompressor/modifiers/awq/mappings.py b/src/llmcompressor/modifiers/awq/mappings.py
index 41a4d8b5a..b7afffd3f 100644
--- a/src/llmcompressor/modifiers/awq/mappings.py
+++ b/src/llmcompressor/modifiers/awq/mappings.py
@@ -99,6 +99,7 @@ class AWQMapping:
 
 AWQ_MAPPING_REGISTRY: Dict[str, list[AWQMapping]] = {
     "CohereForCausalLM": _cohere_mappings,
+    "Cohere2ForCausalLM": _cohere_mappings,
     "LlamaForCausalLM": _default_mappings,
     "MistralForCausalLM": _default_mappings,
     "Phi3ForCausalLM": _phi_mappings,
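The registry is keyed by the model's architecture string (config.architectures[0]), so Command-R-style checkpoints resolve _cohere_mappings without any user-supplied mapping configuration. A minimal sketch of a recipe that exercises these entries; the AWQModifier arguments follow llm-compressor's AWQ examples, and the scheme/ignore values are illustrative:

    # Sketch: AWQ on a Cohere-architecture model. The modifier resolves its
    # smooth/balance mappings from AWQ_MAPPING_REGISTRY by architecture name,
    # so no explicit mappings= argument is needed.
    from llmcompressor.modifiers.awq import AWQModifier

    recipe = AWQModifier(
        targets=["Linear"],   # quantize Linear modules
        scheme="W4A16_ASYM",  # illustrative 4-bit weight-only scheme
        ignore=["lm_head"],   # keep the output head unquantized
    )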
From 94eac30207e2bee95ffdf6a9e9be9ad313711dc5 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Mon, 23 Jun 2025 19:28:23 +0000
Subject: [PATCH 3/8] gemma2/gemma3 mappings

Signed-off-by: Brian Dellabetta
---
 src/llmcompressor/modifiers/awq/mappings.py | 22 +++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/src/llmcompressor/modifiers/awq/mappings.py b/src/llmcompressor/modifiers/awq/mappings.py
index b7afffd3f..1fae42e0d 100644
--- a/src/llmcompressor/modifiers/awq/mappings.py
+++ b/src/llmcompressor/modifiers/awq/mappings.py
@@ -74,6 +74,25 @@ class AWQMapping:
     ),
 ]
 
+# Gemma includes a pre_feedforward_layernorm between
+# post_attention_layernorm and the MLP gate/up proj layers;
+# use it instead of post_attention_layernorm in the 3rd mapping:
+_gemma_mappings = [
+    AWQMapping(
+        "re:.*input_layernorm$",
+        ["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"],
+    ),
+    AWQMapping("re:.*v_proj$", ["re:.*o_proj$"]),
+    AWQMapping(
+        "re:.*pre_feedforward_layernorm$",
+        ["re:.*gate_proj$", "re:.*up_proj$"],
+    ),
+    AWQMapping(
+        "re:.*up_proj$",
+        ["re:.*down_proj$"],
+    ),
+]
+
 
 # The Cohere architecture is similar to the default, with one fundamental
 # difference: the MLP block is executed in parallel to attention. The tensor
@@ -100,6 +119,9 @@ class AWQMapping:
 AWQ_MAPPING_REGISTRY: Dict[str, list[AWQMapping]] = {
     "CohereForCausalLM": _cohere_mappings,
     "Cohere2ForCausalLM": _cohere_mappings,
+    "Gemma2ForCausalLM": _gemma_mappings,
+    "Gemma3ForCausalLM": _gemma_mappings,
+    "Gemma3ForConditionalGeneration": _gemma_mappings,
     "LlamaForCausalLM": _default_mappings,
     "MistralForCausalLM": _default_mappings,
     "Phi3ForCausalLM": _phi_mappings,

From e8739ea34979fe5f66d78acbec09379f1a72d57b Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Wed, 25 Jun 2025 17:36:22 +0000
Subject: [PATCH 4/8] Mistral3ForConditionalGeneration mapping

Signed-off-by: Brian Dellabetta
---
 src/llmcompressor/modifiers/awq/mappings.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/llmcompressor/modifiers/awq/mappings.py b/src/llmcompressor/modifiers/awq/mappings.py
index 1fae42e0d..693406ec3 100644
--- a/src/llmcompressor/modifiers/awq/mappings.py
+++ b/src/llmcompressor/modifiers/awq/mappings.py
@@ -123,6 +123,7 @@ class AWQMapping:
     "Gemma3ForCausalLM": _gemma_mappings,
     "Gemma3ForConditionalGeneration": _gemma_mappings,
     "LlamaForCausalLM": _default_mappings,
+    "Mistral3ForConditionalGeneration": _default_mappings,
     "MistralForCausalLM": _default_mappings,
     "Phi3ForCausalLM": _phi_mappings,
     "Phi3VForCausalLM": _phi_mappings,

From 76a464d738810922980d639a7bfb9c427eb596d7 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Wed, 25 Jun 2025 21:19:08 +0000
Subject: [PATCH 5/8] exclude observer modules

Signed-off-by: Brian Dellabetta
---
 src/llmcompressor/modifiers/awq/base.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py
index 51e8cf8b9..9e0cc20f0 100644
--- a/src/llmcompressor/modifiers/awq/base.py
+++ b/src/llmcompressor/modifiers/awq/base.py
@@ -332,6 +332,9 @@ def _set_resolved_mappings(self, model: Module) -> None:
                 balance_regex,
                 smooth_parent,
             ).items():
+                if balance_suffix.endswith("observer"):
+                    continue
+
                 balance_name = f"{smooth_parent_name}.{balance_suffix}"
 
                 # exclude v_proj->o_proj mappings whose shapes are incompatible
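The mapping sets added above can be sanity-checked straight from the registry. A hypothetical check, assuming AWQMapping exposes smooth_layer and balance_layers fields (base.py's use of mapping.smooth_layer suggests the former; the latter is an assumption):

    from llmcompressor.modifiers.awq.mappings import AWQ_MAPPING_REGISTRY

    # Gemma's third mapping smooths from pre_feedforward_layernorm, not
    # post_attention_layernorm:
    gemma = AWQ_MAPPING_REGISTRY["Gemma2ForCausalLM"]
    assert gemma[2].smooth_layer == "re:.*pre_feedforward_layernorm$"

    # Cohere balances q/k/v and gate/up against input_layernorm in a single
    # mapping, since attention and MLP run in parallel:
    cohere = AWQ_MAPPING_REGISTRY["CohereForCausalLM"]
    assert "re:.*mlp.gate_proj$" in cohere[0].balance_layers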
exclude_internal_modules=True, ).items(): - if balance_suffix.endswith("observer"): - continue - balance_name = f"{smooth_parent_name}.{balance_suffix}" # exclude v_proj->o_proj mappings whose shapes are incompatible diff --git a/src/llmcompressor/utils/pytorch/module.py b/src/llmcompressor/utils/pytorch/module.py index 835493fa3..e0e37d78e 100644 --- a/src/llmcompressor/utils/pytorch/module.py +++ b/src/llmcompressor/utils/pytorch/module.py @@ -9,7 +9,6 @@ import torch from compressed_tensors.quantization.utils import is_module_quantized -from packaging import version from torch.nn import Linear, Module, Parameter from torch.nn.modules.conv import _ConvNd from transformers import PreTrainedModel @@ -64,10 +63,6 @@ "get_layer_by_name", ] - -_PARSED_TORCH_VERSION = version.parse(torch.__version__) - - ALL_TARGET = "__ALL__" ALL_PRUNABLE_TARGET = "__ALL_PRUNABLE__" ALL_QUANTIZABLE_TARGET = "__ALL_QUANTIZABLE__" @@ -164,8 +159,47 @@ def match_layers_params( return resolved -def get_layers(targets: Union[str, List[str]], module: Module) -> Dict[str, Module]: - return match_layers_params(targets, module) +def is_internal_module(name: str) -> bool: + """ + llm-compressor adds additional modules to a model, like observers + and transforms, as part of its operation. + Return whether module is internally instantiated by llm-compressor, + based on its name. + + :param name: name of module + :return: True if name indicates a module instantiated + """ + return name.endswith(("_observer", "_transform", "perm")) + + +def get_layers( + targets: Union[str, List[str]], + module: Module, + exclude_internal_modules: bool = False, +) -> Dict[str, Module]: + """ + Get layers (also known as submodules) of module based on targets + + :param targets: names or regexes to search for + Can be regex, e.g. "re:.*input_layernorm$" to find all layers + in module whose names end in string "input_layernorm" + :param module: Parent module in which to search for targets + :param exclude_internal_modules: If True, don't include internal + modules added by llm-compressor, e.g. Observers and Transforms. 
From f366925977a995fd91b991f842d555bdbfad5ba7 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta
Date: Thu, 26 Jun 2025 18:48:44 +0000
Subject: [PATCH 7/8] switch is_internal_module check to use instance rather
 than name

Signed-off-by: Brian Dellabetta
---
 src/llmcompressor/utils/pytorch/module.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/llmcompressor/utils/pytorch/module.py b/src/llmcompressor/utils/pytorch/module.py
index e0e37d78e..d013092b3 100644
--- a/src/llmcompressor/utils/pytorch/module.py
+++ b/src/llmcompressor/utils/pytorch/module.py
@@ -8,11 +8,13 @@
 from typing import Dict, List, Optional, Tuple, Union
 
 import torch
+from compressed_tensors.transform import TransformBase
 from compressed_tensors.quantization.utils import is_module_quantized
 from torch.nn import Linear, Module, Parameter
 from torch.nn.modules.conv import _ConvNd
 from transformers import PreTrainedModel
 
+from llmcompressor.observers import Observer
 from llmcompressor.core import ModelParameterizedLayer
 from llmcompressor.utils.fsdp.context import (
     fix_fsdp_module_name,
@@ -159,17 +161,16 @@ def match_layers_params(
     return resolved
 
 
-def is_internal_module(name: str) -> bool:
+def is_internal_module(module: Module) -> bool:
     """
     llm-compressor adds additional modules to a model, like observers
-    and transforms, as part of its operation.
-    Return whether a module is internally instantiated by llm-compressor,
-    based on its name.
+    and transforms, as part of its normal operation.
 
-    :param name: name of module
-    :return: True if the name indicates an internally instantiated module
+    :param module: module to check
+    :return: True if the module was internally instantiated by
+        llm-compressor, otherwise False
     """
-    return name.endswith(("_observer", "_transform", "perm"))
+    return isinstance(module, (TransformBase, Observer))
 
 
 def get_layers(
@@ -188,15 +189,15 @@ def get_layers(
         modules added by llm-compressor, e.g. Observers and Transforms.
Defaults to False to maintain backward compatibility - :return: dict of layer name -> layer module of all layers in module + :return: dict of {layer name -> module} of all layers in module that match targets """ layer_dict = match_layers_params(targets, module) if exclude_internal_modules: layer_dict = { - layer_name: layer - for layer_name, layer in layer_dict.items() - if not is_internal_module(layer_name) + name: layer + for name, layer in layer_dict.items() + if not is_internal_module(layer) } return layer_dict From 808b69c943bce3a33089f6f22d6c9eb1b4cb3e61 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Thu, 26 Jun 2025 18:51:34 +0000 Subject: [PATCH 8/8] stylefixes Signed-off-by: Brian Dellabetta --- src/llmcompressor/utils/pytorch/module.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/utils/pytorch/module.py b/src/llmcompressor/utils/pytorch/module.py index d013092b3..c923af413 100644 --- a/src/llmcompressor/utils/pytorch/module.py +++ b/src/llmcompressor/utils/pytorch/module.py @@ -8,14 +8,14 @@ from typing import Dict, List, Optional, Tuple, Union import torch -from compressed_tensors.transform import TransformBase from compressed_tensors.quantization.utils import is_module_quantized +from compressed_tensors.transform import TransformBase from torch.nn import Linear, Module, Parameter from torch.nn.modules.conv import _ConvNd from transformers import PreTrainedModel -from llmcompressor.observers import Observer from llmcompressor.core import ModelParameterizedLayer +from llmcompressor.observers import Observer from llmcompressor.utils.fsdp.context import ( fix_fsdp_module_name, summon_full_params_context,