From ebf9ff38d37cbfb05f2a1784d6a8c0ce0f6450bf Mon Sep 17 00:00:00 2001
From: krishnateja95
Date: Thu, 6 Nov 2025 08:32:11 -0500
Subject: [PATCH 01/13] Granite4 FP8 Block Quantization

---
 .../granite4_block_quantization_example.py | 47 +++
 src/llmcompressor/modeling/granite4.py | 280 ++++++++++++++++++
 2 files changed, 327 insertions(+)
 create mode 100644 examples/quantization_w8a8_fp8/granite4_block_quantization_example.py

diff --git a/examples/quantization_w8a8_fp8/granite4_block_quantization_example.py b/examples/quantization_w8a8_fp8/granite4_block_quantization_example.py
new file mode 100644
index 0000000000..d5e4e2382d
--- /dev/null
+++ b/examples/quantization_w8a8_fp8/granite4_block_quantization_example.py
@@ -0,0 +1,47 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.utils import dispatch_for_generation
+from llmcompressor.modeling.granite4 import replace_granite_moe_with_linear_experts, pack_3d_experts
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+MODEL_ID = "ibm-granite/granite-4.0-h-small"
+
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+model = replace_granite_moe_with_linear_experts(model)
+
+ignore_lay = ["lm_head"]
+ignore_lay += ["re:.*block_sparse_moe.router"]
+ignore_lay += ["re:.*mamba.in_proj"]
+ignore_lay += ["re:.*shared_mlp.input_linear"]
+
+recipe = QuantizationModifier(
+    targets=["Linear"],
+    scheme="FP8_BLOCK",
+    ignore=ignore_lay,
+)
+
+oneshot(model=model, recipe=recipe)
+dispatch_for_generation(model)
+
+print("========== SAMPLE GENERATION ==============")
+input_ids = tokenizer(
+    "Describe Large Language Model", return_tensors="pt"
+).input_ids.to(model.device)
+output = model.generate(input_ids, max_new_tokens=35)
+print(tokenizer.decode(output[0]))
+print("==========================================")
+
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-block"
+print(f"Saving to {SAVE_DIR}")
+
+model.save_pretrained(SAVE_DIR)
+tokenizer.save_pretrained(SAVE_DIR)
+
+pack_3d_experts(SAVE_DIR)
diff --git a/src/llmcompressor/modeling/granite4.py b/src/llmcompressor/modeling/granite4.py
index 75f03d5837..d5366c2a81 100644
--- a/src/llmcompressor/modeling/granite4.py
+++ b/src/llmcompressor/modeling/granite4.py
@@ -1,9 +1,289 @@
 import torch
+import json
+import os
+import shutil
+from pathlib import Path
+from collections import defaultdict
+from safetensors.torch import load_file, save_file
+
 from transformers.models.granitemoehybrid.modeling_granitemoehybrid import (
     GraniteMoeHybridParallelExperts,
 )
+#for fp8 block quantization
+def replace_granite_moe_with_linear(model):
+    """
+    Convert GraniteMoeHybridParallelExperts modules into individual expert layers.
+    Each expert will be stored as a separate nn.Linear module.
+ """ + + class SeparatedExperts(nn.Module): + """Replacement module with individual expert linear layers""" + def __init__(self, num_experts, input_size, output_size, original_weight): + super().__init__() + self.num_experts = num_experts + self.input_size = input_size + self.output_size = output_size + + # Create individual linear layers for each expert + self.experts = nn.ModuleList([ + nn.Linear(input_size, output_size, bias=False) + for _ in range(num_experts) + ]) + + # Copy weights from the original 3D tensor + # Original format: [num_experts, output_size, input_size] + for i in range(num_experts): + self.experts[i].weight.data = original_weight[i].clone() + + def forward(self, inputs, expert_size): + """Forward pass using individual expert layers""" + input_list = inputs.split(expert_size, dim=0) + output_list = [] + for i in range(self.num_experts): + output_list.append(self.experts[i](input_list[i])) + results = torch.cat(output_list, dim=0) + return results + + # Find and replace all GraniteMoeHybridParallelExperts modules + def replace_parallel_experts(module, name=''): + for child_name, child in module.named_children(): + full_name = f"{name}.{child_name}" if name else child_name + + if child.__class__.__name__ == 'GraniteMoeHybridParallelExperts': + # Create replacement module with separated experts + separated = SeparatedExperts( + num_experts=child.num_experts, + input_size=child.input_size, + output_size=child.output_size, + original_weight=child.weight.data + ) + # Replace the module + setattr(module, child_name, separated) + print(f"Replaced {full_name}: {child.num_experts} experts, " + f"input_size={child.input_size}, output_size={child.output_size}") + else: + # Recursively process children + replace_parallel_experts(child, full_name) + + replace_parallel_experts(model) + return model + + + +def pack_3d_experts(source_dir): + """ + Transform Granite MoE model from per-expert storage to stacked 3D tensor storage + + From: model.layers.{L}.block_sparse_moe.{linear_type}.experts.{E}.{param} + To: model.layers.{L}.block_sparse_moe.{linear_type}.{param} + + """ + source_dir = Path(source_dir) + + # Load the index file + index_file = source_dir / "model.safetensors.index.json" + with open(index_file, "r") as f: + index_data = json.load(f) + + weight_map = index_data["weight_map"] + + # Group tensors by layer, linear type, and parameter + # Structure: {(layer_num, linear_type, param): {expert_num: (tensor_name, file_name)}} + grouped_tensors = defaultdict(dict) + other_tensors = {} # Non-expert tensors (router, embeddings, etc.) + + for tensor_name, file_name in weight_map.items(): + # Check if this is an expert tensor + # Pattern: model.layers.{L}.block_sparse_moe.{linear_type}.experts.{E}.{param} + if ".block_sparse_moe." in tensor_name and ".experts." 
in tensor_name: + parts = tensor_name.split(".") + + try: + # Find the indices of key parts + layers_idx = parts.index("layers") + layer_num = int(parts[layers_idx + 1]) + + experts_idx = parts.index("experts") + expert_num = int(parts[experts_idx + 1]) + + # The linear type is right before "experts" + # e.g., "input_linear" or "output_linear" + linear_type = parts[experts_idx - 1] + + # The parameter is after expert number + # e.g., "weight" or "weight_scale" + param = ".".join(parts[experts_idx + 2:]) + + # Create grouping key + group_key = (layer_num, linear_type, param) + grouped_tensors[group_key][expert_num] = (tensor_name, file_name) + + except (ValueError, IndexError) as e: + # If parsing fails, treat as other tensor + print(f" Warning: Could not parse expert tensor: {tensor_name}") + other_tensors[tensor_name] = file_name + else: + other_tensors[tensor_name] = file_name + + # Load all safetensors files + print("Loading source safetensors files...") + loaded_tensors = {} + unique_files = set(weight_map.values()) + old_files = list(unique_files) # Store list of old files to delete later + + for file_name in unique_files: + file_path = source_dir / file_name + print(f" Loading {file_name}...") + loaded_tensors[file_name] = load_file(str(file_path)) + + # Create new tensors by stacking experts + print("\nStacking expert tensors...") + new_tensors = {} + + # Process each grouped tensor + for (layer_num, linear_type, param), experts_dict in sorted(grouped_tensors.items()): + print(f" Processing layer {layer_num}, {linear_type}.{param}...") + + # Get all expert tensors for this group + expert_nums = sorted(experts_dict.keys()) + expert_tensors = [] + + for expert_num in expert_nums: + tensor_name, file_name = experts_dict[expert_num] + tensor = loaded_tensors[file_name][tensor_name] + expert_tensors.append(tensor) + + # Stack along first dimension to create 3D tensor + stacked_tensor = torch.stack(expert_tensors, dim=0) + + # Create new tensor name (remove .experts.{E} part) + new_tensor_name = f"model.layers.{layer_num}.block_sparse_moe.{linear_type}.{param}" + new_tensors[new_tensor_name] = stacked_tensor + + print(f" {new_tensor_name}: {list(stacked_tensor.shape)} (stacked {len(expert_tensors)} experts)") + + # Copy non-expert tensors (router, embeddings, etc.) 
+ print("\nCopying non-expert tensors...") + for tensor_name, file_name in other_tensors.items(): + tensor = loaded_tensors[file_name][tensor_name] + new_tensors[tensor_name] = tensor + print(f" Copied: {tensor_name}") + + # Determine file distribution for new tensors + # Simple strategy: distribute roughly equally across same number of files + num_output_files = len(unique_files) + tensors_list = list(new_tensors.items()) + + # Calculate approximate size per file + total_numel = sum(t.numel() * t.element_size() for _, t in tensors_list) + target_size_per_file = total_numel / num_output_files + + # Distribute tensors across files + print(f"\nDistributing tensors across {num_output_files} files...") + file_tensors = [{} for _ in range(num_output_files)] + file_sizes = [0] * num_output_files + new_weight_map = {} + + for tensor_name, tensor in tensors_list: + # Find file with smallest current size + min_idx = file_sizes.index(min(file_sizes)) + file_tensors[min_idx][tensor_name] = tensor + file_sizes[min_idx] += tensor.numel() * tensor.element_size() + + # Update weight map + file_name = f"model-{min_idx+1:05d}-of-{num_output_files:05d}.safetensors" + new_weight_map[tensor_name] = file_name + + # Save new safetensors files with temporary names + print("\nSaving new safetensors files (temporary)...") + temp_files = [] + for i, tensors_dict in enumerate(file_tensors): + if tensors_dict: # Only save if not empty + file_name = f"model-{i+1:05d}-of-{num_output_files:05d}.safetensors" + temp_file_name = f"model-{i+1:05d}-of-{num_output_files:05d}.safetensors.tmp" + output_path = source_dir / temp_file_name + print(f" Saving {temp_file_name} ({len(tensors_dict)} tensors)...") + save_file(tensors_dict, str(output_path)) + temp_files.append((temp_file_name, file_name)) + + # Save updated index file with temporary name + print("\nSaving updated index file (temporary)...") + new_index_data = { + "metadata": index_data.get("metadata", {}), + "weight_map": new_weight_map + } + + temp_index_file = source_dir / "model.safetensors.index.json.tmp" + with open(temp_index_file, "w") as f: + json.dump(new_index_data, f, indent=2) + + # Now delete old files + print("\nDeleting old safetensors files...") + for old_file in old_files: + old_file_path = source_dir / old_file + if old_file_path.exists(): + old_file_path.unlink() + print(f" Deleted {old_file}") + + # Delete old index file + if index_file.exists(): + index_file.unlink() + print(f" Deleted model.safetensors.index.json") + + # Rename temporary files to final names + print("\nRenaming temporary files to final names...") + for temp_name, final_name in temp_files: + temp_path = source_dir / temp_name + final_path = source_dir / final_name + temp_path.rename(final_path) + print(f" Renamed {temp_name} -> {final_name}") + + # Rename temporary index file + temp_index_file.rename(index_file) + print(f" Renamed model.safetensors.index.json.tmp -> model.safetensors.index.json") + + # Update config.json to rename mamba layers to mixer + print("\nUpdating config.json to rename mamba layers to mixer...") + config_file = source_dir / "config.json" + if config_file.exists(): + with open(config_file, "r") as f: + config_data = json.load(f) + + # Check if quantization_config exists and has ignore list + if "quantization_config" in config_data and "ignore" in config_data["quantization_config"]: + ignore_list = config_data["quantization_config"]["ignore"] + updated_count = 0 + + # Replace mamba.in_proj with mixer.in_proj and mamba.out_proj with mixer.out_proj + for i, 
entry in enumerate(ignore_list): + if "mamba.in_proj" in entry or "mamba.out_proj" in entry: + new_entry = entry.replace("mamba.in_proj", "mixer.in_proj").replace("mamba.out_proj", "mixer.out_proj") + ignore_list[i] = new_entry + updated_count += 1 + print(f" Updated: {entry} -> {new_entry}") + + # Save updated config + with open(config_file, "w") as f: + json.dump(config_data, f, indent=2) + + print(f" Updated {updated_count} entries in config.json") + else: + print(" No quantization_config.ignore found in config.json") + else: + print(" config.json not found") + + # Print summary + num_stacked = len(grouped_tensors) + num_other = len(other_tensors) + print(f"\nšŸ“Š Summary:") + print(f" Stacked expert groups: {num_stacked}") + print(f" Non-expert tensors: {num_other}") + print(f"\nCheckpoint Updated for vLLM Compatibility") + + + class GraniteMoeHybridParallelExpertsLinear(torch.nn.Linear): def __init__(self, num_experts: int, input_size: int, output_size: int) -> None: """Use a real Linear so that llmcompressor and vllm can handle it easier. From fc14b3aa101d5bf85627902f90d7645130f0b59b Mon Sep 17 00:00:00 2001 From: Krishna Teja Chitty-Venkata <44275589+krishnateja95@users.noreply.github.com> Date: Thu, 6 Nov 2025 09:51:41 -0500 Subject: [PATCH 02/13] Update src/llmcompressor/modeling/granite4.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: Krishna Teja Chitty-Venkata <44275589+krishnateja95@users.noreply.github.com> --- src/llmcompressor/modeling/granite4.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/modeling/granite4.py b/src/llmcompressor/modeling/granite4.py index d5366c2a81..0a97efbfa1 100644 --- a/src/llmcompressor/modeling/granite4.py +++ b/src/llmcompressor/modeling/granite4.py @@ -12,7 +12,7 @@ #for fp8 block quantization -def replace_granite_moe_with_linear(model): +def replace_granite_moe_with_linear_experts(model): """ Convert GraniteMoeHybridParallelExperts modules into individual expert layers. Each expert will be stored as a separate nn.Linear module. 
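
With the rename above, the pair of helpers introduced in PATCH 01 reads as intended: replace_granite_moe_with_linear_experts unpacks the fused GraniteMoeHybridParallelExperts weights into per-expert nn.Linear modules before quantization, and pack_3d_experts restacks the saved checkpoint into the packed 3D layout afterwards. A minimal sketch of the checkpoint transform pack_3d_experts performs (illustrative expert count and shapes; the key names follow the docstring in PATCH 01):

    import torch

    # Per-expert 2D weights as written by the unpacked model:
    # model.layers.{L}.block_sparse_moe.{linear_type}.experts.{E}.weight
    num_experts, output_size, input_size = 4, 8, 16
    per_expert = {
        f"model.layers.0.block_sparse_moe.input_linear.experts.{e}.weight": torch.zeros(output_size, input_size)
        for e in range(num_experts)
    }

    # pack_3d_experts stacks them back into one 3D tensor per projection,
    # i.e. the packed [num_experts, output_size, input_size] layout.
    stacked = torch.stack(
        [per_expert[f"model.layers.0.block_sparse_moe.input_linear.experts.{e}.weight"] for e in range(num_experts)],
        dim=0,
    )
    assert stacked.shape == (num_experts, output_size, input_size)
    packed = {"model.layers.0.block_sparse_moe.input_linear.weight": stacked}
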
From c22bbd49a6b2fc1cf8357fb471d622f1b02a422e Mon Sep 17 00:00:00 2001 From: krishnateja95 Date: Mon, 10 Nov 2025 07:43:33 -0500 Subject: [PATCH 03/13] Update modeling/granite4.py --- src/llmcompressor/modeling/granite4.py | 37 -------------------------- 1 file changed, 37 deletions(-) diff --git a/src/llmcompressor/modeling/granite4.py b/src/llmcompressor/modeling/granite4.py index 0a97efbfa1..83a9a9a645 100644 --- a/src/llmcompressor/modeling/granite4.py +++ b/src/llmcompressor/modeling/granite4.py @@ -243,43 +243,6 @@ def pack_3d_experts(source_dir): # Rename temporary index file temp_index_file.rename(index_file) print(f" Renamed model.safetensors.index.json.tmp -> model.safetensors.index.json") - - # Update config.json to rename mamba layers to mixer - print("\nUpdating config.json to rename mamba layers to mixer...") - config_file = source_dir / "config.json" - if config_file.exists(): - with open(config_file, "r") as f: - config_data = json.load(f) - - # Check if quantization_config exists and has ignore list - if "quantization_config" in config_data and "ignore" in config_data["quantization_config"]: - ignore_list = config_data["quantization_config"]["ignore"] - updated_count = 0 - - # Replace mamba.in_proj with mixer.in_proj and mamba.out_proj with mixer.out_proj - for i, entry in enumerate(ignore_list): - if "mamba.in_proj" in entry or "mamba.out_proj" in entry: - new_entry = entry.replace("mamba.in_proj", "mixer.in_proj").replace("mamba.out_proj", "mixer.out_proj") - ignore_list[i] = new_entry - updated_count += 1 - print(f" Updated: {entry} -> {new_entry}") - - # Save updated config - with open(config_file, "w") as f: - json.dump(config_data, f, indent=2) - - print(f" Updated {updated_count} entries in config.json") - else: - print(" No quantization_config.ignore found in config.json") - else: - print(" config.json not found") - - # Print summary - num_stacked = len(grouped_tensors) - num_other = len(other_tensors) - print(f"\nšŸ“Š Summary:") - print(f" Stacked expert groups: {num_stacked}") - print(f" Non-expert tensors: {num_other}") print(f"\nCheckpoint Updated for vLLM Compatibility") From 4097e075272f694f1c0997f32db7045e2b464176 Mon Sep 17 00:00:00 2001 From: krishnateja95 Date: Thu, 13 Nov 2025 15:20:44 -0500 Subject: [PATCH 04/13] Granite4 MoECalibrationModule Update --- .../granite4_block_quantization_example.py | 17 +- .../granite4_dynamic_quantization_example.py | 42 ++ src/llmcompressor/modeling/granite4.py | 437 +++++++++--------- src/llmcompressor/modeling/prepare.py | 2 + 4 files changed, 272 insertions(+), 226 deletions(-) create mode 100644 examples/quantization_w8a8_fp8/granite4_dynamic_quantization_example.py diff --git a/examples/quantization_w8a8_fp8/granite4_block_quantization_example.py b/examples/quantization_w8a8_fp8/granite4_block_quantization_example.py index d5e4e2382d..73aae83aa3 100644 --- a/examples/quantization_w8a8_fp8/granite4_block_quantization_example.py +++ b/examples/quantization_w8a8_fp8/granite4_block_quantization_example.py @@ -2,24 +2,21 @@ import torch.nn as nn import torch.nn.functional as F +from transformers import AutoModelForCausalLM, AutoTokenizer + from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier from llmcompressor.utils import dispatch_for_generation -from llmcompressor.modeling.granite4 import replace_granite_moe_with_linear_experts, pack_3d_experts - -from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor.modeling import 
replace_modules_for_calibration MODEL_ID = "ibm-granite/granite-4.0-h-small" model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) -model = replace_granite_moe_with_linear_experts(model) +model = replace_modules_for_calibration(model) -ignore_lay = ["lm_head"] -ignore_lay += ["re:.*block_sparse_moe.router"] -ignore_lay += ["re:.*mamba.in_proj"] -ignore_lay += ["re:.*shared_mlp.input_linear"] +ignore_lay = ["lm_head", "re:.*block_sparse_moe.router", "re:.*mamba.in_proj", "re:.*shared_mlp.input_linear"] recipe = QuantizationModifier( targets=["Linear"], @@ -28,9 +25,9 @@ ) oneshot(model=model, recipe=recipe) -dispatch_for_generation(model) print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) input_ids = tokenizer( "Describe Large Language Model", return_tensors="pt" ).input_ids.to(model.device) @@ -43,5 +40,3 @@ model.save_pretrained(SAVE_DIR) tokenizer.save_pretrained(SAVE_DIR) - -pack_3d_experts(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/granite4_dynamic_quantization_example.py b/examples/quantization_w8a8_fp8/granite4_dynamic_quantization_example.py new file mode 100644 index 0000000000..154898653f --- /dev/null +++ b/examples/quantization_w8a8_fp8/granite4_dynamic_quantization_example.py @@ -0,0 +1,42 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.utils import dispatch_for_generation +from llmcompressor.modeling import replace_modules_for_calibration + +MODEL_ID = "ibm-granite/granite-4.0-h-small" + +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +model = replace_modules_for_calibration(model) + +ignore_lay = ["lm_head"] + +recipe = QuantizationModifier( + targets=["Linear"], + scheme="FP8_DYNAMIC", + ignore=ignore_lay, +) + +oneshot(model=model, recipe=recipe) + +print("========== SAMPLE GENERATION ==============") +dispatch_for_generation(model) +input_ids = tokenizer( + "Describe Large Language Model", return_tensors="pt" +).input_ids.to(model.device) +output = model.generate(input_ids, max_new_tokens=35) +print(tokenizer.decode(output[0])) +print("==========================================") + +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-block" +print(f"Saving to {SAVE_DIR}") + +model.save_pretrained(SAVE_DIR) +tokenizer.save_pretrained(SAVE_DIR) diff --git a/src/llmcompressor/modeling/granite4.py b/src/llmcompressor/modeling/granite4.py index 83a9a9a645..66c2a41df2 100644 --- a/src/llmcompressor/modeling/granite4.py +++ b/src/llmcompressor/modeling/granite4.py @@ -1,249 +1,256 @@ -import torch -import json import os -import shutil -from pathlib import Path -from collections import defaultdict -from safetensors.torch import load_file, save_file - +import torch +import torch.nn as nn +from transformers.models.granitemoehybrid.configuration_granitemoehybrid import GraniteMoeHybridConfig from transformers.models.granitemoehybrid.modeling_granitemoehybrid import ( GraniteMoeHybridParallelExperts, + GraniteMoeHybridMoE, ) +from llmcompressor.modeling.moe_context import MoECalibrationModule -#for fp8 block quantization -def replace_granite_moe_with_linear_experts(model): +class SequentialGraniteMoeExperts(nn.Module): """ - Convert GraniteMoeHybridParallelExperts modules 
into individual expert layers. - Each expert will be stored as a separate nn.Linear module. + Unpacked version of GraniteMoeHybridParallelExperts with individual expert layers. + + This module: + 1. Unpacks the packed expert weights (3D -> individual Linear layers) + 2. Processes experts sequentially + 3. Compatible with FP8 block quantization and vLLM """ - - class SeparatedExperts(nn.Module): - """Replacement module with individual expert linear layers""" - def __init__(self, num_experts, input_size, output_size, original_weight): - super().__init__() - self.num_experts = num_experts - self.input_size = input_size - self.output_size = output_size - - # Create individual linear layers for each expert - self.experts = nn.ModuleList([ - nn.Linear(input_size, output_size, bias=False) - for _ in range(num_experts) - ]) - - # Copy weights from the original 3D tensor - # Original format: [num_experts, output_size, input_size] - for i in range(num_experts): - self.experts[i].weight.data = original_weight[i].clone() + + def __init__( + self, + original: GraniteMoeHybridParallelExperts, + calibrate_all_experts: bool = True, + ): + super().__init__() + self.num_experts = original.num_experts + self.input_size = original.input_size + self.output_size = original.output_size + self.calibrate_all_experts = calibrate_all_experts + + # Create individual linear layers for each expert + self.experts = nn.ModuleList([ + nn.Linear(self.input_size, self.output_size, bias=False) + for _ in range(self.num_experts) + ]) + + # Copy weights from the original 3D tensor + # Original format: [num_experts, output_size, input_size] + for i in range(self.num_experts): + self.experts[i].weight.data = original.weight.data[i].clone() + + def forward(self, inputs, expert_size, batch_index=None): + """ + Forward pass using individual expert layers. 
+ + Args: + inputs: Input tensor to be processed by experts + expert_size: List containing the size of inputs for each expert + batch_index: Token indices for routing (needed in calibration mode) - def forward(self, inputs, expert_size): - """Forward pass using individual expert layers""" + Returns: + Concatenated output from all experts + """ + if self.calibrate_all_experts: + # During calibration, process all inputs through each expert + # but only keep the outputs corresponding to tokens routed to that expert + output_list = [] + start_idx = 0 + for i in range(self.num_experts): + end_idx = start_idx + expert_size[i] + # Get token indices assigned to this expert + expert_token_indices = batch_index[start_idx:end_idx] + # Process ALL tokens through this expert + expert_out_all = self.experts[i](inputs) + # Only keep outputs for tokens assigned to this expert + expert_out = expert_out_all[expert_token_indices] + output_list.append(expert_out) + start_idx = end_idx + results = torch.cat(output_list, dim=0) + else: + # Normal routing: only process tokens assigned to this expert input_list = inputs.split(expert_size, dim=0) output_list = [] for i in range(self.num_experts): output_list.append(self.experts[i](input_list[i])) results = torch.cat(output_list, dim=0) - return results - - # Find and replace all GraniteMoeHybridParallelExperts modules - def replace_parallel_experts(module, name=''): - for child_name, child in module.named_children(): - full_name = f"{name}.{child_name}" if name else child_name - - if child.__class__.__name__ == 'GraniteMoeHybridParallelExperts': - # Create replacement module with separated experts - separated = SeparatedExperts( - num_experts=child.num_experts, - input_size=child.input_size, - output_size=child.output_size, - original_weight=child.weight.data - ) - # Replace the module - setattr(module, child_name, separated) - print(f"Replaced {full_name}: {child.num_experts} experts, " - f"input_size={child.input_size}, output_size={child.output_size}") - else: - # Recursively process children - replace_parallel_experts(child, full_name) - - replace_parallel_experts(model) - return model - + + return results -def pack_3d_experts(source_dir): +@MoECalibrationModule.register("GraniteMoeHybridMoE") +class CalibrationGraniteMoeHybridMoE(MoECalibrationModule): """ - Transform Granite MoE model from per-expert storage to stacked 3D tensor storage - - From: model.layers.{L}.block_sparse_moe.{linear_type}.experts.{E}.{param} - To: model.layers.{L}.block_sparse_moe.{linear_type}.{param} - + Calibration version of GraniteMoeHybridMoE that unpacks both input_linear and output_linear experts. + + This module: + 1. Replaces both GraniteMoeHybridParallelExperts modules with unpacked versions + 2. Optionally sends all tokens to all experts during calibration + 3. Stays in unpacked form (permanent) for vLLM compatibility and FP8 block quantization """ - source_dir = Path(source_dir) - - # Load the index file - index_file = source_dir / "model.safetensors.index.json" - with open(index_file, "r") as f: - index_data = json.load(f) - - weight_map = index_data["weight_map"] - - # Group tensors by layer, linear type, and parameter - # Structure: {(layer_num, linear_type, param): {expert_num: (tensor_name, file_name)}} - grouped_tensors = defaultdict(dict) - other_tensors = {} # Non-expert tensors (router, embeddings, etc.) 
- - for tensor_name, file_name in weight_map.items(): - # Check if this is an expert tensor - # Pattern: model.layers.{L}.block_sparse_moe.{linear_type}.experts.{E}.{param} - if ".block_sparse_moe." in tensor_name and ".experts." in tensor_name: - parts = tensor_name.split(".") - - try: - # Find the indices of key parts - layers_idx = parts.index("layers") - layer_num = int(parts[layers_idx + 1]) - - experts_idx = parts.index("experts") - expert_num = int(parts[experts_idx + 1]) - - # The linear type is right before "experts" - # e.g., "input_linear" or "output_linear" - linear_type = parts[experts_idx - 1] - - # The parameter is after expert number - # e.g., "weight" or "weight_scale" - param = ".".join(parts[experts_idx + 2:]) - - # Create grouping key - group_key = (layer_num, linear_type, param) - grouped_tensors[group_key][expert_num] = (tensor_name, file_name) - - except (ValueError, IndexError) as e: - # If parsing fails, treat as other tensor - print(f" Warning: Could not parse expert tensor: {tensor_name}") - other_tensors[tensor_name] = file_name - else: - other_tensors[tensor_name] = file_name - - # Load all safetensors files - print("Loading source safetensors files...") - loaded_tensors = {} - unique_files = set(weight_map.values()) - old_files = list(unique_files) # Store list of old files to delete later - - for file_name in unique_files: - file_path = source_dir / file_name - print(f" Loading {file_name}...") - loaded_tensors[file_name] = load_file(str(file_path)) - - # Create new tensors by stacking experts - print("\nStacking expert tensors...") - new_tensors = {} - - # Process each grouped tensor - for (layer_num, linear_type, param), experts_dict in sorted(grouped_tensors.items()): - print(f" Processing layer {layer_num}, {linear_type}.{param}...") - - # Get all expert tensors for this group - expert_nums = sorted(experts_dict.keys()) - expert_tensors = [] + + is_permanent = True + + def __init__( + self, + original: GraniteMoeHybridMoE, + config: GraniteMoeHybridConfig, + calibrate_all_experts: bool = True, + ): + super().__init__() + self.input_size = original.input_size + self.hidden_size = original.hidden_size + self.activation = original.activation + self.calibrate_all_experts = calibrate_all_experts + + # Replace input_linear and output_linear with unpacked versions + self.input_linear = SequentialGraniteMoeExperts( + original.input_linear, + calibrate_all_experts=calibrate_all_experts, + ) + self.output_linear = SequentialGraniteMoeExperts( + original.output_linear, + calibrate_all_experts=calibrate_all_experts, + ) + + # Keep the router unchanged + self.router = original.router + + def forward(self, layer_input): + """ + Forward pass of the MoE layer. 
- for expert_num in expert_nums: - tensor_name, file_name = experts_dict[expert_num] - tensor = loaded_tensors[file_name][tensor_name] - expert_tensors.append(tensor) + Args: + layer_input: Input tensor of shape [batch_size, seq_len, hidden_size] - # Stack along first dimension to create 3D tensor - stacked_tensor = torch.stack(expert_tensors, dim=0) + Returns: + Tuple of (output tensor, router_logits) where: + - output tensor has shape [batch_size, seq_len, hidden_size] + - router_logits has shape [batch_size * seq_len, num_experts] + """ + bsz, length, emb_size = layer_input.size() + layer_input_flat = layer_input.reshape(-1, emb_size) - # Create new tensor name (remove .experts.{E} part) - new_tensor_name = f"model.layers.{layer_num}.block_sparse_moe.{linear_type}.{param}" - new_tensors[new_tensor_name] = stacked_tensor + # Router determines expert assignments + _, batch_index, batch_gates, expert_size, router_logits = self.router(layer_input_flat) + + if self.calibrate_all_experts: + # During calibration, send all tokens to all experts + # Pass batch_index so experts know which outputs to keep + hidden_states = self.input_linear(layer_input_flat, expert_size, batch_index) + + # Apply activation (SwiGLU-style) + chunked_hidden_states = hidden_states.chunk(2, dim=-1) + hidden_states = self.activation(chunked_hidden_states[0]) * chunked_hidden_states[1] + + # Process through output_linear experts + expert_outputs = self.output_linear(hidden_states, expert_size, batch_index) + + # Apply gating weights + expert_outputs_gated = expert_outputs * batch_gates[:, None] + else: + # Normal routing: only send tokens to assigned experts + expert_inputs = layer_input_flat[batch_index] + + # Process through input_linear experts + hidden_states = self.input_linear(expert_inputs, expert_size) + + # Apply activation (SwiGLU-style) + chunked_hidden_states = hidden_states.chunk(2, dim=-1) + hidden_states = self.activation(chunked_hidden_states[0]) * chunked_hidden_states[1] + + # Process through output_linear experts + expert_outputs = self.output_linear(hidden_states, expert_size) + + # Apply gating weights + expert_outputs_gated = expert_outputs * batch_gates[:, None] + + # Aggregate expert outputs + zeros = torch.zeros( + (bsz * length, self.input_size), + dtype=expert_outputs_gated.dtype, + device=expert_outputs_gated.device + ) + layer_output = zeros.index_add(0, batch_index, expert_outputs_gated) + layer_output = layer_output.view(bsz, length, self.input_size) - print(f" {new_tensor_name}: {list(stacked_tensor.shape)} (stacked {len(expert_tensors)} experts)") - - # Copy non-expert tensors (router, embeddings, etc.) - print("\nCopying non-expert tensors...") - for tensor_name, file_name in other_tensors.items(): - tensor = loaded_tensors[file_name][tensor_name] - new_tensors[tensor_name] = tensor - print(f" Copied: {tensor_name}") - - # Determine file distribution for new tensors - # Simple strategy: distribute roughly equally across same number of files - num_output_files = len(unique_files) - tensors_list = list(new_tensors.items()) - - # Calculate approximate size per file - total_numel = sum(t.numel() * t.element_size() for _, t in tensors_list) - target_size_per_file = total_numel / num_output_files + return layer_output, router_logits + + +# Legacy function for backward compatibility with prepare.py +def replace( + config: GraniteMoeHybridConfig, + module: GraniteMoeHybridMoE, + calibrate_all_experts: bool, +): + """ + Legacy replacement function for use with prepare.py. 
- # Distribute tensors across files - print(f"\nDistributing tensors across {num_output_files} files...") - file_tensors = [{} for _ in range(num_output_files)] - file_sizes = [0] * num_output_files - new_weight_map = {} + This function is deprecated. Use moe_calibration_context instead: - for tensor_name, tensor in tensors_list: - # Find file with smallest current size - min_idx = file_sizes.index(min(file_sizes)) - file_tensors[min_idx][tensor_name] = tensor - file_sizes[min_idx] += tensor.numel() * tensor.element_size() + Example: + from llmcompressor.modeling.moe_context import moe_calibration_context - # Update weight map - file_name = f"model-{min_idx+1:05d}-of-{num_output_files:05d}.safetensors" - new_weight_map[tensor_name] = file_name + with moe_calibration_context(model, calibrate_all_experts=True): + # Run calibration + pass - # Save new safetensors files with temporary names - print("\nSaving new safetensors files (temporary)...") - temp_files = [] - for i, tensors_dict in enumerate(file_tensors): - if tensors_dict: # Only save if not empty - file_name = f"model-{i+1:05d}-of-{num_output_files:05d}.safetensors" - temp_file_name = f"model-{i+1:05d}-of-{num_output_files:05d}.safetensors.tmp" - output_path = source_dir / temp_file_name - print(f" Saving {temp_file_name} ({len(tensors_dict)} tensors)...") - save_file(tensors_dict, str(output_path)) - temp_files.append((temp_file_name, file_name)) + Args: + config: The GraniteMoeHybridConfig for the model + module: The GraniteMoeHybridMoE module to replace + calibrate_all_experts: Whether to calibrate all experts - # Save updated index file with temporary name - print("\nSaving updated index file (temporary)...") - new_index_data = { - "metadata": index_data.get("metadata", {}), - "weight_map": new_weight_map - } - - temp_index_file = source_dir / "model.safetensors.index.json.tmp" - with open(temp_index_file, "w") as f: - json.dump(new_index_data, f, indent=2) + Returns: + CalibrationGraniteMoeHybridMoE calibration module + """ + return CalibrationGraniteMoeHybridMoE( + module, + config, + calibrate_all_experts=calibrate_all_experts, + ) + + +def replace_granite_moe_with_linear_experts(model): + """ + Legacy replacement function that recursively replaces all GraniteMoeHybridMoE modules. - # Now delete old files - print("\nDeleting old safetensors files...") - for old_file in old_files: - old_file_path = source_dir / old_file - if old_file_path.exists(): - old_file_path.unlink() - print(f" Deleted {old_file}") + This function is deprecated. 
Use moe_calibration_context instead: - # Delete old index file - if index_file.exists(): - index_file.unlink() - print(f" Deleted model.safetensors.index.json") + Example: + from llmcompressor.modeling.moe_context import moe_calibration_context + + with moe_calibration_context(model, calibrate_all_experts=True): + # Run calibration + pass - # Rename temporary files to final names - print("\nRenaming temporary files to final names...") - for temp_name, final_name in temp_files: - temp_path = source_dir / temp_name - final_path = source_dir / final_name - temp_path.rename(final_path) - print(f" Renamed {temp_name} -> {final_name}") + Args: + model: The model containing GraniteMoeHybridMoE modules - # Rename temporary index file - temp_index_file.rename(index_file) - print(f" Renamed model.safetensors.index.json.tmp -> model.safetensors.index.json") - print(f"\nCheckpoint Updated for vLLM Compatibility") + Returns: + The modified model with replaced expert modules + """ + def replace_moe_modules(module, name=''): + for child_name, child in module.named_children(): + full_name = f"{name}.{child_name}" if name else child_name + + if child.__class__.__name__ == 'GraniteMoeHybridMoE': + # Create replacement module with unpacked experts + calibrated = CalibrationGraniteMoeHybridMoE( + original=child, + config=model.config, + calibrate_all_experts=True, + ) + # Replace the module + setattr(module, child_name, calibrated) + print(f"Replaced {full_name}: GraniteMoeHybridMoE with unpacked experts") + else: + # Recursively process children + replace_moe_modules(child, full_name) + + replace_moe_modules(model) + return model diff --git a/src/llmcompressor/modeling/prepare.py b/src/llmcompressor/modeling/prepare.py index af9920d1b8..09e8202921 100644 --- a/src/llmcompressor/modeling/prepare.py +++ b/src/llmcompressor/modeling/prepare.py @@ -14,6 +14,7 @@ from llmcompressor.modeling.deepseek_v3 import replace as replace_deepseekv3 from llmcompressor.modeling.llama4 import replace as replace_llama4 from llmcompressor.modeling.qwen3_vl_moe import replace as replace_Qwen3VLMoE +from llmcompressor.modeling.granite4 import replace as replace_GraniteMoeHybridMoE __all__ = ["replace_modules_for_calibration"] @@ -22,6 +23,7 @@ "DeepseekV3MoE": replace_deepseekv3, "Llama4TextMoe": replace_llama4, "Qwen3VLMoeTextSparseMoeBlock": replace_Qwen3VLMoE, + "GraniteMoeHybridMoE": replace_GraniteMoeHybridMoE, } From 313e83cd95e1018772b84e76a80a35a77633c760 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 17 Nov 2025 12:01:47 -0500 Subject: [PATCH 05/13] Remove old granite 4 example --- .../granite4_dynamic_quantization_example.py | 42 -------------- .../quantization_w8a8_fp8/granite4_example.py | 55 +++++-------------- 2 files changed, 15 insertions(+), 82 deletions(-) delete mode 100644 examples/quantization_w8a8_fp8/granite4_dynamic_quantization_example.py diff --git a/examples/quantization_w8a8_fp8/granite4_dynamic_quantization_example.py b/examples/quantization_w8a8_fp8/granite4_dynamic_quantization_example.py deleted file mode 100644 index 154898653f..0000000000 --- a/examples/quantization_w8a8_fp8/granite4_dynamic_quantization_example.py +++ /dev/null @@ -1,42 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - -from transformers import AutoModelForCausalLM, AutoTokenizer - -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.utils import dispatch_for_generation -from llmcompressor.modeling import 
replace_modules_for_calibration - -MODEL_ID = "ibm-granite/granite-4.0-h-small" - -model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -model = replace_modules_for_calibration(model) - -ignore_lay = ["lm_head"] - -recipe = QuantizationModifier( - targets=["Linear"], - scheme="FP8_DYNAMIC", - ignore=ignore_lay, -) - -oneshot(model=model, recipe=recipe) - -print("========== SAMPLE GENERATION ==============") -dispatch_for_generation(model) -input_ids = tokenizer( - "Describe Large Language Model", return_tensors="pt" -).input_ids.to(model.device) -output = model.generate(input_ids, max_new_tokens=35) -print(tokenizer.decode(output[0])) -print("==========================================") - -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-block" -print(f"Saving to {SAVE_DIR}") - -model.save_pretrained(SAVE_DIR) -tokenizer.save_pretrained(SAVE_DIR) diff --git a/examples/quantization_w8a8_fp8/granite4_example.py b/examples/quantization_w8a8_fp8/granite4_example.py index fff4ada5f1..154898653f 100644 --- a/examples/quantization_w8a8_fp8/granite4_example.py +++ b/examples/quantization_w8a8_fp8/granite4_example.py @@ -1,67 +1,42 @@ -from compressed_tensors.utils import replace_module +import torch +import torch.nn as nn +import torch.nn.functional as F + from transformers import AutoModelForCausalLM, AutoTokenizer -from transformers.models.granitemoehybrid.modeling_granitemoehybrid import ( - GraniteMoeHybridParallelExperts, -) from llmcompressor import oneshot -from llmcompressor.modeling.granite4 import GraniteMoeHybridParallelExpertsLinear from llmcompressor.modifiers.quantization import QuantizationModifier from llmcompressor.utils import dispatch_for_generation +from llmcompressor.modeling import replace_modules_for_calibration -"""Please see details in `README_granite4.md`.""" +MODEL_ID = "ibm-granite/granite-4.0-h-small" -MODEL_ID = "ibm-granite/granite-4.0-tiny-preview" - -# Load model. model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) -skip_router_only = True # assume we want to quantize input/output moe layers -ignore_lay = [ - "lm_head", -] -if skip_router_only: - # swap moe linears to a custom class - for n, m in model.named_modules(): - if isinstance(m, GraniteMoeHybridParallelExperts): - new_mod = GraniteMoeHybridParallelExpertsLinear.from_3d_expert(m) - replace_module(model, n, new_mod) - ignore_lay += ["re:.*block_sparse_moe.router"] - SAVE_DIR = "ibm-granite-4-tiny-fp8-dynamic-skipMoeRouter" -else: - # Skip all .input_linear, .output-linear, and router layers. - ignore_lay += ["re:.*block_sparse_moe"] - SAVE_DIR = "ibm-granite-4-tiny-fp8-dynamic-skipMoe" +model = replace_modules_for_calibration(model) + +ignore_lay = ["lm_head"] recipe = QuantizationModifier( - targets=["Linear", "GraniteMoeHybridParallelExpertsLinear"], + targets=["Linear"], scheme="FP8_DYNAMIC", ignore=ignore_lay, ) -# Apply quantization. oneshot(model=model, recipe=recipe) -# Confirm generations of the quantized model look sane. 
print("========== SAMPLE GENERATION ==============") dispatch_for_generation(model) input_ids = tokenizer( - "What is your favorite TV show?", return_tensors="pt" -).input_ids.to("cuda") -output = model.generate(input_ids, max_new_tokens=20) + "Describe Large Language Model", return_tensors="pt" +).input_ids.to(model.device) +output = model.generate(input_ids, max_new_tokens=35) print(tokenizer.decode(output[0])) print("==========================================") -# Revert weights of MoE experts to 3D format (num_experts, output_size, input_size) -for n, m in model.named_modules(): - if isinstance(m, GraniteMoeHybridParallelExpertsLinear): - # NOTE: can assert type != "meta" instead, which is sign of offloading - assert m.weight.device.type == "cuda", ( - "Found some offloaded weights. This is not compatible with reshaping " - "experts to 3D prior model save. Ensure the model is fully on cuda." - ) - m.to_3d_expert() +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-block" +print(f"Saving to {SAVE_DIR}") model.save_pretrained(SAVE_DIR) tokenizer.save_pretrained(SAVE_DIR) From 6e389e24258b2dd52048fea0c21eeb2ca462959f Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 17 Nov 2025 12:03:20 -0500 Subject: [PATCH 06/13] rename block example file --- ...lock_quantization_example.py => granite4_fp8_block_example.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/quantization_w8a8_fp8/{granite4_block_quantization_example.py => granite4_fp8_block_example.py} (100%) diff --git a/examples/quantization_w8a8_fp8/granite4_block_quantization_example.py b/examples/quantization_w8a8_fp8/granite4_fp8_block_example.py similarity index 100% rename from examples/quantization_w8a8_fp8/granite4_block_quantization_example.py rename to examples/quantization_w8a8_fp8/granite4_fp8_block_example.py From 7c7a188311169df4dd4da013eadcb48492c187f4 Mon Sep 17 00:00:00 2001 From: krishnateja95 Date: Mon, 1 Dec 2025 10:53:17 -0500 Subject: [PATCH 07/13] Update pack experts --- .../granite4_fp8_block_example.py | 3 + src/llmcompressor/modeling/granite4.py | 182 ++++++++++++++++++ 2 files changed, 185 insertions(+) diff --git a/examples/quantization_w8a8_fp8/granite4_fp8_block_example.py b/examples/quantization_w8a8_fp8/granite4_fp8_block_example.py index 73aae83aa3..d3b1237e04 100644 --- a/examples/quantization_w8a8_fp8/granite4_fp8_block_example.py +++ b/examples/quantization_w8a8_fp8/granite4_fp8_block_example.py @@ -8,6 +8,7 @@ from llmcompressor.modifiers.quantization import QuantizationModifier from llmcompressor.utils import dispatch_for_generation from llmcompressor.modeling import replace_modules_for_calibration +from llmcompressor.modeling.granite4 import pack_3d_experts MODEL_ID = "ibm-granite/granite-4.0-h-small" @@ -40,3 +41,5 @@ model.save_pretrained(SAVE_DIR) tokenizer.save_pretrained(SAVE_DIR) +pack_3d_experts(SAVE_DIR) + diff --git a/src/llmcompressor/modeling/granite4.py b/src/llmcompressor/modeling/granite4.py index 66c2a41df2..759fc3a301 100644 --- a/src/llmcompressor/modeling/granite4.py +++ b/src/llmcompressor/modeling/granite4.py @@ -253,7 +253,189 @@ def replace_moe_modules(module, name=''): return model +def pack_3d_experts( + source_dir, + validate=True, + backup=True, + allow_missing_experts=False, + verbose=True +): + """ + Transform MoE model from per-expert storage to 3D stacked tensors. 
+ + From: model.layers.{L}.block_sparse_moe.{linear_type}.experts.{E}.{param} + To: model.layers.{L}.block_sparse_moe.{linear_type}.{param} + + Args: + source_dir: Model directory path + validate: Validate shapes and expert continuity + backup: Create backup before modification (RECOMMENDED) + allow_missing_experts: Don't fail if some experts are missing + verbose: Print progress messages + """ + source_dir = Path(source_dir) + index_file = source_dir / "model.safetensors.index.json" + backup_dir = None + temp_files = [] + + def log(msg): + if verbose: print(msg) + + try: + # === BACKUP === + if backup: + backup_dir = source_dir.parent / f"{source_dir.name}.backup.{datetime.now().strftime('%Y%m%d_%H%M%S')}" + backup_dir.mkdir(parents=True) + for f in source_dir.glob("*.safetensors*"): + shutil.copy2(f, backup_dir / f.name) + log(f"āœ“ Backup created at {backup_dir}") + + # === LOAD INDEX === + with open(index_file) as f: + index_data = json.load(f) + weight_map = index_data["weight_map"] + + # === GROUP TENSORS === + grouped = defaultdict(dict) # {(layer, linear_type, param): {expert_num: (name, file)}} + other = {} + + for name, file in weight_map.items(): + if ".block_sparse_moe." in name and ".experts." in name: + parts = name.split(".") + try: + layer = int(parts[parts.index("layers") + 1]) + expert = int(parts[parts.index("experts") + 1]) + linear_type = parts[parts.index("experts") - 1] + param = ".".join(parts[parts.index("experts") + 2:]) + grouped[(layer, linear_type, param)][expert] = (name, file) + except (ValueError, IndexError): + other[name] = file + else: + other[name] = file + + log(f"āœ“ Found {len(grouped)} expert groups, {len(other)} other tensors") + + # === LOAD FILES === + log("Loading files...") + loaded = {} + old_files = set(weight_map.values()) + for file in old_files: + loaded[file] = load_file(str(source_dir / file)) + + # === STACK EXPERTS === + log("Stacking experts...") + new_tensors = {} + + for (layer, linear_type, param), experts in sorted(grouped.items()): + expert_nums = sorted(experts.keys()) + + # Validate + if validate: + # Check continuity + expected = list(range(len(expert_nums))) + if expert_nums != expected: + missing = set(expected) - set(expert_nums) + if missing and not allow_missing_experts: + raise ValueError(f"Missing experts {missing} in layer {layer}, {linear_type}.{param}") + + # Check shapes and dtypes + shapes = [loaded[experts[e][1]][experts[e][0]].shape for e in expert_nums] + dtypes = [loaded[experts[e][1]][experts[e][0]].dtype for e in expert_nums] + if len(set(shapes)) > 1: + raise ValueError(f"Shape mismatch in layer {layer}, {linear_type}.{param}: {set(shapes)}") + if len(set(dtypes)) > 1: + raise ValueError(f"Dtype mismatch in layer {layer}, {linear_type}.{param}: {set(dtypes)}") + + # Stack + tensors = [loaded[experts[e][1]][experts[e][0]] for e in expert_nums] + stacked = torch.stack(tensors, dim=0) + new_name = f"model.layers.{layer}.block_sparse_moe.{linear_type}.{param}" + new_tensors[new_name] = stacked + log(f" Layer {layer} {linear_type}.{param}: {list(stacked.shape)}") + + # Copy other tensors + for name, file in other.items(): + new_tensors[name] = loaded[file][name] + + # === DISTRIBUTE ACROSS FILES === + log("Distributing tensors...") + num_files = len(old_files) + tensor_sizes = [(n, t.numel() * t.element_size()) for n, t in new_tensors.items()] + tensor_sizes.sort(key=lambda x: x[1], reverse=True) + + file_tensors = [{} for _ in range(num_files)] + file_sizes = [0] * num_files + new_weight_map = {} + + for name, 
size in tensor_sizes: + min_idx = file_sizes.index(min(file_sizes)) + file_tensors[min_idx][name] = new_tensors[name] + file_sizes[min_idx] += size + new_weight_map[name] = f"model-{min_idx+1:05d}-of-{num_files:05d}.safetensors" + + # === SAVE FILES (TEMP) === + log("Saving files...") + saved_files = [] + for i, tensors in enumerate(file_tensors): + if tensors: + file_name = f"model-{i+1:05d}-of-{num_files:05d}.safetensors" + temp_name = f"{file_name}.tmp" + temp_path = source_dir / temp_name + save_file(tensors, str(temp_path)) + temp_files.append(temp_path) + saved_files.append((temp_name, file_name)) + + # Save index (temp) + temp_index = source_dir / "model.safetensors.index.json.tmp" + with open(temp_index, "w") as f: + json.dump({"metadata": index_data.get("metadata", {}), "weight_map": new_weight_map}, f, indent=2) + temp_files.append(temp_index) + + # === FINALIZE (DELETE OLD, RENAME TEMP) === + log("Finalizing...") + # Delete old + for old in old_files: + (source_dir / old).unlink() + index_file.unlink() + + # Rename temp + for temp, final in saved_files: + (source_dir / temp).rename(source_dir / final) + temp_index.rename(index_file) + temp_files.clear() + + # === VERIFY === + if validate: + with open(index_file) as f: + check = json.load(f) + remaining_experts = [n for n in check["weight_map"] if ".experts." in n] + if remaining_experts: + raise ValueError(f"Verification failed: {len(remaining_experts)} unpacked experts remain") + + log(f"āœ“ Success! Transformed {len(grouped)} expert groups") + + except Exception as e: + log(f"āœ— Error: {e}") + + # === ROLLBACK === + if backup and backup_dir and backup_dir.exists(): + log("Rolling back...") + for temp in temp_files: + if temp.exists(): temp.unlink() + for f in source_dir.glob("*.safetensors*"): + f.unlink() + for f in backup_dir.glob("*"): + shutil.copy2(f, source_dir / f.name) + log("āœ“ Rolled back to backup") + + raise + + finally: + # Cleanup temp files + for temp in temp_files: + if temp.exists(): temp.unlink() + class GraniteMoeHybridParallelExpertsLinear(torch.nn.Linear): def __init__(self, num_experts: int, input_size: int, output_size: int) -> None: """Use a real Linear so that llmcompressor and vllm can handle it easier. 
From c0dad631dff6c2297c8b92b87b6cfbfac93a92a0 Mon Sep 17 00:00:00 2001 From: krishnateja95 Date: Mon, 1 Dec 2025 17:23:42 -0500 Subject: [PATCH 08/13] Add Path --- src/llmcompressor/modeling/granite4.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llmcompressor/modeling/granite4.py b/src/llmcompressor/modeling/granite4.py index 759fc3a301..4efc8037c6 100644 --- a/src/llmcompressor/modeling/granite4.py +++ b/src/llmcompressor/modeling/granite4.py @@ -8,6 +8,7 @@ ) from llmcompressor.modeling.moe_context import MoECalibrationModule +from pathlib import Path class SequentialGraniteMoeExperts(nn.Module): """ From 4a83ff964ee563d0430e0bbb2c4d9d0897d5d5ca Mon Sep 17 00:00:00 2001 From: krishnateja95 Date: Mon, 1 Dec 2025 17:33:48 -0500 Subject: [PATCH 09/13] Add Datetime --- src/llmcompressor/modeling/granite4.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llmcompressor/modeling/granite4.py b/src/llmcompressor/modeling/granite4.py index 4efc8037c6..dd0851aa4e 100644 --- a/src/llmcompressor/modeling/granite4.py +++ b/src/llmcompressor/modeling/granite4.py @@ -9,6 +9,7 @@ from llmcompressor.modeling.moe_context import MoECalibrationModule from pathlib import Path +from datetime import datetime class SequentialGraniteMoeExperts(nn.Module): """ From 1e67acd33eb975d45cf0fee1728e9716b6f57588 Mon Sep 17 00:00:00 2001 From: krishnateja95 Date: Mon, 1 Dec 2025 20:44:18 -0500 Subject: [PATCH 10/13] Add Imports --- src/llmcompressor/modeling/granite4.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/llmcompressor/modeling/granite4.py b/src/llmcompressor/modeling/granite4.py index dd0851aa4e..9034af8add 100644 --- a/src/llmcompressor/modeling/granite4.py +++ b/src/llmcompressor/modeling/granite4.py @@ -8,8 +8,13 @@ ) from llmcompressor.modeling.moe_context import MoECalibrationModule + from pathlib import Path from datetime import datetime +from collections import defaultdict +import shutil +import json +from safetensors.torch import load_file, save_file class SequentialGraniteMoeExperts(nn.Module): """ From 41577309d1f0421215acc814d2fabf1398971e6d Mon Sep 17 00:00:00 2001 From: krishnateja95 Date: Mon, 1 Dec 2025 20:58:33 -0500 Subject: [PATCH 11/13] Fix Pack Experts --- src/llmcompressor/modeling/granite4.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llmcompressor/modeling/granite4.py b/src/llmcompressor/modeling/granite4.py index 9034af8add..6c77529721 100644 --- a/src/llmcompressor/modeling/granite4.py +++ b/src/llmcompressor/modeling/granite4.py @@ -294,7 +294,7 @@ def log(msg): backup_dir = source_dir.parent / f"{source_dir.name}.backup.{datetime.now().strftime('%Y%m%d_%H%M%S')}" backup_dir.mkdir(parents=True) for f in source_dir.glob("*.safetensors*"): - shutil.copy2(f, backup_dir / f.name) + shutil.copy(f, backup_dir / f.name) # FIXED: Changed from copy2 to copy log(f"āœ“ Backup created at {backup_dir}") # === LOAD INDEX === @@ -432,7 +432,7 @@ def log(msg): for f in source_dir.glob("*.safetensors*"): f.unlink() for f in backup_dir.glob("*"): - shutil.copy2(f, source_dir / f.name) + shutil.copy(f, source_dir / f.name) # FIXED: Changed from copy2 to copy log("āœ“ Rolled back to backup") raise From b43c83ed6890f9635180fce56e0a5ab24428ec49 Mon Sep 17 00:00:00 2001 From: krishnateja95 Date: Mon, 1 Dec 2025 21:13:23 -0500 Subject: [PATCH 12/13] Fix Pack Experts --- src/llmcompressor/modeling/granite4.py | 350 +++++++++++++------------ 1 file changed, 189 insertions(+), 161 deletions(-) diff --git a/src/llmcompressor/modeling/granite4.py 
b/src/llmcompressor/modeling/granite4.py index 6c77529721..0acaa3fd9f 100644 --- a/src/llmcompressor/modeling/granite4.py +++ b/src/llmcompressor/modeling/granite4.py @@ -260,187 +260,215 @@ def replace_moe_modules(module, name=''): return model -def pack_3d_experts( - source_dir, - validate=True, - backup=True, - allow_missing_experts=False, - verbose=True -): +def pack_3d_experts(source_dir): """ - Transform MoE model from per-expert storage to 3D stacked tensors. + Transform Granite MoE model from per-expert storage to stacked 3D tensor storage From: model.layers.{L}.block_sparse_moe.{linear_type}.experts.{E}.{param} To: model.layers.{L}.block_sparse_moe.{linear_type}.{param} - Args: - source_dir: Model directory path - validate: Validate shapes and expert continuity - backup: Create backup before modification (RECOMMENDED) - allow_missing_experts: Don't fail if some experts are missing - verbose: Print progress messages """ source_dir = Path(source_dir) + + # Load the index file index_file = source_dir / "model.safetensors.index.json" - backup_dir = None - temp_files = [] + with open(index_file, "r") as f: + index_data = json.load(f) - def log(msg): - if verbose: print(msg) - - try: - # === BACKUP === - if backup: - backup_dir = source_dir.parent / f"{source_dir.name}.backup.{datetime.now().strftime('%Y%m%d_%H%M%S')}" - backup_dir.mkdir(parents=True) - for f in source_dir.glob("*.safetensors*"): - shutil.copy(f, backup_dir / f.name) # FIXED: Changed from copy2 to copy - log(f"āœ“ Backup created at {backup_dir}") - - # === LOAD INDEX === - with open(index_file) as f: - index_data = json.load(f) - weight_map = index_data["weight_map"] - - # === GROUP TENSORS === - grouped = defaultdict(dict) # {(layer, linear_type, param): {expert_num: (name, file)}} - other = {} - - for name, file in weight_map.items(): - if ".block_sparse_moe." in name and ".experts." in name: - parts = name.split(".") - try: - layer = int(parts[parts.index("layers") + 1]) - expert = int(parts[parts.index("experts") + 1]) - linear_type = parts[parts.index("experts") - 1] - param = ".".join(parts[parts.index("experts") + 2:]) - grouped[(layer, linear_type, param)][expert] = (name, file) - except (ValueError, IndexError): - other[name] = file - else: - other[name] = file - - log(f"āœ“ Found {len(grouped)} expert groups, {len(other)} other tensors") - - # === LOAD FILES === - log("Loading files...") - loaded = {} - old_files = set(weight_map.values()) - for file in old_files: - loaded[file] = load_file(str(source_dir / file)) - - # === STACK EXPERTS === - log("Stacking experts...") - new_tensors = {} - - for (layer, linear_type, param), experts in sorted(grouped.items()): - expert_nums = sorted(experts.keys()) + weight_map = index_data["weight_map"] + + # Group tensors by layer, linear type, and parameter + # Structure: {(layer_num, linear_type, param): {expert_num: (tensor_name, file_name)}} + grouped_tensors = defaultdict(dict) + other_tensors = {} # Non-expert tensors (router, embeddings, etc.) + + for tensor_name, file_name in weight_map.items(): + # Check if this is an expert tensor + # Pattern: model.layers.{L}.block_sparse_moe.{linear_type}.experts.{E}.{param} + if ".block_sparse_moe." in tensor_name and ".experts." 
in tensor_name: + parts = tensor_name.split(".") - # Validate - if validate: - # Check continuity - expected = list(range(len(expert_nums))) - if expert_nums != expected: - missing = set(expected) - set(expert_nums) - if missing and not allow_missing_experts: - raise ValueError(f"Missing experts {missing} in layer {layer}, {linear_type}.{param}") + try: + # Find the indices of key parts + layers_idx = parts.index("layers") + layer_num = int(parts[layers_idx + 1]) - # Check shapes and dtypes - shapes = [loaded[experts[e][1]][experts[e][0]].shape for e in expert_nums] - dtypes = [loaded[experts[e][1]][experts[e][0]].dtype for e in expert_nums] - if len(set(shapes)) > 1: - raise ValueError(f"Shape mismatch in layer {layer}, {linear_type}.{param}: {set(shapes)}") - if len(set(dtypes)) > 1: - raise ValueError(f"Dtype mismatch in layer {layer}, {linear_type}.{param}: {set(dtypes)}") - - # Stack - tensors = [loaded[experts[e][1]][experts[e][0]] for e in expert_nums] - stacked = torch.stack(tensors, dim=0) - new_name = f"model.layers.{layer}.block_sparse_moe.{linear_type}.{param}" - new_tensors[new_name] = stacked - log(f" Layer {layer} {linear_type}.{param}: {list(stacked.shape)}") - - # Copy other tensors - for name, file in other.items(): - new_tensors[name] = loaded[file][name] + experts_idx = parts.index("experts") + expert_num = int(parts[experts_idx + 1]) + + # The linear type is right before "experts" + # e.g., "input_linear" or "output_linear" + linear_type = parts[experts_idx - 1] + + # The parameter is after expert number + # e.g., "weight" or "weight_scale" + param = ".".join(parts[experts_idx + 2:]) + + # Create grouping key + group_key = (layer_num, linear_type, param) + grouped_tensors[group_key][expert_num] = (tensor_name, file_name) + + except (ValueError, IndexError) as e: + # If parsing fails, treat as other tensor + print(f" Warning: Could not parse expert tensor: {tensor_name}") + other_tensors[tensor_name] = file_name + else: + other_tensors[tensor_name] = file_name + + # Load all safetensors files + print("Loading source safetensors files...") + loaded_tensors = {} + unique_files = set(weight_map.values()) + old_files = list(unique_files) # Store list of old files to delete later + + for file_name in unique_files: + file_path = source_dir / file_name + print(f" Loading {file_name}...") + loaded_tensors[file_name] = load_file(str(file_path)) + + # Create new tensors by stacking experts + print("\nStacking expert tensors...") + new_tensors = {} + + # Process each grouped tensor + for (layer_num, linear_type, param), experts_dict in sorted(grouped_tensors.items()): + print(f" Processing layer {layer_num}, {linear_type}.{param}...") - # === DISTRIBUTE ACROSS FILES === - log("Distributing tensors...") - num_files = len(old_files) - tensor_sizes = [(n, t.numel() * t.element_size()) for n, t in new_tensors.items()] - tensor_sizes.sort(key=lambda x: x[1], reverse=True) + # Get all expert tensors for this group + expert_nums = sorted(experts_dict.keys()) + expert_tensors = [] - file_tensors = [{} for _ in range(num_files)] - file_sizes = [0] * num_files - new_weight_map = {} + for expert_num in expert_nums: + tensor_name, file_name = experts_dict[expert_num] + tensor = loaded_tensors[file_name][tensor_name] + expert_tensors.append(tensor) - for name, size in tensor_sizes: - min_idx = file_sizes.index(min(file_sizes)) - file_tensors[min_idx][name] = new_tensors[name] - file_sizes[min_idx] += size - new_weight_map[name] = f"model-{min_idx+1:05d}-of-{num_files:05d}.safetensors" + # 
Stack along first dimension to create 3D tensor + stacked_tensor = torch.stack(expert_tensors, dim=0) - # === SAVE FILES (TEMP) === - log("Saving files...") - saved_files = [] - for i, tensors in enumerate(file_tensors): - if tensors: - file_name = f"model-{i+1:05d}-of-{num_files:05d}.safetensors" - temp_name = f"{file_name}.tmp" - temp_path = source_dir / temp_name - save_file(tensors, str(temp_path)) - temp_files.append(temp_path) - saved_files.append((temp_name, file_name)) + # Create new tensor name (remove .experts.{E} part) + new_tensor_name = f"model.layers.{layer_num}.block_sparse_moe.{linear_type}.{param}" + new_tensors[new_tensor_name] = stacked_tensor - # Save index (temp) - temp_index = source_dir / "model.safetensors.index.json.tmp" - with open(temp_index, "w") as f: - json.dump({"metadata": index_data.get("metadata", {}), "weight_map": new_weight_map}, f, indent=2) - temp_files.append(temp_index) + print(f" {new_tensor_name}: {list(stacked_tensor.shape)} (stacked {len(expert_tensors)} experts)") + + # Copy non-expert tensors (router, embeddings, etc.) + print("\nCopying non-expert tensors...") + for tensor_name, file_name in other_tensors.items(): + tensor = loaded_tensors[file_name][tensor_name] + new_tensors[tensor_name] = tensor + print(f" Copied: {tensor_name}") + + # Determine file distribution for new tensors + # Simple strategy: distribute roughly equally across same number of files + num_output_files = len(unique_files) + tensors_list = list(new_tensors.items()) + + # Calculate approximate size per file + total_numel = sum(t.numel() * t.element_size() for _, t in tensors_list) + target_size_per_file = total_numel / num_output_files + + # Distribute tensors across files + print(f"\nDistributing tensors across {num_output_files} files...") + file_tensors = [{} for _ in range(num_output_files)] + file_sizes = [0] * num_output_files + new_weight_map = {} + + for tensor_name, tensor in tensors_list: + # Find file with smallest current size + min_idx = file_sizes.index(min(file_sizes)) + file_tensors[min_idx][tensor_name] = tensor + file_sizes[min_idx] += tensor.numel() * tensor.element_size() - # === FINALIZE (DELETE OLD, RENAME TEMP) === - log("Finalizing...") - # Delete old - for old in old_files: - (source_dir / old).unlink() + # Update weight map + file_name = f"model-{min_idx+1:05d}-of-{num_output_files:05d}.safetensors" + new_weight_map[tensor_name] = file_name + + # Save new safetensors files with temporary names + print("\nSaving new safetensors files (temporary)...") + temp_files = [] + for i, tensors_dict in enumerate(file_tensors): + if tensors_dict: # Only save if not empty + file_name = f"model-{i+1:05d}-of-{num_output_files:05d}.safetensors" + temp_file_name = f"model-{i+1:05d}-of-{num_output_files:05d}.safetensors.tmp" + output_path = source_dir / temp_file_name + print(f" Saving {temp_file_name} ({len(tensors_dict)} tensors)...") + save_file(tensors_dict, str(output_path)) + temp_files.append((temp_file_name, file_name)) + + # Save updated index file with temporary name + print("\nSaving updated index file (temporary)...") + new_index_data = { + "metadata": index_data.get("metadata", {}), + "weight_map": new_weight_map + } + + temp_index_file = source_dir / "model.safetensors.index.json.tmp" + with open(temp_index_file, "w") as f: + json.dump(new_index_data, f, indent=2) + + # Now delete old files + print("\nDeleting old safetensors files...") + for old_file in old_files: + old_file_path = source_dir / old_file + if old_file_path.exists(): + 
old_file_path.unlink() + print(f" Deleted {old_file}") + + # Delete old index file + if index_file.exists(): index_file.unlink() + print(f" Deleted model.safetensors.index.json") + + # Rename temporary files to final names + print("\nRenaming temporary files to final names...") + for temp_name, final_name in temp_files: + temp_path = source_dir / temp_name + final_path = source_dir / final_name + temp_path.rename(final_path) + print(f" Renamed {temp_name} -> {final_name}") + + # Rename temporary index file + temp_index_file.rename(index_file) + print(f" Renamed model.safetensors.index.json.tmp -> model.safetensors.index.json") + + # Update config.json to rename mamba layers to mixer + print("\nUpdating config.json to rename mamba layers to mixer...") + config_file = source_dir / "config.json" + if config_file.exists(): + with open(config_file, "r") as f: + config_data = json.load(f) - # Rename temp - for temp, final in saved_files: - (source_dir / temp).rename(source_dir / final) - temp_index.rename(index_file) - temp_files.clear() - - # === VERIFY === - if validate: - with open(index_file) as f: - check = json.load(f) - remaining_experts = [n for n in check["weight_map"] if ".experts." in n] - if remaining_experts: - raise ValueError(f"Verification failed: {len(remaining_experts)} unpacked experts remain") - - log(f"āœ“ Success! Transformed {len(grouped)} expert groups") - - except Exception as e: - log(f"āœ— Error: {e}") - - # === ROLLBACK === - if backup and backup_dir and backup_dir.exists(): - log("Rolling back...") - for temp in temp_files: - if temp.exists(): temp.unlink() - for f in source_dir.glob("*.safetensors*"): - f.unlink() - for f in backup_dir.glob("*"): - shutil.copy(f, source_dir / f.name) # FIXED: Changed from copy2 to copy - log("āœ“ Rolled back to backup") - - raise + # Check if quantization_config exists and has ignore list + if "quantization_config" in config_data and "ignore" in config_data["quantization_config"]: + ignore_list = config_data["quantization_config"]["ignore"] + updated_count = 0 + + # Replace mamba.in_proj with mixer.in_proj and mamba.out_proj with mixer.out_proj + for i, entry in enumerate(ignore_list): + if "mamba.in_proj" in entry or "mamba.out_proj" in entry: + new_entry = entry.replace("mamba.in_proj", "mixer.in_proj").replace("mamba.out_proj", "mixer.out_proj") + ignore_list[i] = new_entry + updated_count += 1 + print(f" Updated: {entry} -> {new_entry}") + + # Save updated config + with open(config_file, "w") as f: + json.dump(config_data, f, indent=2) + + print(f" Updated {updated_count} entries in config.json") + else: + print(" No quantization_config.ignore found in config.json") + else: + print(" config.json not found") - finally: - # Cleanup temp files - for temp in temp_files: - if temp.exists(): temp.unlink() + # Print summary + num_stacked = len(grouped_tensors) + num_other = len(other_tensors) + print(f"\nšŸ“Š Summary:") + print(f" Stacked expert groups: {num_stacked}") + print(f" Non-expert tensors: {num_other}") + print(f"\nCheckpoint Updated for vLLM Compatibility") class GraniteMoeHybridParallelExpertsLinear(torch.nn.Linear): From 5c97c222f588b22d590a73974f4da51f5bc0afc6 Mon Sep 17 00:00:00 2001 From: krishnateja95 Date: Mon, 1 Dec 2025 21:24:18 -0500 Subject: [PATCH 13/13] Fix Pack Experts --- src/llmcompressor/modeling/granite4.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/llmcompressor/modeling/granite4.py b/src/llmcompressor/modeling/granite4.py index 0acaa3fd9f..5204b8413d 100644 --- 
a/src/llmcompressor/modeling/granite4.py
+++ b/src/llmcompressor/modeling/granite4.py
@@ -10,9 +10,7 @@
 from llmcompressor.modeling.moe_context import MoECalibrationModule
 
 from pathlib import Path
-from datetime import datetime
 from collections import defaultdict
-import shutil
 import json
 from safetensors.torch import load_file, save_file
 
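
Note on verifying the packed checkpoint: once pack_3d_experts() has rewritten the safetensors shards and the index, a quick structural check can catch a partially packed checkpoint before it is handed to vLLM. The sketch below is illustrative only and is not part of the patches above; the helper name verify_packed_checkpoint and the example path are assumptions, and the check relies only on the index layout written by pack_3d_experts plus safetensors' safe_open to read tensor shapes without loading full tensors.

import json
from pathlib import Path

from safetensors import safe_open


def verify_packed_checkpoint(save_dir: str) -> None:
    """Sanity-check a checkpoint rewritten by pack_3d_experts (illustrative helper)."""
    save_dir = Path(save_dir)
    index = json.loads((save_dir / "model.safetensors.index.json").read_text())
    weight_map = index["weight_map"]

    # 1. No per-expert tensors should survive the repacking.
    leftovers = [name for name in weight_map if ".experts." in name]
    assert not leftovers, f"unpacked expert tensors remain: {leftovers[:5]}"

    # 2. Every packed expert tensor (weight and weight_scale alike) should now
    #    be 3D, i.e. stacked as [num_experts, ...].
    for name, file_name in weight_map.items():
        if ".block_sparse_moe." in name and (
            ".input_linear." in name or ".output_linear." in name
        ):
            # safe_open reads shapes lazily without materializing the tensor
            with safe_open(str(save_dir / file_name), framework="pt") as f:
                shape = f.get_slice(name).get_shape()
            assert len(shape) == 3, f"{name} was not stacked: {shape}"

    print("packed checkpoint looks consistent")


# Example usage, pointing at the directory that pack_3d_experts rewrote
# (the path below is a placeholder):
# verify_packed_checkpoint("./granite4-fp8-block")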