30 changes: 30 additions & 0 deletions lm_engine/hf_models/__init__.py
@@ -2,6 +2,8 @@
# Copyright (c) 2025, Mayank Mishra
# **************************************************

import torch

from .config import CommonConfig
from .loss import get_autoregressive_language_modeling_loss, is_aux_loss_zero
from .mixins import CausalLMOutputWithPast, PipelineParallelInput, PipelineParallelOutput
@@ -39,3 +41,31 @@


register_model_classes()


def _patch_granitemoehybrid_weight_init() -> None:
    try:
        from transformers.models.granitemoehybrid.modeling_granitemoehybrid import (
            GraniteMoeHybridMambaLayer,
            GraniteMoeHybridPreTrainedModel,
        )
    except Exception:
        return

    if getattr(GraniteMoeHybridPreTrainedModel._init_weights, "_lm_engine_patched", False):
        return

    def _init_weights(self, module):
        super(GraniteMoeHybridPreTrainedModel, self)._init_weights(module)
        if isinstance(module, GraniteMoeHybridMambaLayer):
            module.dt_bias.data.fill_(1.0)
            module.A_log.data.copy_(
                torch.log(torch.arange(1, module.num_heads + 1, dtype=module.A_log.dtype, device=module.A_log.device))
            )
            module.D.data.fill_(1.0)

    _init_weights._lm_engine_patched = True
    GraniteMoeHybridPreTrainedModel._init_weights = _init_weights
Comment on lines +58 to +68
Contributor


high

Monkey-patching _init_weights by calling super()._init_weights inside the replacement function will bypass any initialization logic defined in the original GraniteMoeHybridPreTrainedModel._init_weights method. In the transformers library, model-specific PreTrainedModel subclasses typically implement _init_weights to handle their specific layers. By replacing the method and calling super(), you are skipping the original class's logic and calling the parent's (likely PreTrainedModel) method instead.

To correctly extend the existing initialization, you should save the original method and call it within your patch.

Suggested change
    def _init_weights(self, module):
        super(GraniteMoeHybridPreTrainedModel, self)._init_weights(module)
        if isinstance(module, GraniteMoeHybridMambaLayer):
            module.dt_bias.data.fill_(1.0)
            module.A_log.data.copy_(
                torch.log(torch.arange(1, module.num_heads + 1, dtype=module.A_log.dtype, device=module.A_log.device))
            )
            module.D.data.fill_(1.0)
    _init_weights._lm_engine_patched = True
    GraniteMoeHybridPreTrainedModel._init_weights = _init_weights
    original_init_weights = GraniteMoeHybridPreTrainedModel._init_weights
    def _init_weights(self, module):
        original_init_weights(self, module)
        if isinstance(module, GraniteMoeHybridMambaLayer):
            module.dt_bias.data.fill_(1.0)
            module.A_log.data.copy_(
                torch.log(torch.arange(1, module.num_heads + 1, dtype=module.A_log.dtype, device=module.A_log.device))
            )
            module.D.data.fill_(1.0)
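For context, a minimal standalone sketch (hypothetical Parent/Child classes, not the transformers ones) of why the super() call inside the patched function skips the subclass logic, while calling a saved reference to the original method preserves it:

class Parent:
    def _init_weights(self, module):
        print("parent init")

class Child(Parent):
    def _init_weights(self, module):
        print("child-specific init")  # the logic the super()-based patch would lose

original_init_weights = Child._init_weights  # saved before patching

def patched(self, module):
    super(Child, self)._init_weights(module)  # resolves to Parent, skipping Child's original method
    print("extra init")

def patched_correctly(self, module):
    original_init_weights(self, module)  # runs the original Child logic first
    print("extra init")

Child._init_weights = patched
Child()._init_weights(None)  # prints: parent init / extra init

Child._init_weights = patched_correctly
Child()._init_weights(None)  # prints: child-specific init / extra init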



_patch_granitemoehybrid_weight_init()
19 changes: 2 additions & 17 deletions lm_engine/hf_models/config/__init__.py
@@ -11,15 +11,7 @@

from ...utils import BaseArgs, divide_if_divisible
from .mlp import _MLPArgs, _MoEArgs
from .sequence_mixer import (
    _CausalConvolution,
    _GatedDeltaNetArgs,
    _GRUArgs,
    _M2RNNArgs,
    _Mamba2Args,
    _RNNArgs,
    _SoftmaxAttentionArgs,
)
from .sequence_mixer import _GatedDeltaNetArgs, _GRUArgs, _M2RNNArgs, _Mamba2Args, _RNNArgs, _SoftmaxAttentionArgs


def _hold_base_args(key: str) -> Callable:
@@ -37,7 +29,6 @@ def _run(self, *args, **kwargs):


_SEQUENCE_MIXER_CONFIG_CLASSES = {
"causal_convolution": _CausalConvolution,
"gru": _GRUArgs,
"m2rnn": _M2RNNArgs,
"mamba2": _Mamba2Args,
@@ -183,13 +174,7 @@ def _set_sequence_mixer_blocks(self) -> None:
        self.sequence_mixer_blocks = [{} for _ in range(self.num_layers)]

        sequence_mixer_blocks: list[
            _CausalConvolution
            | _GRUArgs
            | _Mamba2Args
            | _RNNArgs
            | _M2RNNArgs
            | _SoftmaxAttentionArgs
            | _GatedDeltaNetArgs
            _GRUArgs | _Mamba2Args | _RNNArgs | _M2RNNArgs | _SoftmaxAttentionArgs | _GatedDeltaNetArgs
        ] = []
        for i in range(self.num_layers):
            sequence_mixer_block = deepcopy(self.sequence_mixer_blocks[i])
13 changes: 0 additions & 13 deletions lm_engine/hf_models/config/sequence_mixer.py
@@ -134,19 +134,6 @@ def model_post_init(self, __context: Any) -> None:
assert self.sequence_mixer_type == "m2rnn"


class _CausalConvolution(BaseArgs):
    sequence_mixer_type: str = "causal_convolution"
    activation_function: str = "silu"
    in_channels: int
    out_channels: int
    kernel_size: int
    num_groups: int
    add_bias: bool = False

    def model_post_init(self, __context: Any) -> None:
        assert self.sequence_mixer_type == "causal_convolution"

class _GatedDeltaNetArgs(_SoftPlusDecayArgs):
    sequence_mixer_type: str = "gated_deltanet"
    k_head_dim: int
1 change: 0 additions & 1 deletion lm_engine/hf_models/modeling_utils/__init__.py
@@ -3,7 +3,6 @@
# **************************************************

from .activations import get_activation_function, is_glu
from .convolution import ParameterizedConv1d
from .dropout import Dropout
from .dtensor_module import DTensorModule
from .embedding import ParameterizedEmbedding, get_tensor_parallel_vocab_info
57 changes: 0 additions & 57 deletions lm_engine/hf_models/modeling_utils/convolution.py

This file was deleted.

155 changes: 155 additions & 0 deletions lm_engine/hf_models/modeling_utils/depthwise_causal_convolution.py
@@ -0,0 +1,155 @@
# **************************************************
# Copyright (c) 2025, Mayank Mishra
# **************************************************

from __future__ import annotations

import torch
import torch.nn as nn
import torch.nn.functional as F

from ...enums import Kernel
from ...kernels import is_kernel_allowed
from ...utils import divide_if_divisible, is_causal_conv1d_available
from ..parameter import (
    mark_parameter_as_initialized,
    mark_parameter_as_mup_learning_rate,
Comment on lines +13 to +16
Contributor


medium

The following imports are unused in this file: divide_if_divisible and mark_parameter_as_mup_learning_rate.

from ...utils import is_causal_conv1d_available
from ..parameter import (
    mark_parameter_as_initialized,
    mark_parameter_as_no_weight_decay,
)

    mark_parameter_as_no_weight_decay,
)
from .activations import get_activation_function


if is_causal_conv1d_available():
    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update


def _apply_mask_to_padding_states(hidden_states: torch.Tensor, attention_mask: torch.Tensor | None) -> torch.Tensor:
"""
Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
"""
if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
Contributor


medium

The check attention_mask.shape[0] > 1 prevents the mask from being applied when the batch size is 1. Padding can still exist in a single-sequence batch if the sequence length is less than the maximum length. The mask should be applied whenever it is provided to ensure padding tokens are correctly zeroed out.

Suggested change
    if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
    if attention_mask is not None and attention_mask.shape[1] > 1:
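A quick illustration (made-up shapes) of the case the stricter check skips: a batch containing a single right-padded sequence still carries zeros in its attention mask, so the multiply is needed to zero out the padded positions.

import torch

hidden_states = torch.ones(1, 4, 3)  # batch 1, sequence length 4, hidden size 3
attention_mask = torch.tensor([[1.0, 1.0, 0.0, 0.0]])  # only the first 2 tokens are real
masked = hidden_states * attention_mask[:, :, None]  # last two positions become zero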

        dtype = hidden_states.dtype
        hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)

    return hidden_states


class DepthwiseCausalConvolution(nn.Conv1d):
    def __init__(
        self,
        hidden_size: int,
        kernel_size: int,
        activation_function: str,
        add_bias: bool,
        std: float | None,
        use_padding_free_transformer: bool,
    ) -> DepthwiseCausalConvolution:
        if use_padding_free_transformer:
            raise NotImplementedError()

        self.std = std

        super().__init__(
            in_channels=hidden_size,
            out_channels=hidden_size,
            kernel_size=kernel_size,
            padding=kernel_size - 1,
            groups=hidden_size,
            bias=add_bias,
        )

        self.activation_string = activation_function
        self.activation_function = get_activation_function(self.activation_string)
        self.use_activation_inside_kernel = self.activation_string in [None, "silu", "swish"]
        self.kernel_size = kernel_size

        mark_parameter_as_no_weight_decay(self.bias)
Contributor


medium

If add_bias is False, self.bias will be None. Calling mark_parameter_as_no_weight_decay on None might lead to an error depending on its implementation. It is safer to check if the bias exists first.

Suggested change
        mark_parameter_as_no_weight_decay(self.bias)
        if self.bias is not None:
            mark_parameter_as_no_weight_decay(self.bias)
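For reference, nn.Conv1d registers its bias as None when constructed with bias=False, which is why the guard is needed; a small sketch:

import torch.nn as nn

conv = nn.Conv1d(in_channels=4, out_channels=4, kernel_size=3, groups=4, bias=False)
print(conv.bias)  # None -- any helper that touches conv.bias has to handle this case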


        self.reset_parameters()

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_state: torch.Tensor | None,
        attention_mask: torch.Tensor | None,
        output_state: bool,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        S = hidden_states.size(1)
        hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)

        if is_kernel_allowed(Kernel.causal_conv1d):
            if input_state is None:
                hidden_states = hidden_states.transpose(-1, -2)

                if output_state:
                    # F.pad trims the hidden_states if sequence_length > kernel_size
                    input_state = F.pad(hidden_states, (self.kernel_size - S, 0))

                hidden_states = causal_conv1d_fn(
                    x=hidden_states,
                    weight=self.weight.squeeze(1),
                    bias=self.bias,
                    activation=self.activation_string if self.use_activation_inside_kernel else None,
                )

                hidden_states = hidden_states.transpose(-1, -2)
            else:
                assert S == 1

                input_state_buffer = input_state.clone()

                hidden_states = causal_conv1d_update(
                    x=hidden_states,
                    conv_state=input_state_buffer,
                    weight=self.weight.squeeze(1),
                    bias=self.bias,
                    activation=self.activation_string if self.use_activation_inside_kernel else None,
                )

                input_state = input_state_buffer if output_state else None

            if not self.use_activation_inside_kernel:
                hidden_states = self.activation_function(hidden_states)
Comment on lines +80 to +112
Contributor


medium

The output of the convolution is not masked in the is_kernel_allowed path. If a bias is present, the output at padding positions will be non-zero (the bias value after activation). This creates an inconsistency with the else path (line 141) where the output is explicitly masked. You should apply _apply_mask_to_padding_states to the output in both paths.
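One possible shape of the fix (a sketch based on the reviewer's suggestion, not code from this PR): at the end of the kernel path, apply the same masking helper the non-kernel path uses so padding positions end up zero in both paths.

            if not self.use_activation_inside_kernel:
                hidden_states = self.activation_function(hidden_states)

            # mirror the non-kernel path: zero out padding positions after the convolution/activation
            hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)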

        else:
            if input_state is None:
                hidden_states = hidden_states.transpose(-1, -2)

                if output_state:
                    # F.pad trims the hidden_states if sequence_length > kernel_size
                    input_state = F.pad(hidden_states, (self.kernel_size - S, 0))

                hidden_states = super().forward(hidden_states)

                # removes padding on the right side of the sequence
                hidden_states = hidden_states[..., : 1 - self.kernel_size]
Contributor


medium

The slicing logic [: 1 - self.kernel_size] will result in an empty tensor if kernel_size is 1. While these models typically use larger kernels, the implementation should be robust to kernel_size=1.

Suggested change
                hidden_states = hidden_states[..., : 1 - self.kernel_size]
                # removes padding on the right side of the sequence
                if self.kernel_size > 1:
                    hidden_states = hidden_states[..., : 1 - self.kernel_size]
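A two-line check of the edge case (plain PyTorch, independent of this module): with kernel_size == 1 the slice's upper bound is 0, so the whole sequence dimension disappears.

import torch

x = torch.arange(5)
print(x[: 1 - 1].shape)  # torch.Size([0]) -- kernel_size == 1 empties the tensor
print(x[: 1 - 3].shape)  # torch.Size([3]) -- kernel_size == 3 just trims the 2 padded positions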

                hidden_states = hidden_states.transpose(-1, -2)
            else:
                assert S == 1

                input_state = input_state.roll(shifts=-1, dims=-1)
                input_state[..., -1] = hidden_states[:, 0]

                hidden_states = (input_state * self.weight.squeeze(1)).sum(dim=-1)
                hidden_states = hidden_states[:, None, :]
                if self.bias is not None:
                    hidden_states = hidden_states + self.bias

                if not output_state:
                    input_state = None

            hidden_states = self.activation_function(hidden_states)
            hidden_states = _apply_mask_to_padding_states(hidden_states, attention_mask)

        return hidden_states, input_state

    @torch.no_grad()
    def reset_parameters(self) -> None:
        if self.std is None:
            super().reset_parameters()
        else:
            nn.init.normal_(self.weight, mean=0, std=self.std)
            if hasattr(self, "bias") and self.bias is not None:
                self.bias.zero_()

        mark_parameter_as_initialized(self.weight)
        mark_parameter_as_initialized(self.bias)
Contributor


medium

Similar to the initialization, mark_parameter_as_initialized should only be called on self.bias if it is not None.

Suggested change
        mark_parameter_as_initialized(self.bias)
        mark_parameter_as_initialized(self.weight)
        if self.bias is not None:
            mark_parameter_as_initialized(self.bias)
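As a quick sanity check of the padding-and-trim scheme the new module relies on (standalone sketch, not this module's API): a depthwise nn.Conv1d with padding=kernel_size - 1 produces kernel_size - 1 extra positions on the right, and trimming them back yields a causal convolution whose output length matches the input.

import torch
import torch.nn as nn

kernel_size, hidden = 4, 8
conv = nn.Conv1d(hidden, hidden, kernel_size, padding=kernel_size - 1, groups=hidden)

x = torch.randn(2, hidden, 16)  # (batch, channels, sequence)
y = conv(x)  # length 16 + kernel_size - 1 = 19
y = y[..., : 1 - kernel_size]  # trim the right side back to length 16
print(y.shape)  # torch.Size([2, 8, 16])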

@@ -9,7 +9,6 @@
    interleave_query_key_value_tensor_for_attention,
    split_query_key_value_tensor_for_attention,
)
from .causal_convolution import CausalConvolution
from .gated_deltanet import GatedDeltaNet
from .gru import GRU
from .m2rnn import M2RNN
@@ -18,7 +17,7 @@
from .utils import flash_attention


SEQUENCE_MIXER_TYPE = Attention | CausalConvolution | GRU | Mamba2 | RNN | GatedDeltaNet
SEQUENCE_MIXER_TYPE = Attention | GRU | Mamba2 | RNN | GatedDeltaNet


def get_sequence_mixer(
@@ -29,25 +28,7 @@ def get_sequence_mixer(

    is_tp_enabled = ProcessGroupManager.is_tensor_parallel_enabled()

    if sequence_mixer_type == "causal_convolution":
        assert not is_tp_enabled
        return CausalConvolution(
            hidden_size=config.hidden_size,
            in_channels=block.in_channels,
            out_channels=block.out_channels,
            kernel_size=block.kernel_size,
            num_groups=block.num_groups,
            activation_function=block.activation_function,
            add_bias=block.add_bias,
            initializer_range=config.initializer_range,
            m_width=config.m_width,
            init_method=config.init_method,
            num_layers=config.num_layers,
            layer_idx=layer_idx,
            use_depth_scaled_init=config.use_depth_scaled_init,
            use_padding_free_transformer=use_padding_free_transformer,
        )
    elif sequence_mixer_type == "gru":
    if sequence_mixer_type == "gru":
        assert not is_tp_enabled
        return GRU(
            input_size=config.hidden_size,