From ac39fd2f141764fa0ce5951e5318f16fa43882a8 Mon Sep 17 00:00:00 2001
From: octo-patch
Date: Wed, 22 Apr 2026 20:26:31 +0800
Subject: [PATCH] feat: add MiniMax-M2.7 architecture config and tokenizer support

- Add minimax_m2_config() to open_mythos/variants.py: MythosConfig expressing
  MiniMax-M2.7 structural dimensions (dim=6144, n_heads=48, n_kv_heads=8,
  head_dim=128 via MLA rope+nope split, 32 routed + 2 shared experts with
  top-4 routing, 40K context, rope_theta=10M)
- Add MINIMAX_M2_MODEL_ID constant to open_mythos/tokenizer.py
- Export both symbols from open_mythos/__init__.py
- Add tests/test_minimax_m2.py: 39 unit tests covering config dimensions,
  MoE structure, forward-pass correctness, MLA cache compression, LTI
  spectral radius stability, and high rope_theta numerical safety
---
 open_mythos/__init__.py  |   5 +-
 open_mythos/tokenizer.py |   5 +
 open_mythos/variants.py  |  62 ++++++++
 tests/test_minimax_m2.py | 314 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 385 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_minimax_m2.py

diff --git a/open_mythos/__init__.py b/open_mythos/__init__.py
index 73c2c04..edc9f82 100644
--- a/open_mythos/__init__.py
+++ b/open_mythos/__init__.py
@@ -15,8 +15,9 @@
     loop_index_embedding,
     precompute_rope_freqs,
 )
-from open_mythos.tokenizer import MythosTokenizer
+from open_mythos.tokenizer import MINIMAX_M2_MODEL_ID, MythosTokenizer
 from open_mythos.variants import (
+    minimax_m2_config,
     mythos_1b,
     mythos_1t,
     mythos_3b,
@@ -49,6 +50,8 @@
     "mythos_100b",
     "mythos_500b",
     "mythos_1t",
+    "minimax_m2_config",
+    "MINIMAX_M2_MODEL_ID",
     "load_tokenizer",
     "get_vocab_size",
     "MythosTokenizer",
diff --git a/open_mythos/tokenizer.py b/open_mythos/tokenizer.py
index fadb3a5..bb7022a 100644
--- a/open_mythos/tokenizer.py
+++ b/open_mythos/tokenizer.py
@@ -2,6 +2,11 @@
 
 DEFAULT_MODEL_ID = "openai/gpt-oss-20b"
 
+# HuggingFace model ID for the MiniMax-M2.7 tokenizer.
+# Its vocab_size is 200064; pass this ID to MythosTokenizer when working
+# with the minimax_m2_config() variant.
+MINIMAX_M2_MODEL_ID = "MiniMaxAI/MiniMax-M2.7"
+
 
 class MythosTokenizer:
     """
diff --git a/open_mythos/variants.py b/open_mythos/variants.py
index 83f7dd4..a020f3f 100644
--- a/open_mythos/variants.py
+++ b/open_mythos/variants.py
@@ -4,6 +4,10 @@
 # total ≈ embed + prelude/coda dense blocks + recurrent MLA + MoE
 # MoE = 3 * dim * expert_dim * (n_experts + n_shared * n_experts_per_tok)
 # expert_dim is solved from the residual budget after all other terms.
+#
+# MiniMax-M2.7 reference config (see minimax_m2_config below):
+# Architecture source: MiniMax-01 technical report (MiniMaxAI/MiniMax-M2.7 on HuggingFace)
+# 230B total params, ~7B active per token, 48Q/8KV heads, head_dim=128, 32 routed experts
 
 
 def mythos_1b() -> MythosConfig:
@@ -196,3 +200,61 @@
         lora_rank=256,
         max_output_tokens=131072,
     )
+
+
+def minimax_m2_config() -> MythosConfig:
+    """OpenMythos configuration matching the MiniMax-M2.7 architecture.
+
+    MiniMax-M2.7 (MiniMaxAI/MiniMax-M2.7 on HuggingFace) is a 230B-parameter
+    sparse MoE language model. Key structural dimensions expressed in
+    OpenMythos terms:
+
+    * ``dim = 6144`` — matches MiniMax-M2.7's hidden size
+    * ``n_heads = 48`` / ``n_kv_heads = 8`` — 6× GQA ratio (48 query heads,
+      8 KV heads)
+    * ``qk_rope_head_dim = 64``, ``qk_nope_head_dim = 64``, ``v_head_dim = 128``
+      — the per-head dimensions satisfy MiniMax-M2.7's head_dim = 128 for both
+      the Q/K attention key size (rope + nope = 128) and the value projection (128)
+    * ``n_experts = 32``, ``n_shared_experts = 2``, ``n_experts_per_tok = 4``
+      — mirrors MiniMax-M2.7's fine-grained MoE design (32 routed + 2 always-on)
+    * ``max_seq_len = 40960`` — MiniMax-M2.7's 40K-token native context window
+    * ``rope_theta = 10_000_000`` — high RoPE base for long-context stability
+
+    Use this config to instantiate an OpenMythos model whose structural
+    dimensions reflect the MiniMax-M2.7 design point. Weights are randomly
+    initialized; load ``MiniMaxAI/MiniMax-M2.7`` from HuggingFace for
+    production inference.
+
+    Example::
+
+        from open_mythos.variants import minimax_m2_config
+        from open_mythos.main import OpenMythos
+        import torch
+
+        cfg = minimax_m2_config()
+        model = OpenMythos(cfg)
+        ids = torch.randint(0, cfg.vocab_size, (1, 16))
+        logits = model(ids, n_loops=4)
+        print(logits.shape)  # (1, 16, 200064)
+    """
+    return MythosConfig(
+        vocab_size=200064,
+        dim=6144,
+        n_heads=48,
+        n_kv_heads=8,
+        max_seq_len=40960,
+        max_loop_iters=16,
+        prelude_layers=4,
+        coda_layers=4,
+        attn_type="mla",
+        kv_lora_rank=512,
+        q_lora_rank=1536,
+        qk_rope_head_dim=64,
+        qk_nope_head_dim=64,
+        v_head_dim=128,
+        n_experts=32,
+        n_shared_experts=2,
+        n_experts_per_tok=4,
+        expert_dim=8192,
+        act_threshold=0.99,
+        rope_theta=10_000_000.0,
+        lora_rank=32,
+        max_output_tokens=4096,
+    )
diff --git a/tests/test_minimax_m2.py b/tests/test_minimax_m2.py
new file mode 100644
index 0000000..4198c29
--- /dev/null
+++ b/tests/test_minimax_m2.py
@@ -0,0 +1,314 @@
+"""
+Unit tests for the MiniMax-M2.7 OpenMythos configuration.
+
+All tests run on CPU using synthetic tensors — no model weights are required.
+A scaled-down config with the same structural ratios (_small_m2_cfg) keeps
+the forward-pass tests fast; a separate class validates the full-scale
+minimax_m2_config() dimensions without actually allocating the model.
+"""
+
+from __future__ import annotations
+
+import pytest
+import torch
+
+from open_mythos.main import (
+    MythosConfig,
+    OpenMythos,
+    precompute_rope_freqs,
+)
+from open_mythos.tokenizer import MINIMAX_M2_MODEL_ID
+from open_mythos.variants import minimax_m2_config
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+B, T = 2, 8  # batch size and sequence length used in forward-pass tests
+
+
+def _small_m2_cfg(**overrides) -> MythosConfig:
+    """Return a tiny MiniMax-M2.7-shaped config suitable for CPU tests.
+
+    The structural ratios are preserved:
+    - GQA ratio 6:1 (n_heads / n_kv_heads = 48 / 8)
+    - MLA with rope+nope split totalling head_dim = 128
+    - 32 routed experts + 2 shared, top-4 activated
+    dim, the MLA ranks and head dims, expert_dim, vocab_size, and the
+    sequence and loop limits are shrunk so the test suite can allocate
+    and run the model on a laptop CPU.
+    """
+    base = dict(
+        vocab_size=512,
+        dim=192,  # divisible by n_heads=48 → head_dim=4 per head
+        n_heads=48,
+        n_kv_heads=8,
+        max_seq_len=64,
+        max_loop_iters=4,
+        prelude_layers=1,
+        coda_layers=1,
+        attn_type="mla",
+        # Scale MLA dims proportionally to dim=192 while preserving the
+        # rope/nope/v split ratios from the full config (64/64/128 scaled ÷32).
+        kv_lora_rank=16,
+        q_lora_rank=48,
+        qk_rope_head_dim=2,
+        qk_nope_head_dim=2,
+        v_head_dim=4,
+        n_experts=32,
+        n_shared_experts=2,
+        n_experts_per_tok=4,
+        expert_dim=64,
+        act_threshold=0.99,
+        rope_theta=10_000_000.0,
+        lora_rank=4,
+        max_output_tokens=64,
+    )
+    base.update(overrides)
+    return MythosConfig(**base)
+
+
+# ---------------------------------------------------------------------------
+# 1. Config correctness — full-scale MiniMax-M2.7 dimensions
+# ---------------------------------------------------------------------------
+
+
+class TestMinimaxM2ConfigDimensions:
+    """Validate the full-scale minimax_m2_config() without allocating the model."""
+
+    def setup_method(self):
+        self.cfg = minimax_m2_config()
+
+    def test_vocab_size(self):
+        assert self.cfg.vocab_size == 200_064
+
+    def test_hidden_dim(self):
+        assert self.cfg.dim == 6144
+
+    def test_query_heads(self):
+        assert self.cfg.n_heads == 48
+
+    def test_kv_heads(self):
+        assert self.cfg.n_kv_heads == 8
+
+    def test_gqa_ratio_6x(self):
+        """MiniMax-M2.7 uses a 6:1 GQA ratio."""
+        assert self.cfg.n_heads // self.cfg.n_kv_heads == 6
+
+    def test_head_dim_128(self):
+        """Q/K attention head dim = rope + nope = 128 to match MiniMax-M2.7."""
+        head_dim = self.cfg.qk_rope_head_dim + self.cfg.qk_nope_head_dim
+        assert head_dim == 128
+
+    def test_value_head_dim_128(self):
+        assert self.cfg.v_head_dim == 128
+
+    def test_mla_output_matches_dim(self):
+        """n_heads × v_head_dim must equal dim so the output projection is square."""
+        assert self.cfg.n_heads * self.cfg.v_head_dim == self.cfg.dim
+
+    def test_num_routed_experts(self):
+        assert self.cfg.n_experts == 32
+
+    def test_num_shared_experts(self):
+        assert self.cfg.n_shared_experts == 2
+
+    def test_experts_per_token(self):
+        assert self.cfg.n_experts_per_tok == 4
+
+    def test_context_length(self):
+        """MiniMax-M2.7 supports a 40,960-token native context window."""
+        assert self.cfg.max_seq_len == 40_960
+
+    def test_rope_theta(self):
+        """High RoPE base for long-context stability."""
+        assert self.cfg.rope_theta == 10_000_000.0
+
+    def test_attention_type_is_mla(self):
+        assert self.cfg.attn_type == "mla"
+
+    def test_prelude_and_coda_layers(self):
+        assert self.cfg.prelude_layers == 4
+        assert self.cfg.coda_layers == 4
+
+
+# ---------------------------------------------------------------------------
+# 2. MINIMAX_M2_MODEL_ID constant
+# ---------------------------------------------------------------------------
+
+
+class TestMinimaxM2ModelID:
+    def test_constant_is_string(self):
+        assert isinstance(MINIMAX_M2_MODEL_ID, str)
+
+    def test_constant_points_to_minimax_org(self):
+        assert MINIMAX_M2_MODEL_ID.startswith("MiniMaxAI/")
+
+    def test_constant_contains_m2(self):
+        assert "M2" in MINIMAX_M2_MODEL_ID
+
+    def test_constant_importable_from_package(self):
+        """Ensure top-level package re-exports the constant."""
+        import open_mythos as om
+
+        assert om.MINIMAX_M2_MODEL_ID == MINIMAX_M2_MODEL_ID
+
+
+# ---------------------------------------------------------------------------
+# 3. minimax_m2_config importable from top-level package
+# ---------------------------------------------------------------------------
+
+
+class TestMinimaxM2ConfigExport:
+    def test_importable_from_package(self):
+        import open_mythos as om
+
+        cfg = om.minimax_m2_config()
+        assert isinstance(cfg, MythosConfig)
+
+    def test_returns_new_instance_each_call(self):
+        cfg1 = minimax_m2_config()
+        cfg2 = minimax_m2_config()
+        assert cfg1 is not cfg2
+        assert cfg1.dim == cfg2.dim
+
+
+# ---------------------------------------------------------------------------
+# 4. Small-scale forward-pass tests (CPU, synthetic tensors)
+# ---------------------------------------------------------------------------
+
+
+class TestMinimaxM2ForwardPass:
+    """Instantiate a tiny MiniMax-M2.7-shaped model and verify forward passes."""
+
+    def setup_method(self):
+        self.cfg = _small_m2_cfg()
+        self.model = OpenMythos(self.cfg)
+        self.ids = torch.randint(0, self.cfg.vocab_size, (B, T))
+
+    def test_forward_output_shape(self):
+        logits = self.model(self.ids)
+        assert logits.shape == (B, T, self.cfg.vocab_size)
+
+    def test_forward_no_nan(self):
+        logits = self.model(self.ids)
+        assert not torch.isnan(logits).any()
+
+    def test_forward_no_inf(self):
+        logits = self.model(self.ids)
+        assert torch.isfinite(logits).all()
+
+    def test_generate_shape(self):
+        out = self.model.generate(self.ids, max_new_tokens=4, n_loops=2)
+        assert out.shape == (B, T + 4)
+
+    def test_lti_spectral_radius_stable(self):
+        """Recurrent injection matrix must stay contractive (spectral radius < 1)."""
+        A = self.model.recurrent.injection.get_A()
+        assert A.max().item() < 1.0
+
+    def test_mla_cache_is_compressed(self):
+        cache = {}
+        with torch.no_grad():
+            self.model(self.ids, kv_cache=cache)
+        mla_entries = {k: v for k, v in cache.items() if "c_kv" in v}
+        assert len(mla_entries) > 0
+        for entry in mla_entries.values():
+            assert entry["c_kv"].shape[-1] == self.cfg.kv_lora_rank
+
+    def test_deeper_loops_change_output(self):
+        ids = self.ids
+        with torch.no_grad():
+            out1 = self.model(ids, n_loops=1)
+            out4 = self.model(ids, n_loops=4)
+        assert not torch.allclose(out1, out4)
+
+    def test_weight_tying(self):
+        """Embedding and output projection must share weights."""
+        assert self.model.head.weight is self.model.embed.weight
+
+
+# ---------------------------------------------------------------------------
+# 5. Attention head geometry — GQA 6:1 ratio preserved at small scale
+# ---------------------------------------------------------------------------
+
+
+class TestMinimaxM2AttentionGeometry:
+    def setup_method(self):
+        self.cfg = _small_m2_cfg()
+
+    def test_gqa_ratio_preserved(self):
+        assert self.cfg.n_heads // self.cfg.n_kv_heads == 6
+
+    def test_mla_output_dim_consistent(self):
+        """n_heads × v_head_dim must equal dim even at small scale."""
+        assert self.cfg.n_heads * self.cfg.v_head_dim == self.cfg.dim
+
+    def test_rope_freqs_shape_for_qk_rope_dim(self):
+        freqs = precompute_rope_freqs(
+            dim=self.cfg.qk_rope_head_dim, max_len=self.cfg.max_seq_len
+        )
+        assert freqs.shape == (self.cfg.max_seq_len, self.cfg.qk_rope_head_dim // 2)
+
+
+# ---------------------------------------------------------------------------
+# 6. MoE: 32 routed + 2 shared, top-4 activation
+# ---------------------------------------------------------------------------
+
+
+class TestMinimaxM2MoE:
+    def setup_method(self):
+        self.cfg = _small_m2_cfg()
+        self.model = OpenMythos(self.cfg)
+        # MoE FFN lives inside the recurrent block's inner TransformerBlock
+        self.moe = self.model.recurrent.block.ffn
+
+    def test_n_routed_experts(self):
+        assert self.moe.n_experts == 32
+
+    def test_n_shared_experts(self):
+        assert self.moe.n_shared == 2
+
+    def test_topk_experts(self):
+        assert self.moe.topk == 4
+
+    def test_moe_output_shape(self):
+        x = torch.randn(B, T, self.cfg.dim)
+        out = self.moe(x)
+        assert out.shape == (B, T, self.cfg.dim)
+
+    def test_shared_experts_always_fire(self):
+        """Zero out all routed experts; shared experts keep output nonzero."""
+        for exp in self.moe.routed_experts:
+            for p in exp.parameters():
+                p.data.zero_()
+        x = torch.randn(B, T, self.cfg.dim)
+        out = self.moe(x)
+        assert out.abs().sum() > 0
+
+
+# ---------------------------------------------------------------------------
+# 7. High rope_theta does not introduce NaN / Inf
+# ---------------------------------------------------------------------------
+
+
+class TestMinimaxM2RopeTheta:
+    def test_high_theta_rope_freqs_finite(self):
+        cfg = _small_m2_cfg()
+        freqs = precompute_rope_freqs(
+            dim=cfg.qk_rope_head_dim,
+            max_len=cfg.max_seq_len,
+            theta=cfg.rope_theta,
+        )
+        assert torch.isfinite(freqs.real).all()
+        assert torch.isfinite(freqs.imag).all()
+
+    def test_high_theta_model_forward_finite(self):
+        cfg = _small_m2_cfg()
+        model = OpenMythos(cfg)
+        ids = torch.randint(0, cfg.vocab_size, (1, T))
+        with torch.no_grad():
+            out = model(ids)
+        assert torch.isfinite(out).all()
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "--verbose"])
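
A minimal usage sketch pairing the two new exports. The tokenizer.py comment
states that MINIMAX_M2_MODEL_ID is meant to be passed to MythosTokenizer, and
the minimax_m2_config() docstring shows the forward-pass call; the exact
MythosTokenizer constructor signature is an assumption based on that comment.

    import torch

    from open_mythos import MINIMAX_M2_MODEL_ID, MythosTokenizer, minimax_m2_config
    from open_mythos.main import OpenMythos

    cfg = minimax_m2_config()
    model = OpenMythos(cfg)  # structure only; weights are randomly initialized

    # Assumption: MythosTokenizer accepts a HuggingFace model ID, per the
    # comment above MINIMAX_M2_MODEL_ID in tokenizer.py.
    tok = MythosTokenizer(MINIMAX_M2_MODEL_ID)

    ids = torch.randint(0, cfg.vocab_size, (1, 16))
    logits = model(ids, n_loops=4)
    print(logits.shape)  # (1, 16, 200064)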
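
For intuition on what test_mla_cache_is_compressed checks, here is a
back-of-envelope comparison of per-token cache size under minimax_m2_config().
It assumes the MLA cache stores the rank-512 c_kv latent (the quantity the
test asserts on) and possibly a shared rope key; a conventional GQA cache
would instead store full K and V for every KV head.

    # Cached values per token per layer, using minimax_m2_config() numbers.
    c_kv = 512                    # kv_lora_rank: the latent the test asserts on
    k_rope = 64                   # qk_rope_head_dim: shared rope key, if cached
    gqa_equivalent = 2 * 8 * 128  # full K + V for 8 KV heads of head_dim 128
    print(gqa_equivalent / c_kv)             # 4.0x smaller, latent only
    print(gqa_equivalent / (c_kv + k_rope))  # ~3.6x with the rope key cached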