5 changes: 4 additions & 1 deletion open_mythos/__init__.py
@@ -15,8 +15,9 @@
loop_index_embedding,
precompute_rope_freqs,
)
from open_mythos.tokenizer import MythosTokenizer
from open_mythos.tokenizer import MINIMAX_M2_MODEL_ID, MythosTokenizer
from open_mythos.variants import (
minimax_m2_config,
mythos_1b,
mythos_1t,
mythos_3b,
@@ -49,6 +50,8 @@
"mythos_100b",
"mythos_500b",
"mythos_1t",
"minimax_m2_config",
"MINIMAX_M2_MODEL_ID",
"load_tokenizer",
"get_vocab_size",
"MythosTokenizer",
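With these exports in place, both new symbols are importable from the package top level. A minimal sketch, assuming the package is installed and MythosConfig exposes its fields as attributes:

from open_mythos import MINIMAX_M2_MODEL_ID, minimax_m2_config

cfg = minimax_m2_config()
# The config's vocabulary size should match the MiniMax-M2.7 tokenizer's.
assert cfg.vocab_size == 200064
print(MINIMAX_M2_MODEL_ID)  # "MiniMaxAI/MiniMax-M2.7"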
5 changes: 5 additions & 0 deletions open_mythos/tokenizer.py
@@ -2,6 +2,11 @@

DEFAULT_MODEL_ID = "openai/gpt-oss-20b"

# HuggingFace model ID for the MiniMax-M2.7 tokenizer, whose vocabulary has
# 200064 entries (matching the vocab_size of the minimax_m2_config() variant).
# Pass this ID to MythosTokenizer when working with that variant.
MINIMAX_M2_MODEL_ID = "MiniMaxAI/MiniMax-M2.7"


class MythosTokenizer:
"""
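A usage sketch for the new constant. The MythosTokenizer constructor signature is not shown in this diff, so passing the model ID positionally below is an assumption based on the comment above:

from open_mythos.tokenizer import MINIMAX_M2_MODEL_ID, MythosTokenizer
from open_mythos.variants import minimax_m2_config

# Assumption: MythosTokenizer accepts a HuggingFace model ID, mirroring
# how DEFAULT_MODEL_ID is presumably used elsewhere in the module.
tokenizer = MythosTokenizer(MINIMAX_M2_MODEL_ID)
cfg = minimax_m2_config()
# The tokenizer's 200064-entry vocabulary should line up with cfg.vocab_size.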
62 changes: 62 additions & 0 deletions open_mythos/variants.py
@@ -4,6 +4,10 @@
# total ≈ embed + prelude/coda dense blocks + recurrent MLA + MoE
# MoE = 3 * dim * expert_dim * (n_experts + n_shared * n_experts_per_tok)
# expert_dim is solved from the residual budget after all other terms.
#
# MiniMax-M2.7 reference config (see minimax_m2_config below):
# Architecture source: MiniMax-01 technical report (MiniMaxAI/MiniMax-M2.7 on HuggingFace)
# 230B total params, ~7B active per token, 48Q/8KV heads, head_dim=128, 32 routed experts


def mythos_1b() -> MythosConfig:
@@ -196,3 +200,61 @@ def mythos_1t() -> MythosConfig:
lora_rank=256,
max_output_tokens=131072,
)


def minimax_m2_config() -> MythosConfig:
"""OpenMythos configuration matching the MiniMax-M2.7 architecture.

MiniMax-M2.7 (MiniMaxAI/MiniMax-M2.7 on HuggingFace) is a 230B-parameter sparse
MoE language model. Key structural dimensions expressed in OpenMythos terms:

* ``dim = 6144`` — matches MiniMax-M2.7's hidden size
* ``n_heads = 48`` / ``n_kv_heads = 8`` — 6× GQA ratio (48 query heads, 8 KV heads)
* ``qk_rope_head_dim = 64``, ``qk_nope_head_dim = 64``, ``v_head_dim = 128``
— the per-head dimensions satisfy MiniMax-M2.7's head_dim = 128 for both
the Q/K attention key size (rope + nope = 128) and the value projection (128)
* ``n_experts = 32``, ``n_shared_experts = 2``, ``n_experts_per_tok = 4``
— mirrors MiniMax-M2.7's fine-grained MoE design (32 routed + 2 always-on)
* ``max_seq_len = 40960`` — MiniMax-M2.7's 40K-token native context window
* ``rope_theta = 10_000_000`` — high RoPE base for long-context stability

Use this config to instantiate an OpenMythos model whose structural dimensions
reflect the MiniMax-M2.7 design point. Weights are randomly initialized; load
``MiniMaxAI/MiniMax-M2.7`` from HuggingFace for production inference.

Example::

from open_mythos.variants import minimax_m2_config
from open_mythos.main import OpenMythos
import torch

cfg = minimax_m2_config()
model = OpenMythos(cfg)
ids = torch.randint(0, cfg.vocab_size, (1, 16))
logits = model(ids, n_loops=4)
print(logits.shape) # (1, 16, 200064)
"""
return MythosConfig(
vocab_size=200064,
dim=6144,
n_heads=48,
n_kv_heads=8,
max_seq_len=40960,
max_loop_iters=16,
prelude_layers=4,
coda_layers=4,
attn_type="mla",
kv_lora_rank=512,
q_lora_rank=1536,
qk_rope_head_dim=64,
qk_nope_head_dim=64,
v_head_dim=128,
n_experts=32,
n_shared_experts=2,
n_experts_per_tok=4,
expert_dim=8192,
act_threshold=0.99,
rope_theta=10_000_000.0,
lora_rank=32,
max_output_tokens=4096,
)
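A rough sanity check of the dimensions above, following the parameter-budget comment at the top of this file. This is a sketch under two assumptions: MythosConfig exposes its fields as attributes, and the comment's MoE formula is taken at face value (it states no per-layer factor, so the result is in whatever unit the formula implies):

from open_mythos.variants import minimax_m2_config

cfg = minimax_m2_config()

# MLA head split: the rope and nope halves of the Q/K head add up to the
# 128-dim head size, matching the 128-dim value projection.
assert cfg.qk_rope_head_dim + cfg.qk_nope_head_dim == 128 == cfg.v_head_dim

# GQA ratio: 48 query heads over 8 KV heads.
assert cfg.n_heads // cfg.n_kv_heads == 6

# MoE term per the comment:
# 3 * dim * expert_dim * (n_experts + n_shared * n_experts_per_tok)
moe = 3 * cfg.dim * cfg.expert_dim * (
    cfg.n_experts + cfg.n_shared_experts * cfg.n_experts_per_tok
)
print(f"MoE term: {moe / 1e9:.2f}B")  # ≈ 6.04B with these values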