From 463856cb3fee8c6bba906fa425ba4339d1d811d7 Mon Sep 17 00:00:00 2001
From: anphonic <abilly@gmail.com>
Date: Thu, 23 Apr 2026 22:18:59 -0500
Subject: [PATCH 1/7] Add lightweight OpenMythos runtime metadata

Expose a lightweight config/profile surface for orchestration, load the torch-backed `open_mythos.main` exports lazily on first attribute access, and add tokenizer helpers plus tests.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .github/copilot-instructions.md |  36 ++++++++++++
 open_mythos/__init__.py         |  75 ++++++++++++++++++------
 open_mythos/config.py           | 101 ++++++++++++++++++++++++++++++++
 open_mythos/main.py             |  75 ++----------------------
 open_mythos/tokenizer.py        |  19 +++++-
 open_mythos/variants.py         |   2 +-
 tests/test_main.py              |  34 +++++++++++
 tests/test_tokenizer.py         |  15 ++++-
 8 files changed, 267 insertions(+), 90 deletions(-)
 create mode 100644 .github/copilot-instructions.md
 create mode 100644 open_mythos/config.py

diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
new file mode 100644
index 0000000..e3ec543
--- /dev/null
+++ b/.github/copilot-instructions.md
@@ -0,0 +1,36 @@
+# Copilot Instructions
+
+## Build, test, and lint commands
+
+- Install dependencies for local work with `poetry install --with dev`. The repo also keeps a minimal `requirements.txt` for `pip install -r requirements.txt`.
+- Build the package with `poetry build`.
+- Run the main unit suite with `poetry run pytest tests/test_main.py`.
+- Run a single test with `poetry run pytest tests/test_main.py::TestOpenMythosGQA::test_generate_shape`.
+- Run tokenizer tests with `poetry run pytest tests/test_tokenizer.py`. These load the default Hugging Face tokenizer (`openai/gpt-oss-20b`), so they are heavier than `tests/test_main.py`.
+- Run the RoPE debug script with `poetry run python tests/test_rope_debug.py`.
+- Lint with `poetry run ruff check .`.
+- Check formatting with `poetry run black --check .`.
+- The documented training entrypoints are `poetry run python training/3b_fine_web_edu.py` and `poetry run torchrun --nproc_per_node=$(python -c "import torch; print(torch.cuda.device_count())") training/3b_fine_web_edu.py`.
+- Benchmark utilities are executable scripts under `tests/`, not pytest files: `poetry run python tests/small_benchmark.py` and `poetry run python tests/bench_vs_transformer.py`.
+
+## High-level architecture
+
+- `open_mythos/main.py` is the primary implementation. It defines `MythosConfig`, both attention backends, the MoE FFN, the recurrent block, and the top-level `OpenMythos` model.
+- The model pipeline is fixed: token embedding -> Prelude (`cfg.prelude_layers` dense `TransformerBlock`s run once) -> Recurrent Block (one shared `TransformerBlock` looped `n_loops` times with LoRA, ACT halting, and LTI-stable input injection) -> Coda (`cfg.coda_layers` dense `TransformerBlock`s run once) -> `RMSNorm` -> tied LM head.
+- The key architectural invariant is that the encoded input `e` is frozen after the Prelude and injected on every recurrent iteration. The recurrent update path is `loop_index_embedding -> TransformerBlock(use_moe=True) -> LoRAAdapter -> LTIInjection -> ACTHalting`.
+- `cfg.attn_type` switches the whole attention/cache path. `GQAttention` stores full K/V by layer, while `MLAttention` stores compressed `c_kv` plus `k_rope` and reconstructs K/V on demand. `OpenMythos.forward()` also switches RoPE buffers based on `attn_type`.
+- `open_mythos/variants.py` contains the named model scales (`mythos_1b` through `mythos_1t`) as `MythosConfig` factories. `open_mythos/tokenizer.py` is a small wrapper around `transformers.AutoTokenizer` with `openai/gpt-oss-20b` as the default tokenizer.
+- `training/3b_fine_web_edu.py` is a standalone FSDP training script for the 3B variant. It streams FineWeb-Edu shards, uses `MythosTokenizer`, and keeps checkpointing/training logic out of the library module.
+- `docs/open_mythos.md` is the detailed API and architecture reference. `docs/datasets.md` holds the training dataset recommendations and token-budget guidance referenced by the training script/README.
+- `open_mythos/moda.py` is a separate experimental MoDA + MoE implementation, not part of the main `OpenMythos` export surface.
+
+## Key conventions
+
+- For tests, smoke runs, and examples that should execute quickly, use the tiny helper configs from `tests/test_main.py` (`gqa_cfg()` / `mla_cfg()`) or similarly small custom configs. `MythosConfig()` defaults to a much larger research model.
+- Preserve decode-position semantics when touching inference code: `start_pos` chooses the RoPE slice for incremental decoding, and each cache entry uses a deterministic key (`prelude_{i}`, `recurrent_loop_{t}`, `coda_{i}`).
+- Do not short-circuit recurrent execution when a KV cache is active. `RecurrentBlock` only exits early on ACT convergence when `kv_cache is None`; cached decoding relies on every loop depth writing its cache entry on every step.
+- Keep the embedding/LM head weight tying intact (`self.head.weight = self.embed.weight`), and keep `router_bias` as a registered buffer rather than a trainable parameter.
+- Prelude and Coda are always dense FFN blocks (`use_moe=False`); the recurrent block is the only place that uses the MoE FFN (`use_moe=True`).
+- Named variants and default configs are MLA-first. If you switch code or tests to GQA, make sure the config still provides valid MLA fields because shared config helpers are reused across both modes.
+- Use the variant helpers that actually exist in `open_mythos/variants.py`: `mythos_1b`, `mythos_3b`, `mythos_10b`, `mythos_50b`, `mythos_100b`, `mythos_500b`, and `mythos_1t`. The README currently shows a stale `mythos_7b()` example.
+- The benchmark scripts live under `tests/` and are intended to be run directly as scripts. Their docstrings still reference a `benchmarks/` path that does not exist in this repository.
diff --git a/open_mythos/__init__.py b/open_mythos/__init__.py
index 73c2c04..28f87ed 100644
--- a/open_mythos/__init__.py
+++ b/open_mythos/__init__.py
@@ -1,21 +1,5 @@
-from open_mythos.main import (
-    ACTHalting,
-    Expert,
-    GQAttention,
-    LoRAAdapter,
-    LTIInjection,
-    MLAttention,
-    MoEFFN,
-    MythosConfig,
-    OpenMythos,
-    RecurrentBlock,
-    RMSNorm,
-    TransformerBlock,
-    apply_rope,
-    loop_index_embedding,
-    precompute_rope_freqs,
-)
-from open_mythos.tokenizer import MythosTokenizer
+from open_mythos.config import MythosConfig
+from open_mythos.tokenizer import MythosTokenizer, get_vocab_size, load_tokenizer
 from open_mythos.variants import (
     mythos_1b,
     mythos_1t,
@@ -26,6 +10,23 @@
     mythos_500b,
 )
 
+_MAIN_EXPORTS = {
+    "ACTHalting",
+    "Expert",
+    "GQAttention",
+    "LoRAAdapter",
+    "LTIInjection",
+    "MLAttention",
+    "MoEFFN",
+    "OpenMythos",
+    "RecurrentBlock",
+    "RMSNorm",
+    "TransformerBlock",
+    "apply_rope",
+    "loop_index_embedding",
+    "precompute_rope_freqs",
+}
+
 __all__ = [
     "MythosConfig",
     "RMSNorm",
@@ -53,3 +54,41 @@
     "get_vocab_size",
     "MythosTokenizer",
 ]
+
+
+def __getattr__(name: str):
+    if name in _MAIN_EXPORTS:
+        from open_mythos.main import (
+            ACTHalting,
+            Expert,
+            GQAttention,
+            LoRAAdapter,
+            LTIInjection,
+            MLAttention,
+            MoEFFN,
+            OpenMythos,
+            RecurrentBlock,
+            RMSNorm,
+            TransformerBlock,
+            apply_rope,
+            loop_index_embedding,
+            precompute_rope_freqs,
+        )
+
+        return {
+            "ACTHalting": ACTHalting,
+            "Expert": Expert,
+            "GQAttention": GQAttention,
+            "LoRAAdapter": LoRAAdapter,
+            "LTIInjection": LTIInjection,
+            "MLAttention": MLAttention,
+            "MoEFFN": MoEFFN,
+            "OpenMythos": OpenMythos,
+            "RecurrentBlock": RecurrentBlock,
+            "RMSNorm": RMSNorm,
+            "TransformerBlock": TransformerBlock,
+            "apply_rope": apply_rope,
+            "loop_index_embedding": loop_index_embedding,
+            "precompute_rope_freqs": precompute_rope_freqs,
+        }[name]
+    raise AttributeError(f"module 'open_mythos' has no attribute {name!r}")
diff --git a/open_mythos/config.py b/open_mythos/config.py
new file mode 100644
index 0000000..54ae93a
--- /dev/null
+++ b/open_mythos/config.py
@@ -0,0 +1,101 @@
+from dataclasses import asdict, dataclass
+
+
+@dataclass
+class MythosConfig:
+    """
+    Hyperparameter configuration for OpenMythos.
+
+    Core:
+        vocab_size      -- token vocabulary size
+        dim             -- model hidden dimension
+        n_heads         -- number of query attention heads
+        n_kv_heads      -- number of key/value heads (GQA; ignored by MLA)
+        max_seq_len     -- maximum sequence length for RoPE precomputation
+        max_loop_iters  -- default recurrent loop depth T at inference
+        prelude_layers  -- number of standard transformer layers before the loop
+        coda_layers     -- number of standard transformer layers after the loop
+
+    Attention (attn_type selects between the two):
+        attn_type       -- "gqa" for Grouped Query Attention, "mla" for Multi-Latent Attention
+        kv_lora_rank    -- [MLA] compressed KV latent dimension stored in the cache
+        q_lora_rank     -- [MLA] compressed Q latent dimension
+        qk_rope_head_dim-- [MLA] per-head dims that receive RoPE
+        qk_nope_head_dim-- [MLA] per-head dims without positional encoding
+        v_head_dim      -- [MLA] per-head value dimension
+
+    MoE FFN (used inside the recurrent block):
+        n_experts       -- total number of routed expert FFNs
+        n_shared_experts-- number of always-active shared experts
+        n_experts_per_tok-- top-K experts selected per token by the router
+        expert_dim      -- hidden dimension inside each fine-grained expert
+
+    Other:
+        act_threshold   -- ACT halting threshold (cumulative probability to stop looping)
+        rope_theta      -- RoPE base frequency
+        lora_rank       -- rank of the per-loop depth-wise LoRA adapter
+    """
+
+    vocab_size: int = 32000
+    dim: int = 2048
+    n_heads: int = 16
+    n_kv_heads: int = 4  # GQA: fewer KV heads than Q heads
+    max_seq_len: int = 4096
+    max_loop_iters: int = 16  # T — recurrent depth at inference
+    prelude_layers: int = 2
+    coda_layers: int = 2
+    # Attention type: "gqa" | "mla"
+    attn_type: str = "mla"
+    # MLA params (only used when attn_type="mla")
+    kv_lora_rank: int = 512  # compressed KV latent cached instead of full K/V
+    q_lora_rank: int = 1536  # compressed Q latent dim
+    qk_rope_head_dim: int = 64  # per-head dims that receive RoPE
+    qk_nope_head_dim: int = 128  # per-head dims without RoPE
+    v_head_dim: int = 128  # per-head value dim
+    # MoE
+    n_experts: int = 64
+    n_shared_experts: int = 2
+    n_experts_per_tok: int = 4  # top-K routed
+    expert_dim: int = 512  # fine-grained: dim // (n_experts // n_experts_per_tok)
+    # ACT halting
+    act_threshold: float = 0.99
+    # RoPE
+    rope_theta: float = 500000.0
+    # LoRA depth adaptation
+    lora_rank: int = 16
+    # Maximum tokens to generate per forward pass
+    max_output_tokens: int = 4096
+    # Dropout (set 0.0 to disable; 0.1 is standard for pretraining)
+    dropout: float = 0.0
+
+    def to_dict(self) -> dict[str, object]:
+        """Return a plain-Python config dictionary for serialization."""
+        return asdict(self)
+
+    def runtime_profile(self) -> dict[str, object]:
+        """
+        Describe the stable runtime-facing capabilities of this config.
+
+        GatesOfMythos can use this to validate routing decisions without
+        inspecting model internals or loading a heavyweight instance first.
+        """
+        return {
+            "model_name": "OpenMythos",
+            "attn_type": self.attn_type,
+            "supports_kv_cache": True,
+            "supports_incremental_decode": True,
+            "uses_moe": True,
+            "uses_act_halting": True,
+            "uses_lti_injection": True,
+            "max_context_tokens": self.max_seq_len,
+            "max_loop_iters": self.max_loop_iters,
+            "max_output_tokens": self.max_output_tokens,
+            "cache_layout": {
+                "prelude": [f"prelude_{i}" for i in range(self.prelude_layers)],
+                "recurrent": "recurrent_loop_{t}",
+                "coda": [f"coda_{i}" for i in range(self.coda_layers)],
+            },
+            "attention_backend": (
+                "multi_latent" if self.attn_type == "mla" else "grouped_query"
+            ),
+        }
diff --git a/open_mythos/main.py b/open_mythos/main.py
index 65b0fa8..84ab50b 100644
--- a/open_mythos/main.py
+++ b/open_mythos/main.py
@@ -1,10 +1,11 @@
-from dataclasses import dataclass
 from typing import Optional
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
+from open_mythos.config import MythosConfig
+
 try:
     from flash_attn import flash_attn_func
 
@@ -13,74 +14,6 @@
     _HAS_FLASH_ATTN = False
 
 
-@dataclass
-class MythosConfig:
-    """
-    Hyperparameter configuration for OpenMythos.
-
-    Core:
-        vocab_size      -- token vocabulary size
-        dim             -- model hidden dimension
-        n_heads         -- number of query attention heads
-        n_kv_heads      -- number of key/value heads (GQA; ignored by MLA)
-        max_seq_len     -- maximum sequence length for RoPE precomputation
-        max_loop_iters  -- default recurrent loop depth T at inference
-        prelude_layers  -- number of standard transformer layers before the loop
-        coda_layers     -- number of standard transformer layers after the loop
-
-    Attention (attn_type selects between the two):
-        attn_type       -- "gqa" for Grouped Query Attention, "mla" for Multi-Latent Attention
-        kv_lora_rank    -- [MLA] compressed KV latent dimension stored in the cache
-        q_lora_rank     -- [MLA] compressed Q latent dimension
-        qk_rope_head_dim-- [MLA] per-head dims that receive RoPE
-        qk_nope_head_dim-- [MLA] per-head dims without positional encoding
-        v_head_dim      -- [MLA] per-head value dimension
-
-    MoE FFN (used inside the recurrent block):
-        n_experts       -- total number of routed expert FFNs
-        n_shared_experts-- number of always-active shared experts
-        n_experts_per_tok-- top-K experts selected per token by the router
-        expert_dim      -- hidden dimension inside each fine-grained expert
-
-    Other:
-        act_threshold   -- ACT halting threshold (cumulative probability to stop looping)
-        rope_theta      -- RoPE base frequency
-        lora_rank       -- rank of the per-loop depth-wise LoRA adapter
-    """
-
-    vocab_size: int = 32000
-    dim: int = 2048
-    n_heads: int = 16
-    n_kv_heads: int = 4  # GQA: fewer KV heads than Q heads
-    max_seq_len: int = 4096
-    max_loop_iters: int = 16  # T — recurrent depth at inference
-    prelude_layers: int = 2
-    coda_layers: int = 2
-    # Attention type: "gqa" | "mla"
-    attn_type: str = "mla"
-    # MLA params (only used when attn_type="mla")
-    kv_lora_rank: int = 512  # compressed KV latent cached instead of full K/V
-    q_lora_rank: int = 1536  # compressed Q latent dim
-    qk_rope_head_dim: int = 64  # per-head dims that receive RoPE
-    qk_nope_head_dim: int = 128  # per-head dims without RoPE
-    v_head_dim: int = 128  # per-head value dim
-    # MoE
-    n_experts: int = 64
-    n_shared_experts: int = 2
-    n_experts_per_tok: int = 4  # top-K routed
-    expert_dim: int = 512  # fine-grained: dim // (n_experts // n_experts_per_tok)
-    # ACT halting
-    act_threshold: float = 0.99
-    # RoPE
-    rope_theta: float = 500000.0
-    # LoRA depth adaptation
-    lora_rank: int = 16
-    # Maximum tokens to generate per forward pass
-    max_output_tokens: int = 4096
-    # Dropout (set 0.0 to disable; 0.1 is standard for pretraining)
-    dropout: float = 0.0
-
-
 # ---------------------------------------------------------------------------
 # RMSNorm
 # ---------------------------------------------------------------------------
@@ -965,6 +898,10 @@ def _init_weights(self) -> None:
             elif isinstance(m, nn.Embedding):
                 nn.init.normal_(m.weight, std=0.02)
 
+    def describe(self) -> dict[str, object]:
+        """Return the stable runtime profile for orchestration and routing."""
+        return self.cfg.runtime_profile()
+
     @staticmethod
     def _causal_mask(
         seq_len: int, device: torch.device, dtype: torch.dtype
diff --git a/open_mythos/tokenizer.py b/open_mythos/tokenizer.py
index fadb3a5..d9ecd12 100644
--- a/open_mythos/tokenizer.py
+++ b/open_mythos/tokenizer.py
@@ -1,4 +1,7 @@
-from transformers import AutoTokenizer
+try:
+    from transformers import AutoTokenizer
+except ImportError:  # pragma: no cover - dependency may be absent in lightweight envs
+    AutoTokenizer = None
 
 DEFAULT_MODEL_ID = "openai/gpt-oss-20b"
 
@@ -27,6 +30,10 @@ def __init__(self, model_id: str = DEFAULT_MODEL_ID):
         Args:
             model_id (str): HuggingFace model identifier or path to tokenizer files.
         """
+        if AutoTokenizer is None:
+            raise ModuleNotFoundError(
+                "transformers is required to construct MythosTokenizer"
+            )
         self.tokenizer = AutoTokenizer.from_pretrained(model_id)
 
     @property
@@ -62,3 +69,13 @@ def decode(self, token_ids: list[int]) -> str:
             str: Decoded string representation of the token IDs.
         """
         return self.tokenizer.decode(token_ids, skip_special_tokens=True)
+
+
+def load_tokenizer(model_id: str = DEFAULT_MODEL_ID) -> MythosTokenizer:
+    """Construct a tokenizer wrapper using the default or requested model id."""
+    return MythosTokenizer(model_id=model_id)
+
+
+def get_vocab_size(model_id: str = DEFAULT_MODEL_ID) -> int:
+    """Return the tokenizer vocabulary size for a given model id."""
+    return load_tokenizer(model_id=model_id).vocab_size
diff --git a/open_mythos/variants.py b/open_mythos/variants.py
index 83f7dd4..0133714 100644
--- a/open_mythos/variants.py
+++ b/open_mythos/variants.py
@@ -1,4 +1,4 @@
-from open_mythos.main import MythosConfig
+from open_mythos.config import MythosConfig
 
 # Parameter budget breakdown per variant:
 #   total ≈ embed + prelude/coda dense blocks + recurrent MLA + MoE
diff --git a/tests/test_main.py b/tests/test_main.py
index c54c462..1245d4e 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -543,6 +543,40 @@ def test_single_loop_runs(self):
         assert out.shape == (B, T, self.cfg.dim)
 
 
+# ---------------------------------------------------------------------------
+# MythosConfig / OpenMythos introspection
+# ---------------------------------------------------------------------------
+
+
+class TestMythosConfigIntrospection:
+    def test_runtime_profile_includes_cache_layout(self):
+        cfg = mla_cfg(prelude_layers=2, coda_layers=1, max_loop_iters=4)
+        profile = cfg.runtime_profile()
+
+        assert profile["model_name"] == "OpenMythos"
+        assert profile["attn_type"] == "mla"
+        assert profile["supports_kv_cache"] is True
+        assert profile["supports_incremental_decode"] is True
+        assert profile["cache_layout"]["prelude"] == ["prelude_0", "prelude_1"]
+        assert profile["cache_layout"]["recurrent"] == "recurrent_loop_{t}"
+        assert profile["cache_layout"]["coda"] == ["coda_0"]
+
+    def test_to_dict_round_trips_dataclass_fields(self):
+        cfg = gqa_cfg(dim=128, max_output_tokens=256)
+        data = cfg.to_dict()
+
+        assert data["dim"] == 128
+        assert data["max_output_tokens"] == 256
+        assert data["attn_type"] == "gqa"
+
+
+class TestOpenMythosIntrospection:
+    def test_describe_matches_config_profile(self):
+        cfg = gqa_cfg()
+        model = OpenMythos(cfg)
+
+        assert model.describe() == cfg.runtime_profile()
+
 # ---------------------------------------------------------------------------
 # OpenMythos — GQA mode
 # ---------------------------------------------------------------------------
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index fab7533..9c17b98 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -1,5 +1,9 @@
 import pytest
-from open_mythos.tokenizer import MythosTokenizer
+from open_mythos.tokenizer import (
+    MythosTokenizer,
+    get_vocab_size,
+    load_tokenizer,
+)
 
 
 @pytest.fixture(scope="module")
@@ -64,6 +68,15 @@ def test_custom_model_id():
     assert tok.vocab_size > 0
 
 
+def test_load_tokenizer_helper():
+    tok = load_tokenizer()
+    assert isinstance(tok, MythosTokenizer)
+
+
+def test_get_vocab_size_helper(tokenizer):
+    assert get_vocab_size() == tokenizer.vocab_size
+
+
 def test_vocab_size_consistent(tokenizer):
     outer = tokenizer.vocab_size
     inner = tokenizer.tokenizer.vocab_size

From 95fcf0a74a60f39927858000d440d620c00ee245 Mon Sep 17 00:00:00 2001
From: anphonic <abilly@gmail.com>
Date: Thu, 23 Apr 2026 23:23:51 -0500
Subject: [PATCH 2/7] Make tokenizer imports lazy

Delay importing transformers until a tokenizer is actually constructed so package imports stay lightweight.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 open_mythos/tokenizer.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/open_mythos/tokenizer.py b/open_mythos/tokenizer.py
index d9ecd12..e7b1a93 100644
--- a/open_mythos/tokenizer.py
+++ b/open_mythos/tokenizer.py
@@ -1,11 +1,16 @@
-try:
-    from transformers import AutoTokenizer
-except ImportError:  # pragma: no cover - dependency may be absent in lightweight envs
-    AutoTokenizer = None
-
 DEFAULT_MODEL_ID = "openai/gpt-oss-20b"
 
 
+def _load_auto_tokenizer():
+    try:
+        from transformers import AutoTokenizer
+    except ImportError as exc:  # pragma: no cover - dependency may be absent
+        raise ModuleNotFoundError(
+            "transformers is required to construct MythosTokenizer"
+        ) from exc
+    return AutoTokenizer
+
+
 class MythosTokenizer:
     """
     HuggingFace tokenizer wrapper for OpenMythos.
@@ -30,11 +35,7 @@ def __init__(self, model_id: str = DEFAULT_MODEL_ID):
         Args:
             model_id (str): HuggingFace model identifier or path to tokenizer files.
         """
-        if AutoTokenizer is None:
-            raise ModuleNotFoundError(
-                "transformers is required to construct MythosTokenizer"
-            )
-        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
+        self.tokenizer = _load_auto_tokenizer().from_pretrained(model_id)
 
     @property
     def vocab_size(self) -> int:

From bda5ca98e33e5792a0deeac56e2cff2777c78c92 Mon Sep 17 00:00:00 2001
From: anphonic <abilly@gmail.com>
Date: Thu, 23 Apr 2026 23:26:30 -0500
Subject: [PATCH 3/7] Validate attention type and update docs

Reject unsupported attn_type values and refresh the architecture note to match the new module layout.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .github/copilot-instructions.md | 2 +-
 open_mythos/config.py           | 6 ++++++
 tests/test_main.py              | 4 ++++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index e3ec543..e466574 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -15,7 +15,7 @@
 
 ## High-level architecture
 
-- `open_mythos/main.py` is the primary implementation. It defines `MythosConfig`, both attention backends, the MoE FFN, the recurrent block, and the top-level `OpenMythos` model.
+- `open_mythos/config.py` defines `MythosConfig`. `open_mythos/main.py` is the primary implementation for both attention backends, the MoE FFN, the recurrent block, and the top-level `OpenMythos` model.
 - The model pipeline is fixed: token embedding -> Prelude (`cfg.prelude_layers` dense `TransformerBlock`s run once) -> Recurrent Block (one shared `TransformerBlock` looped `n_loops` times with LoRA, ACT halting, and LTI-stable input injection) -> Coda (`cfg.coda_layers` dense `TransformerBlock`s run once) -> `RMSNorm` -> tied LM head.
 - The key architectural invariant is that the encoded input `e` is frozen after the Prelude and injected on every recurrent iteration. The recurrent update path is `loop_index_embedding -> TransformerBlock(use_moe=True) -> LoRAAdapter -> LTIInjection -> ACTHalting`.
 - `cfg.attn_type` switches the whole attention/cache path. `GQAttention` stores full K/V by layer, while `MLAttention` stores compressed `c_kv` plus `k_rope` and reconstructs K/V on demand. `OpenMythos.forward()` also switches RoPE buffers based on `attn_type`.
diff --git a/open_mythos/config.py b/open_mythos/config.py
index 54ae93a..ee12f49 100644
--- a/open_mythos/config.py
+++ b/open_mythos/config.py
@@ -68,6 +68,12 @@ class MythosConfig:
     # Dropout (set 0.0 to disable; 0.1 is standard for pretraining)
     dropout: float = 0.0
 
+    def __post_init__(self) -> None:
+        if self.attn_type not in {"gqa", "mla"}:
+            raise ValueError(
+                f"Unsupported attn_type {self.attn_type!r}; expected 'gqa' or 'mla'"
+            )
+
     def to_dict(self) -> dict[str, object]:
         """Return a plain-Python config dictionary for serialization."""
         return asdict(self)
diff --git a/tests/test_main.py b/tests/test_main.py
index 1245d4e..54063fe 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -569,6 +569,10 @@ def test_to_dict_round_trips_dataclass_fields(self):
         assert data["max_output_tokens"] == 256
         assert data["attn_type"] == "gqa"
 
+    def test_invalid_attn_type_raises(self):
+        with pytest.raises(ValueError, match="Unsupported attn_type"):
+            gqa_cfg(attn_type="mlaa")
+
 
 class TestOpenMythosIntrospection:
     def test_describe_matches_config_profile(self):

From b5388b9804a376499be42275a5110aadf5edffbb Mon Sep 17 00:00:00 2001
From: anphonic <abilly@gmail.com>
Date: Thu, 23 Apr 2026 23:31:36 -0500
Subject: [PATCH 4/7] Add MythosConfig sanity checks

Validate core config fields up front so invalid shapes and thresholds fail fast.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 open_mythos/config.py | 44 +++++++++++++++++++++++++++++++++++++++++++
 tests/test_main.py    |  6 ++++++
 2 files changed, 50 insertions(+)

diff --git a/open_mythos/config.py b/open_mythos/config.py
index ee12f49..5688b33 100644
--- a/open_mythos/config.py
+++ b/open_mythos/config.py
@@ -73,6 +73,50 @@ def __post_init__(self) -> None:
             raise ValueError(
                 f"Unsupported attn_type {self.attn_type!r}; expected 'gqa' or 'mla'"
             )
+        if self.vocab_size <= 0:
+            raise ValueError("vocab_size must be positive")
+        if self.dim <= 0:
+            raise ValueError("dim must be positive")
+        if self.n_heads <= 0:
+            raise ValueError("n_heads must be positive")
+        if self.n_kv_heads <= 0:
+            raise ValueError("n_kv_heads must be positive")
+        if self.max_seq_len <= 0:
+            raise ValueError("max_seq_len must be positive")
+        if self.max_loop_iters <= 0:
+            raise ValueError("max_loop_iters must be positive")
+        if self.prelude_layers < 0:
+            raise ValueError("prelude_layers must be non-negative")
+        if self.coda_layers < 0:
+            raise ValueError("coda_layers must be non-negative")
+        if self.kv_lora_rank <= 0:
+            raise ValueError("kv_lora_rank must be positive")
+        if self.q_lora_rank <= 0:
+            raise ValueError("q_lora_rank must be positive")
+        if self.qk_rope_head_dim <= 0:
+            raise ValueError("qk_rope_head_dim must be positive")
+        if self.qk_nope_head_dim <= 0:
+            raise ValueError("qk_nope_head_dim must be positive")
+        if self.v_head_dim <= 0:
+            raise ValueError("v_head_dim must be positive")
+        if self.n_experts <= 0:
+            raise ValueError("n_experts must be positive")
+        if self.n_shared_experts < 0:
+            raise ValueError("n_shared_experts must be non-negative")
+        if self.n_experts_per_tok <= 0:
+            raise ValueError("n_experts_per_tok must be positive")
+        if self.expert_dim <= 0:
+            raise ValueError("expert_dim must be positive")
+        if not 0.0 < self.act_threshold <= 1.0:
+            raise ValueError("act_threshold must be in the interval (0, 1]")
+        if self.rope_theta <= 0:
+            raise ValueError("rope_theta must be positive")
+        if self.lora_rank <= 0:
+            raise ValueError("lora_rank must be positive")
+        if self.max_output_tokens <= 0:
+            raise ValueError("max_output_tokens must be positive")
+        if self.dropout < 0:
+            raise ValueError("dropout must be non-negative")
 
     def to_dict(self) -> dict[str, object]:
         """Return a plain-Python config dictionary for serialization."""
diff --git a/tests/test_main.py b/tests/test_main.py
index 54063fe..bc8cdb3 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -573,6 +573,12 @@ def test_invalid_attn_type_raises(self):
         with pytest.raises(ValueError, match="Unsupported attn_type"):
             gqa_cfg(attn_type="mlaa")
 
+    def test_invalid_numeric_fields_raise(self):
+        with pytest.raises(ValueError, match="dim must be positive"):
+            gqa_cfg(dim=0)
+        with pytest.raises(ValueError, match="act_threshold must be in the interval"):
+            gqa_cfg(act_threshold=1.1)
+
 
 class TestOpenMythosIntrospection:
     def test_describe_matches_config_profile(self):

From b20e8ff48e7665e691be4af4acb5d17f0fe23844 Mon Sep 17 00:00:00 2001
From: anphonic <abilly@gmail.com>
Date: Thu, 23 Apr 2026 23:44:07 -0500
Subject: [PATCH 5/7] Tighten MythosConfig validation

Validate head-size and attention-grouping invariants up front so invalid configs fail before runtime.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 open_mythos/config.py | 11 +++++++++++
 tests/test_main.py    |  6 ++++++
 2 files changed, 17 insertions(+)

diff --git a/open_mythos/config.py b/open_mythos/config.py
index 5688b33..83862ed 100644
--- a/open_mythos/config.py
+++ b/open_mythos/config.py
@@ -117,6 +117,17 @@ def __post_init__(self) -> None:
             raise ValueError("max_output_tokens must be positive")
         if self.dropout < 0:
             raise ValueError("dropout must be non-negative")
+        if self.dim % self.n_heads != 0:
+            raise ValueError("dim must be divisible by n_heads")
+        if (self.dim // self.n_heads) % 2 != 0:
+            raise ValueError("dim // n_heads must be even for RoPE")
+        if self.attn_type == "gqa":
+            if self.n_kv_heads > self.n_heads:
+                raise ValueError("n_kv_heads must be less than or equal to n_heads")
+            if self.n_heads % self.n_kv_heads != 0:
+                raise ValueError("n_heads must be divisible by n_kv_heads")
+        if self.attn_type == "mla" and self.qk_rope_head_dim % 2 != 0:
+            raise ValueError("qk_rope_head_dim must be even for MLA RoPE")
 
     def to_dict(self) -> dict[str, object]:
         """Return a plain-Python config dictionary for serialization."""
diff --git a/tests/test_main.py b/tests/test_main.py
index bc8cdb3..d1aa7ec 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -578,6 +578,12 @@ def test_invalid_numeric_fields_raise(self):
             gqa_cfg(dim=0)
         with pytest.raises(ValueError, match="act_threshold must be in the interval"):
             gqa_cfg(act_threshold=1.1)
+        with pytest.raises(ValueError, match="dim must be divisible by n_heads"):
+            gqa_cfg(dim=66, n_heads=8)
+        with pytest.raises(ValueError, match="n_heads must be divisible by n_kv_heads"):
+            gqa_cfg(n_kv_heads=3)
+        with pytest.raises(ValueError, match="qk_rope_head_dim must be even"):
+            mla_cfg(qk_rope_head_dim=7)
 
 
 class TestOpenMythosIntrospection:

From 936d24d34fc5dcc58bd4c58bb4a12b5e9380c436 Mon Sep 17 00:00:00 2001
From: anphonic <abilly@gmail.com>
Date: Thu, 23 Apr 2026 23:47:53 -0500
Subject: [PATCH 6/7] Tighten config bounds

Validate expert top-k and dropout probability bounds alongside the existing shape checks.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 open_mythos/config.py | 8 ++++++--
 tests/test_main.py    | 4 ++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/open_mythos/config.py b/open_mythos/config.py
index 83862ed..357b58b 100644
--- a/open_mythos/config.py
+++ b/open_mythos/config.py
@@ -105,6 +105,10 @@ def __post_init__(self) -> None:
             raise ValueError("n_shared_experts must be non-negative")
         if self.n_experts_per_tok <= 0:
             raise ValueError("n_experts_per_tok must be positive")
+        if self.n_experts_per_tok > self.n_experts:
+            raise ValueError(
+                "n_experts_per_tok must be less than or equal to n_experts"
+            )
         if self.expert_dim <= 0:
             raise ValueError("expert_dim must be positive")
         if not 0.0 < self.act_threshold <= 1.0:
@@ -115,8 +119,8 @@ def __post_init__(self) -> None:
             raise ValueError("lora_rank must be positive")
         if self.max_output_tokens <= 0:
             raise ValueError("max_output_tokens must be positive")
-        if self.dropout < 0:
-            raise ValueError("dropout must be non-negative")
+        if not 0.0 <= self.dropout <= 1.0:
+            raise ValueError("dropout must be in the interval [0, 1]")
         if self.dim % self.n_heads != 0:
             raise ValueError("dim must be divisible by n_heads")
         if (self.dim // self.n_heads) % 2 != 0:
diff --git a/tests/test_main.py b/tests/test_main.py
index d1aa7ec..81703ec 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -584,6 +584,10 @@ def test_invalid_numeric_fields_raise(self):
             gqa_cfg(n_kv_heads=3)
         with pytest.raises(ValueError, match="qk_rope_head_dim must be even"):
             mla_cfg(qk_rope_head_dim=7)
+        with pytest.raises(ValueError, match="n_experts_per_tok must be less than or equal"):
+            gqa_cfg(n_experts=2, n_experts_per_tok=3)
+        with pytest.raises(ValueError, match="dropout must be in the interval"):
+            gqa_cfg(dropout=1.1)
 
 
 class TestOpenMythosIntrospection:

From 2bd4d05b8ae8294a43a1d1bc41816c264d3513c5 Mon Sep 17 00:00:00 2001
From: anphonic <abilly@gmail.com>
Date: Fri, 24 Apr 2026 18:08:04 -0500
Subject: [PATCH 7/7] Cache lazy OpenMythos exports

Store each resolved lazy export in the package module's globals so repeated attribute access no longer goes through `__getattr__` or rebuilds the export map.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 open_mythos/__init__.py | 42 +++++++++--------------------------------
 tests/test_main.py      | 10 ++++++++++
 2 files changed, 19 insertions(+), 33 deletions(-)

diff --git a/open_mythos/__init__.py b/open_mythos/__init__.py
index 28f87ed..f7dcefc 100644
--- a/open_mythos/__init__.py
+++ b/open_mythos/__init__.py
@@ -1,3 +1,5 @@
+from importlib import import_module
+
 from open_mythos.config import MythosConfig
 from open_mythos.tokenizer import MythosTokenizer, get_vocab_size, load_tokenizer
 from open_mythos.variants import (
@@ -26,6 +28,7 @@
     "loop_index_embedding",
     "precompute_rope_freqs",
 }
+_MAIN_MODULE = None
 
 __all__ = [
     "MythosConfig",
@@ -58,37 +61,10 @@
 
 def __getattr__(name: str):
     if name in _MAIN_EXPORTS:
-        from open_mythos.main import (
-            ACTHalting,
-            Expert,
-            GQAttention,
-            LoRAAdapter,
-            LTIInjection,
-            MLAttention,
-            MoEFFN,
-            OpenMythos,
-            RecurrentBlock,
-            RMSNorm,
-            TransformerBlock,
-            apply_rope,
-            loop_index_embedding,
-            precompute_rope_freqs,
-        )
-
-        return {
-            "ACTHalting": ACTHalting,
-            "Expert": Expert,
-            "GQAttention": GQAttention,
-            "LoRAAdapter": LoRAAdapter,
-            "LTIInjection": LTIInjection,
-            "MLAttention": MLAttention,
-            "MoEFFN": MoEFFN,
-            "OpenMythos": OpenMythos,
-            "RecurrentBlock": RecurrentBlock,
-            "RMSNorm": RMSNorm,
-            "TransformerBlock": TransformerBlock,
-            "apply_rope": apply_rope,
-            "loop_index_embedding": loop_index_embedding,
-            "precompute_rope_freqs": precompute_rope_freqs,
-        }[name]
+        global _MAIN_MODULE
+        if _MAIN_MODULE is None:
+            _MAIN_MODULE = import_module("open_mythos.main")
+        value = getattr(_MAIN_MODULE, name)
+        globals()[name] = value
+        return value
     raise AttributeError(f"module 'open_mythos' has no attribute {name!r}")
diff --git a/tests/test_main.py b/tests/test_main.py
index 81703ec..3786be8 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -1,5 +1,6 @@
 import torch
 import pytest
+import open_mythos
 from open_mythos.main import (
     ACTHalting,
     Expert,
@@ -597,6 +598,15 @@ def test_describe_matches_config_profile(self):
 
         assert model.describe() == cfg.runtime_profile()
 
+
+class TestPackageLazyExports:
+    def test_lazy_export_is_cached_on_module(self):
+        first = open_mythos.OpenMythos
+        second = open_mythos.OpenMythos
+
+        assert first is second
+        assert open_mythos.__dict__["OpenMythos"] is first
+
 # ---------------------------------------------------------------------------
 # OpenMythos — GQA mode
 # ---------------------------------------------------------------------------