From 463856cb3fee8c6bba906fa425ba4339d1d811d7 Mon Sep 17 00:00:00 2001 From: anphonic Date: Thu, 23 Apr 2026 22:18:59 -0500 Subject: [PATCH 1/7] Add lightweight OpenMythos runtime metadata Expose a lightweight config/profile surface for orchestration, keep package imports lazy, and add tokenizer helpers plus tests. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/copilot-instructions.md | 36 ++++++++++++ open_mythos/__init__.py | 75 ++++++++++++++++++------ open_mythos/config.py | 101 ++++++++++++++++++++++++++++++++ open_mythos/main.py | 75 ++---------------------- open_mythos/tokenizer.py | 19 +++++- open_mythos/variants.py | 2 +- tests/test_main.py | 34 +++++++++++ tests/test_tokenizer.py | 15 ++++- 8 files changed, 267 insertions(+), 90 deletions(-) create mode 100644 .github/copilot-instructions.md create mode 100644 open_mythos/config.py diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000..e3ec543 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,36 @@ +# Copilot Instructions + +## Build, test, and lint commands + +- Install dependencies for local work with `poetry install --with dev`. The repo also keeps a minimal `requirements.txt` for `pip install -r requirements.txt`. +- Build the package with `poetry build`. +- Run the main unit suite with `poetry run pytest tests/test_main.py`. +- Run a single test with `poetry run pytest tests/test_main.py::TestOpenMythosGQA::test_generate_shape`. +- Run tokenizer tests with `poetry run pytest tests/test_tokenizer.py`. These load the default Hugging Face tokenizer (`openai/gpt-oss-20b`), so they are heavier than `tests/test_main.py`. +- Run the RoPE debug script with `poetry run python tests/test_rope_debug.py`. +- Lint with `poetry run ruff check .`. +- Check formatting with `poetry run black --check .`. +- The documented training entrypoints are `poetry run python training/3b_fine_web_edu.py` and `poetry run torchrun --nproc_per_node=$(python -c "import torch; print(torch.cuda.device_count())") training/3b_fine_web_edu.py`. +- Benchmark utilities are executable scripts under `tests/`, not pytest files: `poetry run python tests/small_benchmark.py` and `poetry run python tests/bench_vs_transformer.py`. + +## High-level architecture + +- `open_mythos/main.py` is the primary implementation. It defines `MythosConfig`, both attention backends, the MoE FFN, the recurrent block, and the top-level `OpenMythos` model. +- The model pipeline is fixed: token embedding -> Prelude (`cfg.prelude_layers` dense `TransformerBlock`s run once) -> Recurrent Block (one shared `TransformerBlock` looped `n_loops` times with LoRA, ACT halting, and LTI-stable input injection) -> Coda (`cfg.coda_layers` dense `TransformerBlock`s run once) -> `RMSNorm` -> tied LM head. +- The key architectural invariant is that the encoded input `e` is frozen after the Prelude and injected on every recurrent iteration. The recurrent update path is `loop_index_embedding -> TransformerBlock(use_moe=True) -> LoRAAdapter -> LTIInjection -> ACTHalting`. +- `cfg.attn_type` switches the whole attention/cache path. `GQAttention` stores full K/V by layer, while `MLAttention` stores compressed `c_kv` plus `k_rope` and reconstructs K/V on demand. `OpenMythos.forward()` also switches RoPE buffers based on `attn_type`. +- `open_mythos/variants.py` contains the named model scales (`mythos_1b` through `mythos_1t`) as `MythosConfig` factories. `open_mythos/tokenizer.py` is a small wrapper around `transformers.AutoTokenizer` with `openai/gpt-oss-20b` as the default tokenizer. +- `training/3b_fine_web_edu.py` is a standalone FSDP training script for the 3B variant. It streams FineWeb-Edu shards, uses `MythosTokenizer`, and keeps checkpointing/training logic out of the library module. +- `docs/open_mythos.md` is the detailed API and architecture reference. `docs/datasets.md` holds the training dataset recommendations and token-budget guidance referenced by the training script/README. +- `open_mythos/moda.py` is a separate experimental MoDA + MoE implementation, not part of the main `OpenMythos` export surface. + +## Key conventions + +- For tests, smoke runs, and examples that should execute quickly, use the tiny helper configs from `tests/test_main.py` (`gqa_cfg()` / `mla_cfg()`) or similarly small custom configs. `MythosConfig()` defaults to a much larger research model. +- Preserve decode-position semantics when touching inference code: `start_pos` chooses the RoPE slice for incremental decoding, and each cache entry uses a deterministic key (`prelude_{i}`, `recurrent_loop_{t}`, `coda_{i}`). +- Do not short-circuit recurrent execution when a KV cache is active. `RecurrentBlock` only exits early on ACT convergence when `kv_cache is None`; cached decoding relies on every loop depth writing its cache entry on every step. +- Keep the embedding/LM head weight tying intact (`self.head.weight = self.embed.weight`), and keep `router_bias` as a registered buffer rather than a trainable parameter. +- Prelude and Coda are always dense FFN blocks (`use_moe=False`); the recurrent block is the only place that uses the MoE FFN (`use_moe=True`). +- Named variants and default configs are MLA-first. If you switch code or tests to GQA, make sure the config still provides valid MLA fields because shared config helpers are reused across both modes. +- Use the variant helpers that actually exist in `open_mythos/variants.py`: `mythos_1b`, `mythos_3b`, `mythos_10b`, `mythos_50b`, `mythos_100b`, `mythos_500b`, and `mythos_1t`. The README currently shows a stale `mythos_7b()` example. +- The benchmark scripts live under `tests/` and are intended to be run directly as scripts. Their docstrings still reference a `benchmarks/` path that does not exist in this repository. diff --git a/open_mythos/__init__.py b/open_mythos/__init__.py index 73c2c04..28f87ed 100644 --- a/open_mythos/__init__.py +++ b/open_mythos/__init__.py @@ -1,21 +1,5 @@ -from open_mythos.main import ( - ACTHalting, - Expert, - GQAttention, - LoRAAdapter, - LTIInjection, - MLAttention, - MoEFFN, - MythosConfig, - OpenMythos, - RecurrentBlock, - RMSNorm, - TransformerBlock, - apply_rope, - loop_index_embedding, - precompute_rope_freqs, -) -from open_mythos.tokenizer import MythosTokenizer +from open_mythos.config import MythosConfig +from open_mythos.tokenizer import MythosTokenizer, get_vocab_size, load_tokenizer from open_mythos.variants import ( mythos_1b, mythos_1t, @@ -26,6 +10,23 @@ mythos_500b, ) +_MAIN_EXPORTS = { + "ACTHalting", + "Expert", + "GQAttention", + "LoRAAdapter", + "LTIInjection", + "MLAttention", + "MoEFFN", + "OpenMythos", + "RecurrentBlock", + "RMSNorm", + "TransformerBlock", + "apply_rope", + "loop_index_embedding", + "precompute_rope_freqs", +} + __all__ = [ "MythosConfig", "RMSNorm", @@ -53,3 +54,41 @@ "get_vocab_size", "MythosTokenizer", ] + + +def __getattr__(name: str): + if name in _MAIN_EXPORTS: + from open_mythos.main import ( + ACTHalting, + Expert, + GQAttention, + LoRAAdapter, + LTIInjection, + MLAttention, + MoEFFN, + OpenMythos, + RecurrentBlock, + RMSNorm, + TransformerBlock, + apply_rope, + loop_index_embedding, + precompute_rope_freqs, + ) + + return { + "ACTHalting": ACTHalting, + "Expert": Expert, + "GQAttention": GQAttention, + "LoRAAdapter": LoRAAdapter, + "LTIInjection": LTIInjection, + "MLAttention": MLAttention, + "MoEFFN": MoEFFN, + "OpenMythos": OpenMythos, + "RecurrentBlock": RecurrentBlock, + "RMSNorm": RMSNorm, + "TransformerBlock": TransformerBlock, + "apply_rope": apply_rope, + "loop_index_embedding": loop_index_embedding, + "precompute_rope_freqs": precompute_rope_freqs, + }[name] + raise AttributeError(f"module 'open_mythos' has no attribute {name!r}") diff --git a/open_mythos/config.py b/open_mythos/config.py new file mode 100644 index 0000000..54ae93a --- /dev/null +++ b/open_mythos/config.py @@ -0,0 +1,101 @@ +from dataclasses import asdict, dataclass + + +@dataclass +class MythosConfig: + """ + Hyperparameter configuration for OpenMythos. + + Core: + vocab_size -- token vocabulary size + dim -- model hidden dimension + n_heads -- number of query attention heads + n_kv_heads -- number of key/value heads (GQA; ignored by MLA) + max_seq_len -- maximum sequence length for RoPE precomputation + max_loop_iters -- default recurrent loop depth T at inference + prelude_layers -- number of standard transformer layers before the loop + coda_layers -- number of standard transformer layers after the loop + + Attention (attn_type selects between the two): + attn_type -- "gqa" for Grouped Query Attention, "mla" for Multi-Latent Attention + kv_lora_rank -- [MLA] compressed KV latent dimension stored in the cache + q_lora_rank -- [MLA] compressed Q latent dimension + qk_rope_head_dim-- [MLA] per-head dims that receive RoPE + qk_nope_head_dim-- [MLA] per-head dims without positional encoding + v_head_dim -- [MLA] per-head value dimension + + MoE FFN (used inside the recurrent block): + n_experts -- total number of routed expert FFNs + n_shared_experts-- number of always-active shared experts + n_experts_per_tok-- top-K experts selected per token by the router + expert_dim -- hidden dimension inside each fine-grained expert + + Other: + act_threshold -- ACT halting threshold (cumulative probability to stop looping) + rope_theta -- RoPE base frequency + lora_rank -- rank of the per-loop depth-wise LoRA adapter + """ + + vocab_size: int = 32000 + dim: int = 2048 + n_heads: int = 16 + n_kv_heads: int = 4 # GQA: fewer KV heads than Q heads + max_seq_len: int = 4096 + max_loop_iters: int = 16 # T — recurrent depth at inference + prelude_layers: int = 2 + coda_layers: int = 2 + # Attention type: "gqa" | "mla" + attn_type: str = "mla" + # MLA params (only used when attn_type="mla") + kv_lora_rank: int = 512 # compressed KV latent cached instead of full K/V + q_lora_rank: int = 1536 # compressed Q latent dim + qk_rope_head_dim: int = 64 # per-head dims that receive RoPE + qk_nope_head_dim: int = 128 # per-head dims without RoPE + v_head_dim: int = 128 # per-head value dim + # MoE + n_experts: int = 64 + n_shared_experts: int = 2 + n_experts_per_tok: int = 4 # top-K routed + expert_dim: int = 512 # fine-grained: dim // (n_experts // n_experts_per_tok) + # ACT halting + act_threshold: float = 0.99 + # RoPE + rope_theta: float = 500000.0 + # LoRA depth adaptation + lora_rank: int = 16 + # Maximum tokens to generate per forward pass + max_output_tokens: int = 4096 + # Dropout (set 0.0 to disable; 0.1 is standard for pretraining) + dropout: float = 0.0 + + def to_dict(self) -> dict[str, object]: + """Return a plain-Python config dictionary for serialization.""" + return asdict(self) + + def runtime_profile(self) -> dict[str, object]: + """ + Describe the stable runtime-facing capabilities of this config. + + GatesOfMythos can use this to validate routing decisions without + inspecting model internals or loading a heavyweight instance first. + """ + return { + "model_name": "OpenMythos", + "attn_type": self.attn_type, + "supports_kv_cache": True, + "supports_incremental_decode": True, + "uses_moe": True, + "uses_act_halting": True, + "uses_lti_injection": True, + "max_context_tokens": self.max_seq_len, + "max_loop_iters": self.max_loop_iters, + "max_output_tokens": self.max_output_tokens, + "cache_layout": { + "prelude": [f"prelude_{i}" for i in range(self.prelude_layers)], + "recurrent": "recurrent_loop_{t}", + "coda": [f"coda_{i}" for i in range(self.coda_layers)], + }, + "attention_backend": ( + "multi_latent" if self.attn_type == "mla" else "grouped_query" + ), + } diff --git a/open_mythos/main.py b/open_mythos/main.py index 65b0fa8..84ab50b 100644 --- a/open_mythos/main.py +++ b/open_mythos/main.py @@ -1,10 +1,11 @@ -from dataclasses import dataclass from typing import Optional import torch import torch.nn as nn import torch.nn.functional as F +from open_mythos.config import MythosConfig + try: from flash_attn import flash_attn_func @@ -13,74 +14,6 @@ _HAS_FLASH_ATTN = False -@dataclass -class MythosConfig: - """ - Hyperparameter configuration for OpenMythos. - - Core: - vocab_size -- token vocabulary size - dim -- model hidden dimension - n_heads -- number of query attention heads - n_kv_heads -- number of key/value heads (GQA; ignored by MLA) - max_seq_len -- maximum sequence length for RoPE precomputation - max_loop_iters -- default recurrent loop depth T at inference - prelude_layers -- number of standard transformer layers before the loop - coda_layers -- number of standard transformer layers after the loop - - Attention (attn_type selects between the two): - attn_type -- "gqa" for Grouped Query Attention, "mla" for Multi-Latent Attention - kv_lora_rank -- [MLA] compressed KV latent dimension stored in the cache - q_lora_rank -- [MLA] compressed Q latent dimension - qk_rope_head_dim-- [MLA] per-head dims that receive RoPE - qk_nope_head_dim-- [MLA] per-head dims without positional encoding - v_head_dim -- [MLA] per-head value dimension - - MoE FFN (used inside the recurrent block): - n_experts -- total number of routed expert FFNs - n_shared_experts-- number of always-active shared experts - n_experts_per_tok-- top-K experts selected per token by the router - expert_dim -- hidden dimension inside each fine-grained expert - - Other: - act_threshold -- ACT halting threshold (cumulative probability to stop looping) - rope_theta -- RoPE base frequency - lora_rank -- rank of the per-loop depth-wise LoRA adapter - """ - - vocab_size: int = 32000 - dim: int = 2048 - n_heads: int = 16 - n_kv_heads: int = 4 # GQA: fewer KV heads than Q heads - max_seq_len: int = 4096 - max_loop_iters: int = 16 # T — recurrent depth at inference - prelude_layers: int = 2 - coda_layers: int = 2 - # Attention type: "gqa" | "mla" - attn_type: str = "mla" - # MLA params (only used when attn_type="mla") - kv_lora_rank: int = 512 # compressed KV latent cached instead of full K/V - q_lora_rank: int = 1536 # compressed Q latent dim - qk_rope_head_dim: int = 64 # per-head dims that receive RoPE - qk_nope_head_dim: int = 128 # per-head dims without RoPE - v_head_dim: int = 128 # per-head value dim - # MoE - n_experts: int = 64 - n_shared_experts: int = 2 - n_experts_per_tok: int = 4 # top-K routed - expert_dim: int = 512 # fine-grained: dim // (n_experts // n_experts_per_tok) - # ACT halting - act_threshold: float = 0.99 - # RoPE - rope_theta: float = 500000.0 - # LoRA depth adaptation - lora_rank: int = 16 - # Maximum tokens to generate per forward pass - max_output_tokens: int = 4096 - # Dropout (set 0.0 to disable; 0.1 is standard for pretraining) - dropout: float = 0.0 - - # --------------------------------------------------------------------------- # RMSNorm # --------------------------------------------------------------------------- @@ -965,6 +898,10 @@ def _init_weights(self) -> None: elif isinstance(m, nn.Embedding): nn.init.normal_(m.weight, std=0.02) + def describe(self) -> dict[str, object]: + """Return the stable runtime profile for orchestration and routing.""" + return self.cfg.runtime_profile() + @staticmethod def _causal_mask( seq_len: int, device: torch.device, dtype: torch.dtype diff --git a/open_mythos/tokenizer.py b/open_mythos/tokenizer.py index fadb3a5..d9ecd12 100644 --- a/open_mythos/tokenizer.py +++ b/open_mythos/tokenizer.py @@ -1,4 +1,7 @@ -from transformers import AutoTokenizer +try: + from transformers import AutoTokenizer +except ImportError: # pragma: no cover - dependency may be absent in lightweight envs + AutoTokenizer = None DEFAULT_MODEL_ID = "openai/gpt-oss-20b" @@ -27,6 +30,10 @@ def __init__(self, model_id: str = DEFAULT_MODEL_ID): Args: model_id (str): HuggingFace model identifier or path to tokenizer files. """ + if AutoTokenizer is None: + raise ModuleNotFoundError( + "transformers is required to construct MythosTokenizer" + ) self.tokenizer = AutoTokenizer.from_pretrained(model_id) @property @@ -62,3 +69,13 @@ def decode(self, token_ids: list[int]) -> str: str: Decoded string representation of the token IDs. """ return self.tokenizer.decode(token_ids, skip_special_tokens=True) + + +def load_tokenizer(model_id: str = DEFAULT_MODEL_ID) -> MythosTokenizer: + """Construct a tokenizer wrapper using the default or requested model id.""" + return MythosTokenizer(model_id=model_id) + + +def get_vocab_size(model_id: str = DEFAULT_MODEL_ID) -> int: + """Return the tokenizer vocabulary size for a given model id.""" + return load_tokenizer(model_id=model_id).vocab_size diff --git a/open_mythos/variants.py b/open_mythos/variants.py index 83f7dd4..0133714 100644 --- a/open_mythos/variants.py +++ b/open_mythos/variants.py @@ -1,4 +1,4 @@ -from open_mythos.main import MythosConfig +from open_mythos.config import MythosConfig # Parameter budget breakdown per variant: # total ≈ embed + prelude/coda dense blocks + recurrent MLA + MoE diff --git a/tests/test_main.py b/tests/test_main.py index c54c462..1245d4e 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -543,6 +543,40 @@ def test_single_loop_runs(self): assert out.shape == (B, T, self.cfg.dim) +# --------------------------------------------------------------------------- +# MythosConfig / OpenMythos introspection +# --------------------------------------------------------------------------- + + +class TestMythosConfigIntrospection: + def test_runtime_profile_includes_cache_layout(self): + cfg = mla_cfg(prelude_layers=2, coda_layers=1, max_loop_iters=4) + profile = cfg.runtime_profile() + + assert profile["model_name"] == "OpenMythos" + assert profile["attn_type"] == "mla" + assert profile["supports_kv_cache"] is True + assert profile["supports_incremental_decode"] is True + assert profile["cache_layout"]["prelude"] == ["prelude_0", "prelude_1"] + assert profile["cache_layout"]["recurrent"] == "recurrent_loop_{t}" + assert profile["cache_layout"]["coda"] == ["coda_0"] + + def test_to_dict_round_trips_dataclass_fields(self): + cfg = gqa_cfg(dim=128, max_output_tokens=256) + data = cfg.to_dict() + + assert data["dim"] == 128 + assert data["max_output_tokens"] == 256 + assert data["attn_type"] == "gqa" + + +class TestOpenMythosIntrospection: + def test_describe_matches_config_profile(self): + cfg = gqa_cfg() + model = OpenMythos(cfg) + + assert model.describe() == cfg.runtime_profile() + # --------------------------------------------------------------------------- # OpenMythos — GQA mode # --------------------------------------------------------------------------- diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index fab7533..9c17b98 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -1,5 +1,9 @@ import pytest -from open_mythos.tokenizer import MythosTokenizer +from open_mythos.tokenizer import ( + MythosTokenizer, + get_vocab_size, + load_tokenizer, +) @pytest.fixture(scope="module") @@ -64,6 +68,15 @@ def test_custom_model_id(): assert tok.vocab_size > 0 +def test_load_tokenizer_helper(): + tok = load_tokenizer() + assert isinstance(tok, MythosTokenizer) + + +def test_get_vocab_size_helper(tokenizer): + assert get_vocab_size() == tokenizer.vocab_size + + def test_vocab_size_consistent(tokenizer): outer = tokenizer.vocab_size inner = tokenizer.tokenizer.vocab_size From 95fcf0a74a60f39927858000d440d620c00ee245 Mon Sep 17 00:00:00 2001 From: anphonic Date: Thu, 23 Apr 2026 23:23:51 -0500 Subject: [PATCH 2/7] Make tokenizer imports lazy Delay importing transformers until a tokenizer is actually constructed so package imports stay lightweight. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- open_mythos/tokenizer.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/open_mythos/tokenizer.py b/open_mythos/tokenizer.py index d9ecd12..e7b1a93 100644 --- a/open_mythos/tokenizer.py +++ b/open_mythos/tokenizer.py @@ -1,11 +1,16 @@ -try: - from transformers import AutoTokenizer -except ImportError: # pragma: no cover - dependency may be absent in lightweight envs - AutoTokenizer = None - DEFAULT_MODEL_ID = "openai/gpt-oss-20b" +def _load_auto_tokenizer(): + try: + from transformers import AutoTokenizer + except ImportError as exc: # pragma: no cover - dependency may be absent + raise ModuleNotFoundError( + "transformers is required to construct MythosTokenizer" + ) from exc + return AutoTokenizer + + class MythosTokenizer: """ HuggingFace tokenizer wrapper for OpenMythos. @@ -30,11 +35,7 @@ def __init__(self, model_id: str = DEFAULT_MODEL_ID): Args: model_id (str): HuggingFace model identifier or path to tokenizer files. """ - if AutoTokenizer is None: - raise ModuleNotFoundError( - "transformers is required to construct MythosTokenizer" - ) - self.tokenizer = AutoTokenizer.from_pretrained(model_id) + self.tokenizer = _load_auto_tokenizer().from_pretrained(model_id) @property def vocab_size(self) -> int: From bda5ca98e33e5792a0deeac56e2cff2777c78c92 Mon Sep 17 00:00:00 2001 From: anphonic Date: Thu, 23 Apr 2026 23:26:30 -0500 Subject: [PATCH 3/7] Validate attention type and update docs Reject unsupported attn_type values and refresh the architecture note to match the new module layout. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/copilot-instructions.md | 2 +- open_mythos/config.py | 6 ++++++ tests/test_main.py | 4 ++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index e3ec543..e466574 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -15,7 +15,7 @@ ## High-level architecture -- `open_mythos/main.py` is the primary implementation. It defines `MythosConfig`, both attention backends, the MoE FFN, the recurrent block, and the top-level `OpenMythos` model. +- `open_mythos/config.py` defines `MythosConfig`. `open_mythos/main.py` is the primary implementation for both attention backends, the MoE FFN, the recurrent block, and the top-level `OpenMythos` model. - The model pipeline is fixed: token embedding -> Prelude (`cfg.prelude_layers` dense `TransformerBlock`s run once) -> Recurrent Block (one shared `TransformerBlock` looped `n_loops` times with LoRA, ACT halting, and LTI-stable input injection) -> Coda (`cfg.coda_layers` dense `TransformerBlock`s run once) -> `RMSNorm` -> tied LM head. - The key architectural invariant is that the encoded input `e` is frozen after the Prelude and injected on every recurrent iteration. The recurrent update path is `loop_index_embedding -> TransformerBlock(use_moe=True) -> LoRAAdapter -> LTIInjection -> ACTHalting`. - `cfg.attn_type` switches the whole attention/cache path. `GQAttention` stores full K/V by layer, while `MLAttention` stores compressed `c_kv` plus `k_rope` and reconstructs K/V on demand. `OpenMythos.forward()` also switches RoPE buffers based on `attn_type`. diff --git a/open_mythos/config.py b/open_mythos/config.py index 54ae93a..ee12f49 100644 --- a/open_mythos/config.py +++ b/open_mythos/config.py @@ -68,6 +68,12 @@ class MythosConfig: # Dropout (set 0.0 to disable; 0.1 is standard for pretraining) dropout: float = 0.0 + def __post_init__(self) -> None: + if self.attn_type not in {"gqa", "mla"}: + raise ValueError( + f"Unsupported attn_type {self.attn_type!r}; expected 'gqa' or 'mla'" + ) + def to_dict(self) -> dict[str, object]: """Return a plain-Python config dictionary for serialization.""" return asdict(self) diff --git a/tests/test_main.py b/tests/test_main.py index 1245d4e..54063fe 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -569,6 +569,10 @@ def test_to_dict_round_trips_dataclass_fields(self): assert data["max_output_tokens"] == 256 assert data["attn_type"] == "gqa" + def test_invalid_attn_type_raises(self): + with pytest.raises(ValueError, match="Unsupported attn_type"): + gqa_cfg(attn_type="mlaa") + class TestOpenMythosIntrospection: def test_describe_matches_config_profile(self): From b5388b9804a376499be42275a5110aadf5edffbb Mon Sep 17 00:00:00 2001 From: anphonic Date: Thu, 23 Apr 2026 23:31:36 -0500 Subject: [PATCH 4/7] Add MythosConfig sanity checks Validate core config fields up front so invalid shapes and thresholds fail fast. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- open_mythos/config.py | 44 +++++++++++++++++++++++++++++++++++++++++++ tests/test_main.py | 6 ++++++ 2 files changed, 50 insertions(+) diff --git a/open_mythos/config.py b/open_mythos/config.py index ee12f49..5688b33 100644 --- a/open_mythos/config.py +++ b/open_mythos/config.py @@ -73,6 +73,50 @@ def __post_init__(self) -> None: raise ValueError( f"Unsupported attn_type {self.attn_type!r}; expected 'gqa' or 'mla'" ) + if self.vocab_size <= 0: + raise ValueError("vocab_size must be positive") + if self.dim <= 0: + raise ValueError("dim must be positive") + if self.n_heads <= 0: + raise ValueError("n_heads must be positive") + if self.n_kv_heads <= 0: + raise ValueError("n_kv_heads must be positive") + if self.max_seq_len <= 0: + raise ValueError("max_seq_len must be positive") + if self.max_loop_iters <= 0: + raise ValueError("max_loop_iters must be positive") + if self.prelude_layers < 0: + raise ValueError("prelude_layers must be non-negative") + if self.coda_layers < 0: + raise ValueError("coda_layers must be non-negative") + if self.kv_lora_rank <= 0: + raise ValueError("kv_lora_rank must be positive") + if self.q_lora_rank <= 0: + raise ValueError("q_lora_rank must be positive") + if self.qk_rope_head_dim <= 0: + raise ValueError("qk_rope_head_dim must be positive") + if self.qk_nope_head_dim <= 0: + raise ValueError("qk_nope_head_dim must be positive") + if self.v_head_dim <= 0: + raise ValueError("v_head_dim must be positive") + if self.n_experts <= 0: + raise ValueError("n_experts must be positive") + if self.n_shared_experts < 0: + raise ValueError("n_shared_experts must be non-negative") + if self.n_experts_per_tok <= 0: + raise ValueError("n_experts_per_tok must be positive") + if self.expert_dim <= 0: + raise ValueError("expert_dim must be positive") + if not 0.0 < self.act_threshold <= 1.0: + raise ValueError("act_threshold must be in the interval (0, 1]") + if self.rope_theta <= 0: + raise ValueError("rope_theta must be positive") + if self.lora_rank <= 0: + raise ValueError("lora_rank must be positive") + if self.max_output_tokens <= 0: + raise ValueError("max_output_tokens must be positive") + if self.dropout < 0: + raise ValueError("dropout must be non-negative") def to_dict(self) -> dict[str, object]: """Return a plain-Python config dictionary for serialization.""" diff --git a/tests/test_main.py b/tests/test_main.py index 54063fe..bc8cdb3 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -573,6 +573,12 @@ def test_invalid_attn_type_raises(self): with pytest.raises(ValueError, match="Unsupported attn_type"): gqa_cfg(attn_type="mlaa") + def test_invalid_numeric_fields_raise(self): + with pytest.raises(ValueError, match="dim must be positive"): + gqa_cfg(dim=0) + with pytest.raises(ValueError, match="act_threshold must be in the interval"): + gqa_cfg(act_threshold=1.1) + class TestOpenMythosIntrospection: def test_describe_matches_config_profile(self): From b20e8ff48e7665e691be4af4acb5d17f0fe23844 Mon Sep 17 00:00:00 2001 From: anphonic Date: Thu, 23 Apr 2026 23:44:07 -0500 Subject: [PATCH 5/7] Tighten MythosConfig validation Validate head-size and attention-grouping invariants up front so invalid configs fail before runtime. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- open_mythos/config.py | 11 +++++++++++ tests/test_main.py | 6 ++++++ 2 files changed, 17 insertions(+) diff --git a/open_mythos/config.py b/open_mythos/config.py index 5688b33..83862ed 100644 --- a/open_mythos/config.py +++ b/open_mythos/config.py @@ -117,6 +117,17 @@ def __post_init__(self) -> None: raise ValueError("max_output_tokens must be positive") if self.dropout < 0: raise ValueError("dropout must be non-negative") + if self.dim % self.n_heads != 0: + raise ValueError("dim must be divisible by n_heads") + if (self.dim // self.n_heads) % 2 != 0: + raise ValueError("dim // n_heads must be even for RoPE") + if self.attn_type == "gqa": + if self.n_kv_heads > self.n_heads: + raise ValueError("n_kv_heads must be less than or equal to n_heads") + if self.n_heads % self.n_kv_heads != 0: + raise ValueError("n_heads must be divisible by n_kv_heads") + if self.attn_type == "mla" and self.qk_rope_head_dim % 2 != 0: + raise ValueError("qk_rope_head_dim must be even for MLA RoPE") def to_dict(self) -> dict[str, object]: """Return a plain-Python config dictionary for serialization.""" diff --git a/tests/test_main.py b/tests/test_main.py index bc8cdb3..d1aa7ec 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -578,6 +578,12 @@ def test_invalid_numeric_fields_raise(self): gqa_cfg(dim=0) with pytest.raises(ValueError, match="act_threshold must be in the interval"): gqa_cfg(act_threshold=1.1) + with pytest.raises(ValueError, match="dim must be divisible by n_heads"): + gqa_cfg(dim=66, n_heads=8) + with pytest.raises(ValueError, match="n_heads must be divisible by n_kv_heads"): + gqa_cfg(n_kv_heads=3) + with pytest.raises(ValueError, match="qk_rope_head_dim must be even"): + mla_cfg(qk_rope_head_dim=7) class TestOpenMythosIntrospection: From 936d24d34fc5dcc58bd4c58bb4a12b5e9380c436 Mon Sep 17 00:00:00 2001 From: anphonic Date: Thu, 23 Apr 2026 23:47:53 -0500 Subject: [PATCH 6/7] Tighten config bounds Validate expert top-k and dropout probability bounds alongside the existing shape checks. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- open_mythos/config.py | 8 ++++++-- tests/test_main.py | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/open_mythos/config.py b/open_mythos/config.py index 83862ed..357b58b 100644 --- a/open_mythos/config.py +++ b/open_mythos/config.py @@ -105,6 +105,10 @@ def __post_init__(self) -> None: raise ValueError("n_shared_experts must be non-negative") if self.n_experts_per_tok <= 0: raise ValueError("n_experts_per_tok must be positive") + if self.n_experts_per_tok > self.n_experts: + raise ValueError( + "n_experts_per_tok must be less than or equal to n_experts" + ) if self.expert_dim <= 0: raise ValueError("expert_dim must be positive") if not 0.0 < self.act_threshold <= 1.0: @@ -115,8 +119,8 @@ def __post_init__(self) -> None: raise ValueError("lora_rank must be positive") if self.max_output_tokens <= 0: raise ValueError("max_output_tokens must be positive") - if self.dropout < 0: - raise ValueError("dropout must be non-negative") + if not 0.0 <= self.dropout <= 1.0: + raise ValueError("dropout must be in the interval [0, 1]") if self.dim % self.n_heads != 0: raise ValueError("dim must be divisible by n_heads") if (self.dim // self.n_heads) % 2 != 0: diff --git a/tests/test_main.py b/tests/test_main.py index d1aa7ec..81703ec 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -584,6 +584,10 @@ def test_invalid_numeric_fields_raise(self): gqa_cfg(n_kv_heads=3) with pytest.raises(ValueError, match="qk_rope_head_dim must be even"): mla_cfg(qk_rope_head_dim=7) + with pytest.raises(ValueError, match="n_experts_per_tok must be less than or equal"): + gqa_cfg(n_experts=2, n_experts_per_tok=3) + with pytest.raises(ValueError, match="dropout must be in the interval"): + gqa_cfg(dropout=1.1) class TestOpenMythosIntrospection: From 2bd4d05b8ae8294a43a1d1bc41816c264d3513c5 Mon Sep 17 00:00:00 2001 From: anphonic Date: Fri, 24 Apr 2026 18:08:04 -0500 Subject: [PATCH 7/7] Cache lazy OpenMythos exports Store resolved lazy exports on the package module so repeated introspection does not rebuild the export map. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- open_mythos/__init__.py | 42 +++++++++-------------------------------- tests/test_main.py | 10 ++++++++++ 2 files changed, 19 insertions(+), 33 deletions(-) diff --git a/open_mythos/__init__.py b/open_mythos/__init__.py index 28f87ed..f7dcefc 100644 --- a/open_mythos/__init__.py +++ b/open_mythos/__init__.py @@ -1,3 +1,5 @@ +from importlib import import_module + from open_mythos.config import MythosConfig from open_mythos.tokenizer import MythosTokenizer, get_vocab_size, load_tokenizer from open_mythos.variants import ( @@ -26,6 +28,7 @@ "loop_index_embedding", "precompute_rope_freqs", } +_MAIN_MODULE = None __all__ = [ "MythosConfig", @@ -58,37 +61,10 @@ def __getattr__(name: str): if name in _MAIN_EXPORTS: - from open_mythos.main import ( - ACTHalting, - Expert, - GQAttention, - LoRAAdapter, - LTIInjection, - MLAttention, - MoEFFN, - OpenMythos, - RecurrentBlock, - RMSNorm, - TransformerBlock, - apply_rope, - loop_index_embedding, - precompute_rope_freqs, - ) - - return { - "ACTHalting": ACTHalting, - "Expert": Expert, - "GQAttention": GQAttention, - "LoRAAdapter": LoRAAdapter, - "LTIInjection": LTIInjection, - "MLAttention": MLAttention, - "MoEFFN": MoEFFN, - "OpenMythos": OpenMythos, - "RecurrentBlock": RecurrentBlock, - "RMSNorm": RMSNorm, - "TransformerBlock": TransformerBlock, - "apply_rope": apply_rope, - "loop_index_embedding": loop_index_embedding, - "precompute_rope_freqs": precompute_rope_freqs, - }[name] + global _MAIN_MODULE + if _MAIN_MODULE is None: + _MAIN_MODULE = import_module("open_mythos.main") + value = getattr(_MAIN_MODULE, name) + globals()[name] = value + return value raise AttributeError(f"module 'open_mythos' has no attribute {name!r}") diff --git a/tests/test_main.py b/tests/test_main.py index 81703ec..3786be8 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,5 +1,6 @@ import torch import pytest +import open_mythos from open_mythos.main import ( ACTHalting, Expert, @@ -597,6 +598,15 @@ def test_describe_matches_config_profile(self): assert model.describe() == cfg.runtime_profile() + +class TestPackageLazyExports: + def test_lazy_export_is_cached_on_module(self): + first = open_mythos.OpenMythos + second = open_mythos.OpenMythos + + assert first is second + assert open_mythos.__dict__["OpenMythos"] is first + # --------------------------------------------------------------------------- # OpenMythos — GQA mode # ---------------------------------------------------------------------------