From 13802d6d56bc8848f409ec5cbee596985ff9be59 Mon Sep 17 00:00:00 2001 From: Kailas Mahavarkar <66670953+KailasMahavarkar@users.noreply.github.com> Date: Mon, 20 Apr 2026 22:29:15 +0530 Subject: [PATCH 1/3] refactor(bonsai): compact v5 unified 16-verb grammar (option A) Before: U/F/D compact verbs for the 3 ingest paths, with an `!` escape hatch to pass any other DSL through verbatim. Model had to remember two tokenizations and the escape never compressed. After: one positional verb table covering the whole common DSL surface - ingest (U/F/D), edges (E), retrieval (RM/SM/LX/AQ), walks (RL/TR/AN/SG), sys/vault (SS/SC/SH/ST/SX/VS). Python expands each to the full DSL line. Every path hits the same ~3-5x output-token reduction now, not just ingest. Changes: - Replace `_parse_compact_output` with a dispatch table of verb handlers built from small factories (_h_upsert, _h_fact, _h_drop, _h_edge, _h_query, _h_walk, _h_plain). - Swap `CompactTurn.raw_dsl` for `CompactTurn.statements`: pre-rendered DSL lines for every non-ingest verb. `_synthesize_dsl` appends them verbatim after the message node + mention wiring + fact updates. - SKILL.md rewritten to v5: 16 verbs documented, examples per common path, ~900 tokens. - Unit tests: drop `!`-escape block, add per-verb coverage for edges, all 4 retrieval verbs, all 4 walks, all 6 sys/vault ops, and the aliased long forms (REMEMBER/SIMILAR/RECALL/TRAVERSE). Test results: 78/78 bonsai unit tests pass; full suite 1880 pass, 101 skip (unchanged). 
--- src/graphstore/bonsai_ingestor.py | 193 +++++++++++--- tests/test_bonsai_ingestor.py | 247 ++++++++++++++++-- .../graphstore-bonsai-dsl-compact/SKILL.md | 136 +++++++--- 3 files changed, 461 insertions(+), 115 deletions(-) diff --git a/src/graphstore/bonsai_ingestor.py b/src/graphstore/bonsai_ingestor.py index 9a91cec..924b2d2 100644 --- a/src/graphstore/bonsai_ingestor.py +++ b/src/graphstore/bonsai_ingestor.py @@ -227,66 +227,171 @@ def _scrape_belief_updates( # -------------------------------------------------------------------- -# Compact output mode: LLM emits 3 tagged lines (ENTS/BELIEFS/RETRACTS); -# we synthesize the full DSL in Python. 3-5x fewer output tokens than the -# full-DSL mode, measured on 4B TQ1_0. See tools/skills/graphstore-bonsai- -# dsl-compact/SKILL.md for the exact output contract. +# Compact output mode ("caveman v5"): LLM emits one verb per line covering +# the whole DSL surface. Python inflates to full DSL. Measured ~3-5x fewer +# output tokens than raw DSL on every path. See +# tools/skills/graphstore-bonsai-dsl-compact/SKILL.md for the contract. +# +# Verbs fall into three groups: +# 1. Fact-state (U / F / D): populate entities / beliefs / retracts slots +# so _synthesize_dsl can auto-wire mention edges and cross-message +# belief identity works. +# 2. Edge (E): pre-renders a CREATE EDGE line. +# 3. Retrieval (RM/SM/LX/AQ), walks (RL/TR/AN/SG), sys/vault +# (SS/SC/SH/ST/SX/VS): each pre-renders one full DSL line directly. +# +# Groups 2 and 3 accumulate in turn.statements and get appended verbatim +# after the mention wiring and fact updates. +# +# Unknown verbs and malformed lines are silently dropped (LLM may drift; +# parser is lax so a single bad line doesn't lose the whole turn). # -------------------------------------------------------------------- -# One "key"="value" pair, capturing both sides. "key" matches ent: or fact: -# prefixes; "value" is everything between the escaped-quote-aware delimiters. 
-_COMPACT_KV_RE = re.compile(r'"([^"\\]+(?:\\.[^"\\]*)*)"\s*=\s*"([^"\\]*(?:\\.[^"\\]*)*)"') -# Bare-id list item (RETRACTS uses these). -_COMPACT_ID_RE = re.compile(r'"([^"\\]+(?:\\.[^"\\]*)*)"') - @dataclass class CompactTurn: - """Parsed structured output of a compact-mode LLM call.""" + """Parsed structured output of a compact-mode LLM call (v5 option A).""" + + entities: list[tuple[str, str]] = field(default_factory=list) + beliefs: list[tuple[str, str]] = field(default_factory=list) + retracts: list[str] = field(default_factory=list) + statements: list[str] = field(default_factory=list) + + +def _dsl_escape(s: str) -> str: + """Escape a Python string for safe embedding inside a DSL "..." literal.""" + return s.replace("\\", "\\\\").replace('"', '\\"') + + +def _h_upsert(turn: CompactTurn, ln: str) -> None: + """U """ + parts = ln.split(None, 2) + if len(parts) < 3: + return + slug = parts[1].removeprefix("ent:").strip('"') + name = parts[2].strip().strip('"') + if slug and name: + turn.entities.append((f"ent:{slug}", name)) + + +def _h_fact(turn: CompactTurn, ln: str) -> None: + """F """ + parts = ln.split(None, 2) + if len(parts) < 3: + return + topic = parts[1].removeprefix("fact:").strip('"') + value = parts[2].strip().strip('"') + if topic and value: + turn.beliefs.append((f"fact:{topic}", value)) + + +def _h_drop(turn: CompactTurn, ln: str) -> None: + """D """ + parts = ln.split() + if len(parts) < 2: + return + topic = parts[1].removeprefix("fact:").strip('"') + if topic: + turn.retracts.append(f"fact:{topic}") + + +def _h_edge(turn: CompactTurn, ln: str) -> None: + """E (ids include ent:/fact: prefix)""" + parts = ln.split() + if len(parts) < 4: + return + from_id = parts[1].strip('"') + to_id = parts[2].strip('"') + kind = parts[3].strip('"') + if from_id and to_id and kind: + turn.statements.append( + f'CREATE EDGE "{_dsl_escape(from_id)}" -> "{_dsl_escape(to_id)}" ' + f'kind = "{_dsl_escape(kind)}"' + ) + + +def _h_query(template: str): + """Factory 
for verbs whose argument is free-text (wrapped in DSL quotes).""" + def h(turn: CompactTurn, ln: str) -> None: + parts = ln.split(None, 1) + if len(parts) < 2 or not parts[1].strip(): + return + q = _dsl_escape(parts[1].strip()) + turn.statements.append(template.replace("{q}", q)) + return h + - entities: list[tuple[str, str]] = field(default_factory=list) # [(ent_id, name), ...] - beliefs: list[tuple[str, str]] = field(default_factory=list) # [(fact_id, value), ...] - retracts: list[str] = field(default_factory=list) # [fact_id, ...] +def _h_walk(template: str): + """Factory for verbs whose argument is a single anchor id.""" + def h(turn: CompactTurn, ln: str) -> None: + parts = ln.split() + if len(parts) < 2: + return + anchor = parts[1].strip('"') + if anchor: + turn.statements.append(template.replace("{id}", _dsl_escape(anchor))) + return h + + +def _h_plain(template: str): + """Factory for zero-arg verbs (SS/SC/SH/ST/VS).""" + def h(turn: CompactTurn, _ln: str) -> None: + turn.statements.append(template) + return h + + +_COMPACT_HANDLERS: dict[str, Any] = { + "U": _h_upsert, "UP": _h_upsert, "UPSERT": _h_upsert, + "F": _h_fact, "FACT": _h_fact, "B": _h_fact, "ASSERT": _h_fact, + "D": _h_drop, "DROP": _h_drop, "R": _h_drop, "RETRACT": _h_drop, + "E": _h_edge, "EDGE": _h_edge, + "RM": _h_query('REMEMBER "{q}" LIMIT 10'), + "REMEMBER": _h_query('REMEMBER "{q}" LIMIT 10'), + "SM": _h_query('SIMILAR TO "{q}" LIMIT 10'), + "SIMILAR": _h_query('SIMILAR TO "{q}" LIMIT 10'), + "LX": _h_query('LEXICAL SEARCH "{q}" LIMIT 10'), + "LEXICAL": _h_query('LEXICAL SEARCH "{q}" LIMIT 10'), + "AQ": _h_query('ANSWER "{q}"'), + "ANSWER": _h_query('ANSWER "{q}"'), + "RL": _h_walk('RECALL FROM "{id}" DEPTH 2'), + "RECALL": _h_walk('RECALL FROM "{id}" DEPTH 2'), + "TR": _h_walk('TRAVERSE FROM "{id}" DEPTH 2'), + "TRAVERSE": _h_walk('TRAVERSE FROM "{id}" DEPTH 2'), + "AN": _h_walk('ANCESTORS OF "{id}" DEPTH 3'), + "ANCESTORS": _h_walk('ANCESTORS OF "{id}" DEPTH 3'), + "SG": 
_h_walk('SUBGRAPH FROM "{id}" DEPTH 2'), + "SUBGRAPH": _h_walk('SUBGRAPH FROM "{id}" DEPTH 2'), + "SS": _h_plain('SYS SNAPSHOT'), + "SC": _h_plain('SYS COMPACT'), + "SH": _h_plain('SYS HEALTH'), + "ST": _h_plain('SYS STATS'), + "SX": _h_query('SYS EXPLAIN REMEMBER "{q}"'), + "VS": _h_plain('VAULT SYNC'), +} def _parse_compact_output(cleaned: str) -> CompactTurn: - """Read the 3-line ENTS/BELIEFS/RETRACTS output. + """Parse v5 unified verb-positional output. - Tolerant: missing sections default to empty, unknown prefixes ignored, - case-insensitive on section labels, honors `none` as empty. + Each non-blank, non-fence line is one verb + positional args. U/F/D + populate the entities/beliefs/retracts slots for fact-state wiring; all + other verbs render directly to pre-built DSL lines in turn.statements. + Unknown verbs and malformed lines are silently dropped. """ turn = CompactTurn() for raw_ln in cleaned.splitlines(): ln = raw_ln.strip() if not ln or _FENCE_RE.match(ln): continue - lower = ln.lower() - if lower.startswith("ents:"): - body = ln[5:].strip() - if body.lower() in ("none", ""): - continue - for m in _COMPACT_KV_RE.finditer(body): - turn.entities.append((m.group(1), m.group(2))) - elif lower.startswith("beliefs:"): - body = ln[8:].strip() - if body.lower() in ("none", ""): - continue - for m in _COMPACT_KV_RE.finditer(body): - turn.beliefs.append((m.group(1), m.group(2))) - elif lower.startswith("retracts:"): - body = ln[9:].strip() - if body.lower() in ("none", ""): - continue - for m in _COMPACT_ID_RE.finditer(body): - turn.retracts.append(m.group(1)) + head = ln.split(maxsplit=1)[0] + verb = head.upper().rstrip(":") + handler = _COMPACT_HANDLERS.get(verb) + if handler is None: + continue + handler(turn, ln) return turn -def _dsl_escape(s: str) -> str: - """Escape a Python string for safe embedding inside a DSL "..." 
literal.""" - return s.replace("\\", "\\\\").replace('"', '\\"') - - def _synthesize_dsl( turn: CompactTurn, *, @@ -297,13 +402,13 @@ def _synthesize_dsl( ) -> list[str]: """Build the full DSL statement list from the parsed compact output. - Deterministic. Same CompactTurn + same identifiers always produce the - same list of statements. Emits: + Deterministic. Emits in order: 1. CREATE NODE for the message (DOCUMENT = user text). 2. UPSERT NODE per entity + matching CREATE EDGE kind = "mentions". Entities are deduped by id (first wins). 3. RETRACT per retract (before any ASSERT). 4. ASSERT per belief. + 5. Pre-rendered statements (edges, queries, walks, sys ops) verbatim. """ out: list[str] = [] text_esc = _dsl_escape(text) @@ -343,6 +448,8 @@ def _synthesize_dsl( f'value = "{_dsl_escape(value)}" CONFIDENCE 0.9 SOURCE "{msg_esc}"' ) + out.extend(turn.statements) + return out diff --git a/tests/test_bonsai_ingestor.py b/tests/test_bonsai_ingestor.py index e231a22..dc22b9d 100644 --- a/tests/test_bonsai_ingestor.py +++ b/tests/test_bonsai_ingestor.py @@ -335,62 +335,251 @@ def test_ingestor_reset_facts_clears_state(tmp_path: Path): # Compact mode: parser + DSL synthesis # -------------------------------------------------------------------- -def test_parse_compact_all_three_sections(): - out = '''ENTS: "ent:priya"="Priya", "ent:openai"="OpenAI" -BELIEFS: "fact:color"="blue" -RETRACTS: "fact:old"''' +def test_parse_compact_all_three_verbs(): + out = '''U priya Priya +U openai OpenAI +F color blue +D old''' turn = _parse_compact_output(out) assert turn.entities == [("ent:priya", "Priya"), ("ent:openai", "OpenAI")] assert turn.beliefs == [("fact:color", "blue")] assert turn.retracts == ["fact:old"] -def test_parse_compact_none_values_are_empty(): - out = '''ENTS: none -BELIEFS: none -RETRACTS: none''' - turn = _parse_compact_output(out) +def test_parse_compact_empty_output_is_empty_turn(): + turn = _parse_compact_output("") assert turn.entities == [] assert turn.beliefs 
== [] assert turn.retracts == [] -def test_parse_compact_missing_sections_default_empty(): - out = 'ENTS: "ent:x"="X"' - turn = _parse_compact_output(out) - assert turn.entities == [("ent:x", "X")] +def test_parse_compact_entities_only(): + turn = _parse_compact_output("U kailash Kailash") + assert turn.entities == [("ent:kailash", "Kailash")] assert turn.beliefs == [] assert turn.retracts == [] -def test_parse_compact_case_insensitive(): - out = 'ents: "ent:x"="X"\nBELIEFS: "fact:y"="Y"' +def test_parse_compact_multi_word_name_joined_by_whitespace(): + """Rest-of-line is the name; split on first 2 whitespace runs only.""" + turn = _parse_compact_output("U sf San Francisco") + assert turn.entities == [("ent:sf", "San Francisco")] + + +def test_parse_compact_case_insensitive_verbs(): + out = "u priya Priya\nf color blue\nd old" turn = _parse_compact_output(out) - assert turn.entities == [("ent:x", "X")] - assert turn.beliefs == [("fact:y", "Y")] + assert turn.entities == [("ent:priya", "Priya")] + assert turn.beliefs == [("fact:color", "blue")] + assert turn.retracts == ["fact:old"] + + +def test_parse_compact_aliases_upsert_assert_retract(): + out = "UPSERT priya Priya\nASSERT color blue\nRETRACT old" + turn = _parse_compact_output(out) + assert turn.entities == [("ent:priya", "Priya")] + assert turn.beliefs == [("fact:color", "blue")] + assert turn.retracts == ["fact:old"] def test_parse_compact_tolerates_fence_lines(): - out = '''``` -ENTS: "ent:x"="X" -```''' + out = "```\nU x X\n```" turn = _parse_compact_output(out) assert turn.entities == [("ent:x", "X")] -def test_parse_compact_escaped_quote_in_value(): - out = 'ENTS: "ent:a"="Alice \\"Ace\\" Smith"' +def test_parse_compact_strips_prefix_if_model_adds_it(): + """Model sometimes emits 'U ent:x X'; we normalize to slug-only.""" + turn = _parse_compact_output('U ent:priya Priya') + assert turn.entities == [("ent:priya", "Priya")] + + turn2 = _parse_compact_output('F fact:color blue') + assert turn2.beliefs == 
[("fact:color", "blue")] + + +def test_parse_compact_ignores_unknown_verbs(): + out = "U priya Priya\nFOO some garbage\nD old" turn = _parse_compact_output(out) - assert turn.entities == [("ent:a", 'Alice \\"Ace\\" Smith')] + assert turn.entities == [("ent:priya", "Priya")] + assert turn.retracts == ["fact:old"] -def test_parse_compact_ignores_unknown_prefixes(): - out = '''ENTS: "ent:x"="X" -FOO: not a section -BELIEFS: "fact:y"="Y"''' +def test_parse_compact_ignores_malformed_short_lines(): + """Missing required args -> line dropped, no crash.""" + out = "U justslug\nF onlytopic\nD\n" turn = _parse_compact_output(out) - assert turn.entities == [("ent:x", "X")] - assert turn.beliefs == [("fact:y", "Y")] + assert turn.entities == [] + assert turn.beliefs == [] + assert turn.retracts == [] + + +def test_parse_compact_strips_quotes_if_present(): + """Model occasionally wraps tokens in quotes; handle both.""" + turn = _parse_compact_output('U "priya" "Priya"') + assert turn.entities == [("ent:priya", "Priya")] + + +# -------------------------------------------------------------------- +# Compact v5: non-ingest verbs (edges, retrieval, walks, sys/vault) +# -------------------------------------------------------------------- + +def test_parse_compact_edge_emits_create_edge(): + turn = _parse_compact_output("E ent:priya ent:flipkart works_at") + assert turn.statements == [ + 'CREATE EDGE "ent:priya" -> "ent:flipkart" kind = "works_at"' + ] + assert turn.entities == [] + + +def test_parse_compact_edge_needs_three_args(): + turn = _parse_compact_output("E ent:a ent:b") + assert turn.statements == [] + + +def test_parse_compact_remember(): + turn = _parse_compact_output("RM what I said about coffee") + assert turn.statements == ['REMEMBER "what I said about coffee" LIMIT 10'] + + +def test_parse_compact_similar(): + turn = _parse_compact_output("SM joining a startup") + assert turn.statements == ['SIMILAR TO "joining a startup" LIMIT 10'] + + +def 
test_parse_compact_lexical(): + turn = _parse_compact_output("LX python parser bug") + assert turn.statements == ['LEXICAL SEARCH "python parser bug" LIMIT 10'] + + +def test_parse_compact_answer(): + turn = _parse_compact_output("AQ where does Priya work") + assert turn.statements == ['ANSWER "where does Priya work"'] + + +def test_parse_compact_recall_walk(): + turn = _parse_compact_output("RL ent:priya") + assert turn.statements == ['RECALL FROM "ent:priya" DEPTH 2'] + + +def test_parse_compact_traverse_walk(): + turn = _parse_compact_output("TR ent:priya") + assert turn.statements == ['TRAVERSE FROM "ent:priya" DEPTH 2'] + + +def test_parse_compact_ancestors_walk(): + turn = _parse_compact_output("AN fact:favorite_color") + assert turn.statements == ['ANCESTORS OF "fact:favorite_color" DEPTH 3'] + + +def test_parse_compact_subgraph_walk(): + turn = _parse_compact_output("SG ent:openai") + assert turn.statements == ['SUBGRAPH FROM "ent:openai" DEPTH 2'] + + +def test_parse_compact_sys_snapshot(): + turn = _parse_compact_output("SS") + assert turn.statements == ['SYS SNAPSHOT'] + + +def test_parse_compact_sys_compact_verb(): + turn = _parse_compact_output("SC") + assert turn.statements == ['SYS COMPACT'] + + +def test_parse_compact_sys_health(): + turn = _parse_compact_output("SH") + assert turn.statements == ['SYS HEALTH'] + + +def test_parse_compact_sys_stats(): + turn = _parse_compact_output("ST") + assert turn.statements == ['SYS STATS'] + + +def test_parse_compact_sys_explain(): + turn = _parse_compact_output("SX what I said about coffee") + assert turn.statements == ['SYS EXPLAIN REMEMBER "what I said about coffee"'] + + +def test_parse_compact_vault_sync(): + turn = _parse_compact_output("VS") + assert turn.statements == ['VAULT SYNC'] + + +def test_parse_compact_mixed_ingest_and_query(): + out = '''U priya Priya +U openai OpenAI +RM what I said about coffee''' + turn = _parse_compact_output(out) + assert turn.entities == [("ent:priya", "Priya"), 
("ent:openai", "OpenAI")] + assert turn.statements == ['REMEMBER "what I said about coffee" LIMIT 10'] + + +def test_parse_compact_escapes_quotes_in_query_text(): + turn = _parse_compact_output('RM she said "go"') + assert turn.statements == ['REMEMBER "she said \\"go\\"" LIMIT 10'] + + +def test_parse_compact_query_verb_without_body_dropped(): + turn = _parse_compact_output("RM \nSM") + assert turn.statements == [] + + +def test_parse_compact_walk_verb_without_anchor_dropped(): + turn = _parse_compact_output("RL\nTR") + assert turn.statements == [] + + +def test_parse_compact_plain_verb_ignores_trailing_tokens(): + """SS foo still fires; plain handler ignores the rest of the line.""" + turn = _parse_compact_output("SS ignored") + assert turn.statements == ['SYS SNAPSHOT'] + + +def test_parse_compact_long_verb_aliases(): + turn = _parse_compact_output( + "REMEMBER coffee\nSIMILAR tea\nRECALL ent:a\nTRAVERSE ent:b" + ) + assert turn.statements == [ + 'REMEMBER "coffee" LIMIT 10', + 'SIMILAR TO "tea" LIMIT 10', + 'RECALL FROM "ent:a" DEPTH 2', + 'TRAVERSE FROM "ent:b" DEPTH 2', + ] + + +def test_parse_compact_edge_escapes_quotes_in_ids(): + """Quote-escape applies inside CREATE EDGE even if ids carry weird chars.""" + turn = _parse_compact_output('E ent:a ent:b weird"kind') + assert turn.statements == [ + 'CREATE EDGE "ent:a" -> "ent:b" kind = "weird\\"kind"' + ] + + +# -------------------------------------------------------------------- +# DSL synthesis (v5 pre-rendered statements) +# -------------------------------------------------------------------- + +def test_synthesize_appends_statements_verbatim(): + turn = CompactTurn( + entities=[("ent:x", "X")], + statements=['REMEMBER "hello" LIMIT 3', 'SYS STATS'], + ) + dsl = _synthesize_dsl(turn, msg_id="m:0", session_id="s", role="user", text="hi") + assert dsl[-2] == 'REMEMBER "hello" LIMIT 3' + assert dsl[-1] == 'SYS STATS' + + +def test_synthesize_statements_only_still_includes_create_node(): + turn = 
CompactTurn(statements=['REMEMBER "x" LIMIT 10']) + dsl = _synthesize_dsl(turn, msg_id="m:0", session_id="s", role="user", text="x") + assert any(d.startswith('CREATE NODE "m:0"') for d in dsl) + assert 'REMEMBER "x" LIMIT 10' in dsl + + +def test_compact_turn_default_statements_empty(): + turn = CompactTurn() + assert turn.statements == [] def test_dsl_escape_handles_quote_and_backslash(): diff --git a/tools/skills/graphstore-bonsai-dsl-compact/SKILL.md b/tools/skills/graphstore-bonsai-dsl-compact/SKILL.md index 0a90e8e..1f688df 100644 --- a/tools/skills/graphstore-bonsai-dsl-compact/SKILL.md +++ b/tools/skills/graphstore-bonsai-dsl-compact/SKILL.md @@ -1,96 +1,146 @@ --- name: graphstore-bonsai-dsl-compact -description: Ultra-compact NL->semantic-fields skill. LLM emits only the novel information in three tagged lines (ENTS, BELIEFS, RETRACTS). Python templates build the full DSL deterministically. ~6-7x fewer output tokens than the full-DSL skill, measured on 4B TQ1_0. +description: Unified verb-positional caveman grammar covering every common GraphStore DSL operation. LLM emits 2-letter verbs + positional args, Python expands to full DSL. ~3-5x fewer output tokens than raw DSL on every path - ingest, query, walk, ops. compatibility: graphstore >= 0.4.0 metadata: author: orkait - version: "1.0" - target_tokens: 320 - mode: compact + version: "5.0" + target_tokens: 900 + mode: unified-positional --- -Read the user turn. Output EXACTLY three lines in this order: +Read the user turn. Output zero or more ops, one per line. No prose, no quotes (unless required inside a query string), no `` tags, no fences. + +Each line: ` [arg2...]`. Multi-word trailing args (names, query text) are allowed; the verb's shape fixes how Python splits the tokens. + +## Ingest (user said something about an entity / themselves) + +``` +U Upsert entity. Python auto-wires mentions edge from msg. +F User's first-person fact ("I", "my"). topic=snake_case. +D Drop a fact (retract). 
Requires matching known fact. +``` + +## Graph edges (explicit relationships between entities) ``` -ENTS: "="", or "none"> -BELIEFS: "="", or "none"> -RETRACTS: ", or "none"> +E Create edge with given kind. IDs include their prefix (ent:X or fact:X). ``` -No DSL, no prose, no `` tags, no markdown fences. Three lines. Nothing else. +## Semantic retrieval (user asked a question) -- ENTS lists every named person / org / place / product in the message. Slug is lowercase with underscores. One entry per unique entity per message. -- BELIEFS lists **only** first-person statements about the speaker themselves. The sentence must use "I", "my", "me", "mine", or similar. A third-person observation like "Priya moved to Bangalore" is NOT a belief; those entities go in ENTS. Topic = short snake_case. -- RETRACTS lists existing fact_ids the new message contradicts. Only valid when `### KNOWN FACTS` appears above and the user overrides one. Use the same fact_id from KNOWN FACTS. +``` +RM REMEMBER (4-signal NL retrieval, default LIMIT 10) +SM SIMILAR TO (vector only, default LIMIT 10) +LX LEXICAL SEARCH (BM25 only, default LIMIT 10) +AQ ANSWER (LLM-answered recall) +``` -Use `none` when a category is empty. Escape `"` inside values as `\"`. +## Structural walks (from a known anchor id) + +``` +RL RECALL FROM anchor DEPTH 2 (spreading activation) +TR TRAVERSE FROM anchor DEPTH 2 (deterministic walk) +AN ANCESTORS OF anchor DEPTH 3 +SG SUBGRAPH FROM anchor DEPTH 2 +``` + +## SYS / vault ops + +``` +SS SYS SNAPSHOT +SC SYS COMPACT +SH SYS HEALTH +ST SYS STATS +SX SYS EXPLAIN REMEMBER (dry-run a retrieval) +VS VAULT SYNC +``` + +## Rules + +- Third-person observations emit `U`, NOT `F`. Beliefs require first-person pronouns. +- Empty output is valid - emit nothing if nothing applies. +- If `### KNOWN FACTS` appears above, reuse those topic names exactly when updating same concept. +- Slugs and topics must be single tokens (lowercase + underscores). 
Names / values / query text can be multi-word. +- For query verbs, write the question as free text - no quotes, Python adds them. --- -**Input (third-person observation; BELIEFS stays empty):** -Session s1, msg m:s1:0, user: "Kailash joined OpenAI as DB engineer." +**Input:** "Kailash joined OpenAI." **Output:** ``` -ENTS: "ent:kailash"="Kailash", "ent:openai"="OpenAI" -BELIEFS: none -RETRACTS: none +U kailash Kailash +U openai OpenAI ``` -**Input (third-person with a location; still no beliefs):** -Session s1, msg m:s1:1, user: "Priya moved to Bangalore and joined Flipkart." +--- + +**Input:** "Priya works at Flipkart since 2023 as a frontend engineer." **Output:** ``` -ENTS: "ent:priya"="Priya", "ent:bangalore"="Bangalore", "ent:flipkart"="Flipkart" -BELIEFS: none -RETRACTS: none +U priya Priya +U flipkart Flipkart +E ent:priya ent:flipkart works_at ``` --- -**Input:** -Session s2, msg m:s2:0, user: "My favorite color is blue." +**Input:** "My favorite color is blue." **Output:** ``` -ENTS: none -BELIEFS: "fact:favorite_color"="blue" -RETRACTS: none +F favorite_color blue ``` --- -**Input (user contradicts a prior fact, use its exact fact_id):** - +**Input (correction; known fact exists):** ``` -### KNOWN FACTS (reuse these fact_ids; emit RETRACT + ASSERT to update) -[fact:favorite_drink] kind="belief" value="coffee" confidence=0.90 +### KNOWN FACTS +[fact:favorite_drink] kind="belief" value="coffee" -Session s3, msg m:s3:1, user: "Actually I prefer tea now." +user: "Actually I prefer tea now." +``` +**Output:** +``` +D favorite_drink +F favorite_drink tea ``` +--- + +**Input:** "Remember what I said about coffee." + **Output:** ``` -ENTS: none -BELIEFS: "fact:favorite_drink"="tea" -RETRACTS: "fact:favorite_drink" +RM what I said about coffee ``` --- -**Input (multi-entity + belief + belief update):** +**Input:** "Find messages similar to 'joining a startup'." 
+**Output:** ``` -### KNOWN FACTS -[fact:lives_in] kind="belief" value="Delhi" confidence=0.90 +SM joining a startup +``` + +--- -Session s4, msg m:s4:2, user: "Priya moved to Bangalore and joined Flipkart. I now live in Pune." +**Input:** "How is Priya connected to OpenAI?" + +**Output:** ``` +RL ent:priya +``` + +--- + +**Input:** "Take a snapshot." **Output:** ``` -ENTS: "ent:priya"="Priya", "ent:bangalore"="Bangalore", "ent:flipkart"="Flipkart", "ent:pune"="Pune" -BELIEFS: "fact:lives_in"="Pune" -RETRACTS: "fact:lives_in" +SS ``` From 8f40ff0136a58a58339f9c14ceb4c2b38fc932bc Mon Sep 17 00:00:00 2001 From: Kailas Mahavarkar <66670953+KailasMahavarkar@users.noreply.github.com> Date: Tue, 21 Apr 2026 02:51:19 +0530 Subject: [PATCH 2/3] refactor(bonsai): full English-verb grammar + lite variant + auto n_ctx The NL->DSL ingestor now covers 100% of the grammar.lark NL-addressable surface (94 rules) via English-keyword @-verbs. Short-code abbreviations (@U/@F/@RM/etc.) are gone - every verb is a readable DSL keyword (@UPSERT, @BELIEF, @REMEMBER, @SNAPSHOT, @CHECKPOINT, @CRON_ADD, @EVOLVE_RULE, ...). Full dispatch has ~100 entries including grammar aliases (ASSERT->BELIEF, FORGET_NODE->FORGET, etc.). Two prompt variants now ship with the package: - bonsai_dsl_prompt.txt: full 94-verb surface, ~1700 tokens, n_ctx=4096 - bonsai_dsl_prompt_lite.txt: 16-verb ingest+retrieval subset, ~800 tokens, n_ctx=2048. Fewer competing verbs means the model picks correctly on conversational turns. Load time 19s -> 8s. n_ctx auto-picks smallest power-of-two that fits the loaded prompt + typical user-msg budget + max_output + headroom. Callers can still pin n_ctx explicitly. Parser rewritten as a factory dispatch table: - _h_slug / _h_topic / _h_walk / _h_pair / _h_query / _h_plain / _h_raw - Special handlers for update_node, merge, increment, propagate, describe, unregister, contradictions, cron_add, optimize, clear, wal, nodes, vault_triplet, snapshot (auto-timestamp fallback). 
Compact/raw-DSL mode removed. Single prompt-driven mode. `compact` kwarg and `_DEFAULT_COMPACT_*` symbols deleted. `CompactTurn` renamed `ParsedTurn`, `_parse_compact_output` -> `_parse_verb_output`, `_COMPACT_HANDLERS` -> `_VERB_HANDLERS`. Grammar bugs fixed in this pass: - @SNAPSHOT without name auto-fills a UTC timestamp (SNAPSHOT STRING is required by grammar; bare @SS was emitting invalid DSL). - @COMPACT rewritten to SYS OPTIMIZE COMPACT (SYS COMPACT isn't a real grammar rule; SYS OPTIMIZE COMPACT is). Prompt file moved out of tools/skills/ into src/graphstore/ so it ships with the wheel. pyproject package-data now includes *.txt. Performance envelope measured on AMD 9700X / DDR5-5200: - Cold load: 8s (lite) / 19s (full) - Cold load with persistent kv_cache_path: 0.4s (19x faster) - Peak decode: 27-30 tok/s (memory-bandwidth bound at ~810 MB weight read per token for 4B TQ1_0) - Per-call wall: 0.3-2s; overall 15-20 tok/s Tests: 89 unit tests pass; 107 synthesized DSL templates parse clean against grammar.lark (verify_v6_templates.py check). 
--- pyproject.toml | 2 +- src/graphstore/bonsai_dsl_prompt.txt | 130 +++ src/graphstore/bonsai_dsl_prompt_lite.txt | 90 +++ src/graphstore/bonsai_ingestor.py | 740 +++++++++++++++--- tests/test_bonsai_ingestor.py | 333 +++++--- .../graphstore-bonsai-dsl-compact/SKILL.md | 146 ---- 6 files changed, 1062 insertions(+), 379 deletions(-) create mode 100644 src/graphstore/bonsai_dsl_prompt.txt create mode 100644 src/graphstore/bonsai_dsl_prompt_lite.txt delete mode 100644 tools/skills/graphstore-bonsai-dsl-compact/SKILL.md diff --git a/pyproject.toml b/pyproject.toml index 7a3d123..8879fcc 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -119,7 +119,7 @@ where = ["src"] include = ["graphstore*"] [tool.setuptools.package-data] -"graphstore" = ["py.typed"] +"graphstore" = ["py.typed", "*.txt"] "graphstore.query" = ["py.typed"] "graphstore.dsl" = ["*.lark"] diff --git a/src/graphstore/bonsai_dsl_prompt.txt b/src/graphstore/bonsai_dsl_prompt.txt new file mode 100644 index 0000000..c602a40 --- /dev/null +++ b/src/graphstore/bonsai_dsl_prompt.txt @@ -0,0 +1,130 @@ +Read user turn. Output zero or more ops, one per line. Every op MUST start with @. Lines without @ are ignored (never narrate, fence, or emit ). Quote any arg that contains spaces. Anchor ids keep prefix (ent:/fact:/msg:); slug-only verbs auto-prefix ent:. Slugs / topics / edge kinds are snake_case. 
+ +VERB TABLE (pick one per line; emit exactly the shape shown) + +INGEST EDGES NODE OPS +@UPSERT slug Name @EDGE from to kind @UPDATE_NODE slug field value +@BELIEF topic value @UPDATE_EDGE from to field value @DELETE_NODE slug +@RETRACT topic @DELETE_EDGE from to @FORGET slug + @DELETE_EDGES_FROM slug @CONNECT_NODE slug + @DELETE_EDGES_TO slug @MERGE src dst + @EDGES_FROM slug @INCREMENT slug field num + @EDGES_TO slug @PROPAGATE anchor field depth + @COUNTERFACTUAL topic + @NODE id + @NODES [where-body] + @UPDATE_NODES where+SET-body + @DELETE_NODES where-body + +QUERY (user asked a question) WALKS (from anchor id) PATHS / DIST +@REMEMBER query @RECALL a @PATH a b +@SIMILAR query @TRAVERSE a @PATHS a b +@LEXICAL query @ANCESTORS a @SHORTEST_PATH a b +@ANSWER question @DESCENDANTS a @DISTANCE a b +@COUNT_NODES @SUBGRAPH a @WEIGHTED_SHORTEST a b +@COUNT_EDGES @WEIGHTED_DISTANCE a b + @COMMON a b + +PATTERN / AGGREGATE (raw) VAULT (markdown notes) CONTEXT / DOC +@MATCH pattern-body @VAULT_NEW path @BIND_CONTEXT name +@AGGREGATE body @VAULT_READ path @DISCARD_CONTEXT name + @VAULT_WRITE path section content @INGEST path + @VAULT_APPEND path section content + @VAULT_LIST + @VAULT_BACKLINKS path + @VAULT_SEARCH query + @VAULT_DAILY + @VAULT_ARCHIVE path + @VAULT_SYNC + +SNAPSHOTS / OPTIMIZE SYS INSPECT SYS MAINT (admin) +@SNAPSHOT [name] @HEALTH @CHECKPOINT +@ROLLBACK name @STATS @REBUILD +@SNAPSHOTS @KINDS @CLEAR LOG|CACHE +@COMPACT @EDGE_KINDS @WAL STATUS|REPLAY +@OPTIMIZE target @EMBEDDERS @EXPIRE + @STATUS @CONTRADICTIONS field group + @SLOW_QUERIES @DUPLICATES + @FREQUENT_QUERIES @CONNECT_ALL + @FAILED_QUERIES @CONSOLIDATE + @LOG @REEMBED + @EXPLAIN query @RETAIN + @DESCRIBE type name @EVICT + @REGISTER_NODE body + @REGISTER_EDGE body + @UNREGISTER type name + +CRON EVOLVE (metacognitive rules) +@CRON_ADD name "sched" "query" @EVOLVE_LIST @EVOLVE_SHOW name +@CRON_DELETE name @EVOLVE_HISTORY @EVOLVE_ENABLE name +@CRON_ENABLE name @EVOLVE_RESET @EVOLVE_DISABLE name 
+@CRON_DISABLE name @EVOLVE_DELETE name +@CRON_LIST @EVOLVE_RULE body +@CRON_RUN name + +RULES +- Lines without @ drop silently. Emit nothing rather than guessing. +- Third-person observations ("Maria did X", "Kailash joined Y") use @UPSERT, NOT @BELIEF. Beliefs need first-person pronouns ("I", "my", "me"). +- KNOWN FACTS block above: reuse those fact_ids exactly. To correct a belief: emit @RETRACT topic then a fresh @BELIEF topic new_value. +- Query verbs take free text; Python wraps quotes. Do not quote the query yourself. +- Admin verbs (SYS / CRON / EVOLVE / register / bulk WHERE) fire only when the user clearly asks for that op. +- Do not echo these examples verbatim. Generate ops from the actual turn. + +EX + +Nikhil started at Stripe last month. +@UPSERT nikhil Nikhil +@UPSERT stripe Stripe +@EDGE ent:nikhil ent:stripe works_at + +My dentist is Dr. Chen. +@BELIEF dentist Dr. Chen + +KNOWN FACTS: +[fact:lunch_spot] kind="belief" value="cafe_paloma" + +user: Actually I go to Cafe Centro for lunch now. +@RETRACT lunch_spot +@BELIEF lunch_spot Cafe Centro + +Remember what I told you last week. +@REMEMBER what I told you last week + +Find messages that feel like the argument we had. +@SIMILAR the argument we had + +Where does Maria work? +@ANSWER where does Maria work + +How is Nikhil linked to Stripe? +@RECALL ent:nikhil + +Path from Nikhil to Stripe. +@PATH ent:nikhil ent:stripe + +Forget my old gym. +@FORGET old_gym + +Change my title to senior engineer. +@UPDATE_NODE me title senior engineer + +What if I never joined Stripe? +@COUNTERFACTUAL joined_stripe + +Take a snapshot called before-migration. +@SNAPSHOT before-migration + +Roll back to before-migration. +@ROLLBACK before-migration + +How many nodes do I have? +@COUNT_NODES + +Describe the node for Maria. +@DESCRIBE NODE ent:maria + +Show me the stats. +@STATS + +Compact the storage. 
+@COMPACT diff --git a/src/graphstore/bonsai_dsl_prompt_lite.txt b/src/graphstore/bonsai_dsl_prompt_lite.txt new file mode 100644 index 0000000..9a55fbe --- /dev/null +++ b/src/graphstore/bonsai_dsl_prompt_lite.txt @@ -0,0 +1,90 @@ +Read user turn. Output zero or more ops, one per line. Every op MUST start with @. Lines without @ are ignored (never narrate, fence, or emit ).Do not quote multi-word arguments. The parser handles trailing text. Anchor ids keep prefix (ent:/fact:/msg:); slug-only verbs auto-prefix ent:. Slugs / topics / edge kinds are snake_case. + +VERB PICK RULE (read first!) +- Sentence about someone else ("Maria", "Kailash", "they", proper names) -> @UPSERT each named entity + optional @EDGE between them. NEVER @BELIEF. +- Sentence about the user ("I", "my", "me") -> @BELIEF topic value. +- User asks a question -> @REMEMBER / @SIMILAR / @LEXICAL / @ANSWER. +- User corrects a prior belief (KNOWN FACTS block present) -> @RETRACT topic + @BELIEF topic new_value. +- User asks about connections -> @RECALL / @TRAVERSE / @ANCESTORS / @DESCENDANTS / @SUBGRAPH / @PATH / @SHORTEST_PATH / @COMMON. + +VERB TABLE (pick one per line; emit exactly the shape shown) + +INGEST EDGE WALKS (from anchor id) +@UPSERT slug Name @EDGE from to kind @RECALL a +@BELIEF topic value @TRAVERSE a +@RETRACT topic @ANCESTORS a + @DESCENDANTS a + @SUBGRAPH a + +QUERY (user asked a question) PATHS +@REMEMBER query @PATH a b +@SIMILAR query @SHORTEST_PATH a b +@LEXICAL query @COMMON a b +@ANSWER question + +RULES +- Lines without @ drop silently. Emit nothing rather than guessing. +- Slugs, topics, edge kinds are snake_case single tokens. +- Query verbs take free text; Python wraps quotes. Do not quote the query yourself. +- KNOWN FACTS block above: reuse those fact_ids exactly. Pair @RETRACT topic with a fresh @BELIEF topic new_value when correcting. +- Do not echo these examples verbatim. Generate ops from the actual turn. 
+ +EXAMPLES + +(third-person: two entities + the relationship) +Kailash joined OpenAI. +@UPSERT kailash Kailash +@UPSERT openai OpenAI +@EDGE ent:kailash ent:openai joined + +(third-person with extra context) +Nikhil started at Stripe last month as a staff engineer. +@UPSERT nikhil Nikhil +@UPSERT stripe Stripe +@EDGE ent:nikhil ent:stripe works_at + +(third-person standalone) +Maria moved to Berlin. +@UPSERT maria Maria +@UPSERT berlin Berlin +@EDGE ent:maria ent:berlin lives_in + +(first-person fact - note the pronoun "my") +My dentist is Dr. Chen. +@BELIEF dentist Dr. Chen + +(first-person fact - note the pronoun "I") +I prefer tea to coffee. +@BELIEF drink_preference tea + +(correction via KNOWN FACTS block) +KNOWN FACTS: +[fact:lunch_spot] kind="belief" value="cafe_paloma" + +user: Actually I go to Cafe Centro for lunch now. +@RETRACT lunch_spot +@BELIEF lunch_spot Cafe Centro + +(retrieval: "remember" -> NL recall) +Remember what I told you last week. +@REMEMBER what I told you last week + +(retrieval: "similar to" -> vector) +Find messages that feel like the argument we had. +@SIMILAR the argument we had + +(retrieval: direct question -> synthesized answer) +Where does Maria work? +@ANSWER where does Maria work + +(walk: "connected" / "linked" from an anchor) +How is Nikhil linked to Stripe? +@RECALL ent:nikhil + +(path between two anchors) +Path from Nikhil to Stripe. +@PATH ent:nikhil ent:stripe + +(common neighbors) +What do Nikhil and Maria have in common? +@COMMON ent:nikhil ent:maria diff --git a/src/graphstore/bonsai_ingestor.py b/src/graphstore/bonsai_ingestor.py index 924b2d2..a3112bc 100644 --- a/src/graphstore/bonsai_ingestor.py +++ b/src/graphstore/bonsai_ingestor.py @@ -227,30 +227,31 @@ def _scrape_belief_updates( # -------------------------------------------------------------------- -# Compact output mode ("caveman v5"): LLM emits one verb per line covering -# the whole DSL surface. Python inflates to full DSL. 
Measured ~3-5x fewer -# output tokens than raw DSL on every path. See -# tools/skills/graphstore-bonsai-dsl-compact/SKILL.md for the contract. +# Verb-prefixed output parser: LLM emits one @VERB op per line covering +# the whole DSL surface. Python inflates each verb to the full DSL line. +# See src/graphstore/bonsai_dsl_prompt.txt for the contract. +# +# Line format: `@ [arg2...]`. Lines without a leading `@` +# are silently dropped - English reasoning, fences, and leaks are +# inert at parser level, so the model can drift safely without corrupting +# the emitted DSL. # # Verbs fall into three groups: -# 1. Fact-state (U / F / D): populate entities / beliefs / retracts slots -# so _synthesize_dsl can auto-wire mention edges and cross-message -# belief identity works. -# 2. Edge (E): pre-renders a CREATE EDGE line. -# 3. Retrieval (RM/SM/LX/AQ), walks (RL/TR/AN/SG), sys/vault -# (SS/SC/SH/ST/SX/VS): each pre-renders one full DSL line directly. +# 1. Fact-state (UPSERT / BELIEF / RETRACT): populate entities / beliefs / +# retracts slots so _synthesize_dsl can auto-wire mention edges and +# cross-message belief identity works. +# 2. Edge (EDGE): pre-renders a CREATE EDGE line. +# 3. Retrieval, walks, vault, sys ops: each pre-renders one full DSL +# line directly. # # Groups 2 and 3 accumulate in turn.statements and get appended verbatim # after the mention wiring and fact updates. -# -# Unknown verbs and malformed lines are silently dropped (LLM may drift; -# parser is lax so a single bad line doesn't lose the whole turn). 
# -------------------------------------------------------------------- @dataclass -class CompactTurn: - """Parsed structured output of a compact-mode LLM call (v5 option A).""" +class ParsedTurn: + """Parsed structured output of one @-verb LLM call.""" entities: list[tuple[str, str]] = field(default_factory=list) beliefs: list[tuple[str, str]] = field(default_factory=list) @@ -263,7 +264,7 @@ def _dsl_escape(s: str) -> str: return s.replace("\\", "\\\\").replace('"', '\\"') -def _h_upsert(turn: CompactTurn, ln: str) -> None: +def _h_upsert(turn: ParsedTurn, ln: str) -> None: """U """ parts = ln.split(None, 2) if len(parts) < 3: @@ -274,7 +275,7 @@ def _h_upsert(turn: CompactTurn, ln: str) -> None: turn.entities.append((f"ent:{slug}", name)) -def _h_fact(turn: CompactTurn, ln: str) -> None: +def _h_fact(turn: ParsedTurn, ln: str) -> None: """F """ parts = ln.split(None, 2) if len(parts) < 3: @@ -285,7 +286,7 @@ def _h_fact(turn: CompactTurn, ln: str) -> None: turn.beliefs.append((f"fact:{topic}", value)) -def _h_drop(turn: CompactTurn, ln: str) -> None: +def _h_drop(turn: ParsedTurn, ln: str) -> None: """D """ parts = ln.split() if len(parts) < 2: @@ -295,7 +296,7 @@ def _h_drop(turn: CompactTurn, ln: str) -> None: turn.retracts.append(f"fact:{topic}") -def _h_edge(turn: CompactTurn, ln: str) -> None: +def _h_edge(turn: ParsedTurn, ln: str) -> None: """E (ids include ent:/fact: prefix)""" parts = ln.split() if len(parts) < 4: @@ -312,7 +313,7 @@ def _h_edge(turn: CompactTurn, ln: str) -> None: def _h_query(template: str): """Factory for verbs whose argument is free-text (wrapped in DSL quotes).""" - def h(turn: CompactTurn, ln: str) -> None: + def h(turn: ParsedTurn, ln: str) -> None: parts = ln.split(None, 1) if len(parts) < 2 or not parts[1].strip(): return @@ -323,7 +324,7 @@ def h(turn: CompactTurn, ln: str) -> None: def _h_walk(template: str): """Factory for verbs whose argument is a single anchor id.""" - def h(turn: CompactTurn, ln: str) -> None: + def 
h(turn: ParsedTurn, ln: str) -> None: parts = ln.split() if len(parts) < 2: return @@ -334,66 +335,569 @@ def h(turn: CompactTurn, ln: str) -> None: def _h_plain(template: str): - """Factory for zero-arg verbs (SS/SC/SH/ST/VS).""" - def h(turn: CompactTurn, _ln: str) -> None: + """Factory for zero-arg verbs (SC/SH/ST/VS).""" + def h(turn: ParsedTurn, _ln: str) -> None: turn.statements.append(template) return h -_COMPACT_HANDLERS: dict[str, Any] = { - "U": _h_upsert, "UP": _h_upsert, "UPSERT": _h_upsert, - "F": _h_fact, "FACT": _h_fact, "B": _h_fact, "ASSERT": _h_fact, - "D": _h_drop, "DROP": _h_drop, "R": _h_drop, "RETRACT": _h_drop, - "E": _h_edge, "EDGE": _h_edge, - "RM": _h_query('REMEMBER "{q}" LIMIT 10'), - "REMEMBER": _h_query('REMEMBER "{q}" LIMIT 10'), - "SM": _h_query('SIMILAR TO "{q}" LIMIT 10'), - "SIMILAR": _h_query('SIMILAR TO "{q}" LIMIT 10'), - "LX": _h_query('LEXICAL SEARCH "{q}" LIMIT 10'), - "LEXICAL": _h_query('LEXICAL SEARCH "{q}" LIMIT 10'), - "AQ": _h_query('ANSWER "{q}"'), - "ANSWER": _h_query('ANSWER "{q}"'), - "RL": _h_walk('RECALL FROM "{id}" DEPTH 2'), - "RECALL": _h_walk('RECALL FROM "{id}" DEPTH 2'), - "TR": _h_walk('TRAVERSE FROM "{id}" DEPTH 2'), - "TRAVERSE": _h_walk('TRAVERSE FROM "{id}" DEPTH 2'), - "AN": _h_walk('ANCESTORS OF "{id}" DEPTH 3'), - "ANCESTORS": _h_walk('ANCESTORS OF "{id}" DEPTH 3'), - "SG": _h_walk('SUBGRAPH FROM "{id}" DEPTH 2'), - "SUBGRAPH": _h_walk('SUBGRAPH FROM "{id}" DEPTH 2'), - "SS": _h_plain('SYS SNAPSHOT'), - "SC": _h_plain('SYS COMPACT'), - "SH": _h_plain('SYS HEALTH'), - "ST": _h_plain('SYS STATS'), - "SX": _h_query('SYS EXPLAIN REMEMBER "{q}"'), - "VS": _h_plain('VAULT SYNC'), +def _h_snapshot(turn: ParsedTurn, ln: str) -> None: + """@SS [name] -> SYS SNAPSHOT "name". + + Grammar requires a name. If the model didn't supply one, auto-fill with + a UTC timestamp so the emission is always parseable. 
+ """ + from datetime import datetime, timezone + parts = ln.split(None, 1) + if len(parts) >= 2 and parts[1].strip(): + name = parts[1].strip().strip('"') + else: + name = datetime.now(timezone.utc).strftime("snap-%Y%m%dT%H%M%SZ") + turn.statements.append(f'SYS SNAPSHOT "{_dsl_escape(name)}"') + + +def _h_slug(template: str): + """Factory for verbs taking a bare slug (auto-prefixed `ent:`). + + Used for node-level ops where the model emits `@DN my_node`; we quote + + prefix to `ent:my_node` so the DSL parser accepts it. Also accepts a + pre-prefixed id verbatim (`@DN ent:my_node`). + """ + def h(turn: ParsedTurn, ln: str) -> None: + parts = ln.split() + if len(parts) < 2: + return + slug = parts[1].removeprefix("ent:").strip('"') + if slug: + turn.statements.append(template.replace("{slug}", _dsl_escape(slug))) + return h + + +def _h_topic(template: str): + """Factory for verbs taking a bare topic (auto-prefixed `fact:`).""" + def h(turn: ParsedTurn, ln: str) -> None: + parts = ln.split() + if len(parts) < 2: + return + topic = parts[1].removeprefix("fact:").strip('"') + if topic: + turn.statements.append(template.replace("{topic}", _dsl_escape(topic))) + return h + + +def _h_pair(template: str): + """Factory for 2-anchor verbs (@PA, @SP, @CO). Template uses `{a}` / `{b}`. + + Ids are taken verbatim (caller supplies full `ent:`/`fact:` prefix). + """ + def h(turn: ParsedTurn, ln: str) -> None: + parts = ln.split() + if len(parts) < 3: + return + a = parts[1].strip('"') + b = parts[2].strip('"') + if a and b: + turn.statements.append( + template.replace("{a}", _dsl_escape(a)).replace("{b}", _dsl_escape(b)) + ) + return h + + +def _h_update_node(turn: ParsedTurn, ln: str) -> None: + """@UN slug field value ... -> UPDATE NODE "ent:slug" SET field = "value ..." + + Value is the rest-of-line (multi-word OK). Slug auto-prefixed `ent:`. 
+ """ + parts = ln.split(None, 3) + if len(parts) < 4: + return + slug = parts[1].removeprefix("ent:").strip('"') + field = parts[2].strip() + value = parts[3].strip().strip('"') + if not (slug and field and value): + return + turn.statements.append( + f'UPDATE NODE "ent:{_dsl_escape(slug)}" SET {field} = "{_dsl_escape(value)}"' + ) + + +def _h_merge(turn: ParsedTurn, ln: str) -> None: + """@M src dst -> MERGE NODE "ent:src" INTO "ent:dst" (auto-prefix)""" + parts = ln.split() + if len(parts) < 3: + return + src = parts[1].removeprefix("ent:").strip('"') + dst = parts[2].removeprefix("ent:").strip('"') + if src and dst: + turn.statements.append( + f'MERGE NODE "ent:{_dsl_escape(src)}" INTO "ent:{_dsl_escape(dst)}"' + ) + + +def _h_raw(template: str): + """Factory for verbs whose rest-of-line is a raw DSL body (passthrough). + + Used for verbs whose full grammar is too complex for positional encoding + (MATCH patterns, AGGREGATE clauses, EVOLVE RULE conditions, WHERE-filtered + bulk ops). The model emits the full DSL tail and Python just prefixes + the leading keyword(s). No escaping applied. + """ + def h(turn: ParsedTurn, ln: str) -> None: + parts = ln.split(None, 1) + if len(parts) < 2 or not parts[1].strip(): + return + turn.statements.append(template.replace("{body}", parts[1].strip())) + return h + + +def _h_update_edge(turn: ParsedTurn, ln: str) -> None: + """@UE from to field value... -> UPDATE EDGE "from" -> "to" SET field = "value..." + + First two args are anchor ids (verbatim). Third is identifier. Rest is value. 
+ """ + parts = ln.split(None, 4) + if len(parts) < 5: + return + a = parts[1].strip('"') + b = parts[2].strip('"') + field = parts[3].strip() + value = parts[4].strip().strip('"') + if a and b and field and value: + turn.statements.append( + f'UPDATE EDGE "{_dsl_escape(a)}" -> "{_dsl_escape(b)}" SET {field} = "{_dsl_escape(value)}"' + ) + + +def _h_increment(turn: ParsedTurn, ln: str) -> None: + """@IC slug field num -> INCREMENT NODE "ent:slug" field BY num""" + parts = ln.split() + if len(parts) < 4: + return + slug = parts[1].removeprefix("ent:").strip('"') + field = parts[2].strip() + try: + num = float(parts[3]) + except ValueError: + return + num_str = str(int(num)) if num == int(num) else str(num) + if slug and field: + turn.statements.append( + f'INCREMENT NODE "ent:{_dsl_escape(slug)}" {field} BY {num_str}' + ) + + +def _h_propagate(turn: ParsedTurn, ln: str) -> None: + """@PG anchor field depth -> PROPAGATE "anchor" FIELD field DEPTH n""" + parts = ln.split() + if len(parts) < 4: + return + anchor = parts[1].strip('"') + field = parts[2].strip() + try: + depth = int(parts[3]) + except ValueError: + return + if anchor and field: + turn.statements.append( + f'PROPAGATE "{_dsl_escape(anchor)}" FIELD {field} DEPTH {depth}' + ) + + +def _h_describe(turn: ParsedTurn, ln: str) -> None: + """@SD type name -> SYS DESCRIBE NODE|EDGE "name" """ + parts = ln.split() + if len(parts) < 3: + return + t = parts[1].upper() + if t not in ("NODE", "EDGE"): + return + name = parts[2].strip('"') + if name: + turn.statements.append(f'SYS DESCRIBE {t} "{_dsl_escape(name)}"') + + +def _h_unregister(turn: ParsedTurn, ln: str) -> None: + """@SUR type name -> SYS UNREGISTER NODE|EDGE KIND "name" """ + parts = ln.split() + if len(parts) < 3: + return + t = parts[1].upper() + if t not in ("NODE", "EDGE"): + return + name = parts[2].strip('"') + if name: + turn.statements.append(f'SYS UNREGISTER {t} KIND "{_dsl_escape(name)}"') + + +def _h_contradictions(turn: ParsedTurn, ln: str) -> 
None:
+    """@CONTRADICTIONS field group -> SYS CONTRADICTIONS FIELD field GROUP BY group"""
+    parts = ln.split()
+    if len(parts) < 3:
+        return
+    field = parts[1].strip()
+    group = parts[2].strip()
+    if field and group:
+        turn.statements.append(
+            f'SYS CONTRADICTIONS FIELD {field} GROUP BY {group}'
+        )
+
+
+def _h_cron_add(turn: ParsedTurn, ln: str) -> None:
+    """@CRON_ADD name schedule query... -> SYS CRON ADD "name" SCHEDULE "sched" QUERY "..."
+
+    Uses shell-style quoting so cron expressions with spaces can be wrapped in
+    quotes (`@CRON_ADD nightly "0 0 * * *" SYS STATS`). Everything after the
+    schedule token becomes the query body.
+    """
+    import shlex
+    try:
+        tokens = shlex.split(ln)
+    except ValueError:
+        return
+    if len(tokens) < 4:
+        return
+    name, schedule = tokens[1], tokens[2]
+    query = " ".join(tokens[3:])
+    if name and schedule and query:
+        turn.statements.append(
+            f'SYS CRON ADD "{_dsl_escape(name)}" '
+            f'SCHEDULE "{_dsl_escape(schedule)}" '
+            f'QUERY "{_dsl_escape(query)}"'
+        )
+
+
+def _h_optimize(turn: ParsedTurn, ln: str) -> None:
+    """@OPTIMIZE [target] -> SYS OPTIMIZE [target].
target in {COMPACT,STRINGS,EDGES,VECTORS,BLOBS,CACHE}."""
+    parts = ln.split()
+    valid = {"COMPACT", "STRINGS", "EDGES", "VECTORS", "BLOBS", "CACHE"}
+    if len(parts) >= 2:
+        t = parts[1].strip().upper()
+        if t in valid:
+            turn.statements.append(f'SYS OPTIMIZE {t}')
+    else:
+        turn.statements.append('SYS OPTIMIZE')
+
+
+def _h_clear(turn: ParsedTurn, ln: str) -> None:
+    """@CLEAR target -> SYS CLEAR LOG|CACHE"""
+    parts = ln.split()
+    if len(parts) < 2:
+        return
+    t = parts[1].strip().upper()
+    if t in ("LOG", "CACHE"):
+        turn.statements.append(f'SYS CLEAR {t}')
+
+
+def _h_wal(turn: ParsedTurn, ln: str) -> None:
+    """@WAL action -> SYS WAL STATUS|REPLAY"""
+    parts = ln.split()
+    if len(parts) < 2:
+        return
+    a = parts[1].strip().upper()
+    if a in ("STATUS", "REPLAY"):
+        turn.statements.append(f'SYS WAL {a}')
+
+
+def _h_vault_triplet(template: str):
+    """Factory for @VAULT_WRITE / @VAULT_APPEND: path + section + (multi-word) content."""
+    def h(turn: ParsedTurn, ln: str) -> None:
+        parts = ln.split(None, 3)
+        if len(parts) < 4:
+            return
+        path = parts[1].strip('"')
+        section = parts[2].strip('"')
+        content = parts[3].strip().strip('"')
+        if path and section and content:
+            turn.statements.append(
+                template.replace("{p}", _dsl_escape(path))
+                .replace("{s}", _dsl_escape(section))
+                .replace("{c}", _dsl_escape(content))
+            )
+    return h
+
+
+def _h_nodes(turn: ParsedTurn, ln: str) -> None:
+    """@NODES [where-body] -> NODES [WHERE body] LIMIT 20"""
+    parts = ln.split(None, 1)
+    if len(parts) >= 2 and parts[1].strip():
+        turn.statements.append(f'NODES WHERE {parts[1].strip()} LIMIT 20')
+    else:
+        turn.statements.append('NODES LIMIT 20')
+
+
+# Handler instances reused across the English-keyword verb aliases. Building
+# once and aliasing keeps the dispatch table lean and makes it obvious
+# that `@SIMILAR` and `@SIMILAR_TO` are the same handler, not two copies.
+ +_H_RM = _h_query('REMEMBER "{q}" LIMIT 10') +_H_SM = _h_query('SIMILAR TO "{q}" LIMIT 10') +_H_LX = _h_query('LEXICAL SEARCH "{q}" LIMIT 10') +_H_AQ = _h_query('ANSWER "{q}"') + +_H_RL = _h_walk('RECALL FROM "{id}" DEPTH 2') +_H_TR = _h_walk('TRAVERSE FROM "{id}" DEPTH 2') +_H_AN = _h_walk('ANCESTORS OF "{id}" DEPTH 3') +_H_DE = _h_walk('DESCENDANTS OF "{id}" DEPTH 3') +_H_SG = _h_walk('SUBGRAPH FROM "{id}" DEPTH 2') +_H_NO = _h_walk('NODE "{id}"') + +_H_PA = _h_pair('PATH FROM "{a}" TO "{b}" MAX_DEPTH 3') +_H_PAS = _h_pair('PATHS FROM "{a}" TO "{b}" MAX_DEPTH 3') +_H_SP = _h_pair('SHORTEST PATH FROM "{a}" TO "{b}"') +_H_DI = _h_pair('DISTANCE FROM "{a}" TO "{b}" MAX_DEPTH 5') +_H_WSP = _h_pair('WEIGHTED SHORTEST PATH FROM "{a}" TO "{b}"') +_H_WDI = _h_pair('WEIGHTED DISTANCE FROM "{a}" TO "{b}"') +_H_CO = _h_pair('COMMON NEIGHBORS OF "{a}" AND "{b}"') +_H_EX = _h_pair('DELETE EDGE "{a}" -> "{b}"') + +_H_DN = _h_slug('DELETE NODE "ent:{slug}"') +_H_FG = _h_slug('FORGET NODE "ent:{slug}"') +_H_CND = _h_slug('CONNECT NODE "ent:{slug}"') +_H_DEF = _h_slug('DELETE EDGES FROM "ent:{slug}"') +_H_DET = _h_slug('DELETE EDGES TO "ent:{slug}"') +_H_EF = _h_slug('EDGES FROM "ent:{slug}" LIMIT 20') +_H_ET = _h_slug('EDGES TO "ent:{slug}" LIMIT 20') + +_H_CF = _h_topic('WHAT IF RETRACT "fact:{topic}"') + +_H_MA = _h_raw('MATCH {body}') +_H_AG = _h_raw('AGGREGATE NODES {body}') +_H_UNS = _h_raw('UPDATE NODES WHERE {body}') +_H_DNS = _h_raw('DELETE NODES WHERE {body}') +_H_SRN = _h_raw('SYS REGISTER NODE KIND {body}') +_H_SRE = _h_raw('SYS REGISTER EDGE KIND {body}') +_H_EVR = _h_raw('SYS EVOLVE RULE {body}') + +_H_VN = _h_query('VAULT NEW "{q}"') +_H_VR_READ = _h_query('VAULT READ "{q}"') +_H_VB = _h_query('VAULT BACKLINKS "{q}"') +_H_VQ = _h_query('VAULT SEARCH "{q}" LIMIT 10') +_H_VH = _h_query('VAULT ARCHIVE "{q}"') +_H_VW = _h_vault_triplet('VAULT WRITE "{p}" SECTION "{s}" CONTENT "{c}"') +_H_VAP = _h_vault_triplet('VAULT APPEND "{p}" SECTION "{s}" CONTENT "{c}"') + 
+_H_BC = _h_query('BIND CONTEXT "{q}"') +_H_XC = _h_query('DISCARD CONTEXT "{q}"') +_H_IG = _h_query('INGEST "{q}"') +_H_SR = _h_query('SYS ROLLBACK TO "{q}"') +_H_SX = _h_query('SYS EXPLAIN REMEMBER "{q}"') + +_H_PLAIN_COUNT_NODES = _h_plain('COUNT NODES') +_H_PLAIN_COUNT_EDGES = _h_plain('COUNT EDGES') + +_H_PLAIN_COMPACT = _h_plain('SYS OPTIMIZE COMPACT') +_H_PLAIN_HEALTH = _h_plain('SYS HEALTH') +_H_PLAIN_STATS = _h_plain('SYS STATS') +_H_PLAIN_KINDS = _h_plain('SYS KINDS') +_H_PLAIN_EDGE_KINDS = _h_plain('SYS EDGE KINDS') +_H_PLAIN_EMBEDDERS = _h_plain('SYS EMBEDDERS') +_H_PLAIN_STATUS = _h_plain('SYS STATUS') +_H_PLAIN_SLOW = _h_plain('SYS SLOW QUERIES LIMIT 20') +_H_PLAIN_FREQUENT = _h_plain('SYS FREQUENT QUERIES LIMIT 20') +_H_PLAIN_FAILED = _h_plain('SYS FAILED QUERIES LIMIT 20') +_H_PLAIN_LOG = _h_plain('SYS LOG LIMIT 50') +_H_PLAIN_SNAPSHOTS = _h_plain('SYS SNAPSHOTS') +_H_PLAIN_VAULT_LIST = _h_plain('VAULT LIST') +_H_PLAIN_VAULT_DAILY = _h_plain('VAULT DAILY') +_H_PLAIN_VAULT_SYNC = _h_plain('VAULT SYNC') +_H_PLAIN_CHECKPOINT = _h_plain('SYS CHECKPOINT') +_H_PLAIN_REBUILD = _h_plain('SYS REBUILD INDICES') +_H_PLAIN_EXPIRE = _h_plain('SYS EXPIRE') +_H_PLAIN_DUPLICATES = _h_plain('SYS DUPLICATES') +_H_PLAIN_SYS_CONNECT = _h_plain('SYS CONNECT') +_H_PLAIN_CONSOLIDATE = _h_plain('SYS CONSOLIDATE') +_H_PLAIN_REEMBED = _h_plain('SYS REEMBED') +_H_PLAIN_RETAIN = _h_plain('SYS RETAIN') +_H_PLAIN_EVICT = _h_plain('SYS EVICT') +_H_PLAIN_CRON_LIST = _h_plain('SYS CRON LIST') +_H_PLAIN_EVOLVE_LIST = _h_plain('SYS EVOLVE LIST') +_H_PLAIN_EVOLVE_HISTORY = _h_plain('SYS EVOLVE HISTORY LIMIT 50') +_H_PLAIN_EVOLVE_RESET = _h_plain('SYS EVOLVE RESET') + +_H_Q_CRON_DELETE = _h_query('SYS CRON DELETE "{q}"') +_H_Q_CRON_ENABLE = _h_query('SYS CRON ENABLE "{q}"') +_H_Q_CRON_DISABLE = _h_query('SYS CRON DISABLE "{q}"') +_H_Q_CRON_RUN = _h_query('SYS CRON RUN "{q}"') +_H_Q_EVOLVE_SHOW = _h_query('SYS EVOLVE SHOW "{q}"') +_H_Q_EVOLVE_ENABLE = _h_query('SYS EVOLVE ENABLE "{q}"') 
+_H_Q_EVOLVE_DISABLE = _h_query('SYS EVOLVE DISABLE "{q}"') +_H_Q_EVOLVE_DELETE = _h_query('SYS EVOLVE DELETE "{q}"') + + +_VERB_HANDLERS: dict[str, Any] = { + # Ingest: entities, beliefs, retract. BELIEF/ASSERT/BELIEVE all map to + # the same handler so the model can use whichever phrasing feels natural. + "UPSERT": _h_upsert, + "BELIEF": _h_fact, "BELIEVE": _h_fact, "ASSERT": _h_fact, "FACT": _h_fact, + "RETRACT": _h_drop, "DROP": _h_drop, + + # Edges + "EDGE": _h_edge, "CREATE_EDGE": _h_edge, + "UPDATE_EDGE": _h_update_edge, + "DELETE_EDGE": _H_EX, + "DELETE_EDGES_FROM": _H_DEF, + "DELETE_EDGES_TO": _H_DET, + "EDGES_FROM": _H_EF, + "EDGES_TO": _H_ET, + + # Node lifecycle + "UPDATE_NODE": _h_update_node, + "DELETE_NODE": _H_DN, + "FORGET": _H_FG, "FORGET_NODE": _H_FG, + "CONNECT_NODE": _H_CND, + "MERGE": _h_merge, "MERGE_NODE": _h_merge, + "INCREMENT": _h_increment, + "PROPAGATE": _h_propagate, + "COUNTERFACTUAL": _H_CF, "WHAT_IF": _H_CF, + "NODE": _H_NO, + "NODES": _h_nodes, + + # Bulk WHERE-filtered (raw passthrough) + "UPDATE_NODES": _H_UNS, + "DELETE_NODES": _H_DNS, + + # Retrieval (user asked a question) + "REMEMBER": _H_RM, + "SIMILAR": _H_SM, "SIMILAR_TO": _H_SM, + "LEXICAL": _H_LX, "LEXICAL_SEARCH": _H_LX, + "ANSWER": _H_AQ, + + # Counts + "COUNT_NODES": _H_PLAIN_COUNT_NODES, + "COUNT_EDGES": _H_PLAIN_COUNT_EDGES, + + # Walks + "RECALL": _H_RL, + "TRAVERSE": _H_TR, + "ANCESTORS": _H_AN, + "DESCENDANTS": _H_DE, + "SUBGRAPH": _H_SG, + + # Paths / distance + "PATH": _H_PA, + "PATHS": _H_PAS, + "SHORTEST": _H_SP, "SHORTEST_PATH": _H_SP, + "DISTANCE": _H_DI, + "WEIGHTED_SHORTEST": _H_WSP, "WEIGHTED_SHORTEST_PATH": _H_WSP, + "WEIGHTED_DISTANCE": _H_WDI, + "COMMON": _H_CO, "COMMON_NEIGHBORS": _H_CO, + + # Pattern / aggregate (raw passthrough) + "MATCH": _H_MA, + "AGGREGATE": _H_AG, + + # Vault + "VAULT_NEW": _H_VN, + "VAULT_READ": _H_VR_READ, + "VAULT_WRITE": _H_VW, + "VAULT_APPEND": _H_VAP, + "VAULT_LIST": _H_PLAIN_VAULT_LIST, + "VAULT_BACKLINKS": _H_VB, + 
"VAULT_SEARCH": _H_VQ, + "VAULT_DAILY": _H_PLAIN_VAULT_DAILY, + "VAULT_ARCHIVE": _H_VH, + "VAULT_SYNC": _H_PLAIN_VAULT_SYNC, + + # Context + doc ingest + "BIND_CONTEXT": _H_BC, + "DISCARD_CONTEXT": _H_XC, + "INGEST": _H_IG, + + # Snapshots / rollback / optimize + "SNAPSHOT": _h_snapshot, + "ROLLBACK": _H_SR, + "SNAPSHOTS": _H_PLAIN_SNAPSHOTS, + "COMPACT": _H_PLAIN_COMPACT, + "OPTIMIZE": _h_optimize, + + # SYS introspection + "HEALTH": _H_PLAIN_HEALTH, + "STATS": _H_PLAIN_STATS, + "KINDS": _H_PLAIN_KINDS, + "EDGE_KINDS": _H_PLAIN_EDGE_KINDS, + "EMBEDDERS": _H_PLAIN_EMBEDDERS, + "STATUS": _H_PLAIN_STATUS, + "SLOW_QUERIES": _H_PLAIN_SLOW, + "FREQUENT_QUERIES": _H_PLAIN_FREQUENT, + "FAILED_QUERIES": _H_PLAIN_FAILED, + "LOG": _H_PLAIN_LOG, + "EXPLAIN": _H_SX, + "DESCRIBE": _h_describe, + + # SYS schema registration + "REGISTER_NODE": _H_SRN, "REGISTER_NODE_KIND": _H_SRN, + "REGISTER_EDGE": _H_SRE, "REGISTER_EDGE_KIND": _H_SRE, + "UNREGISTER": _h_unregister, + + # SYS maintenance (admin) + "CHECKPOINT": _H_PLAIN_CHECKPOINT, + "REBUILD": _H_PLAIN_REBUILD, + "CLEAR": _h_clear, + "WAL": _h_wal, + "EXPIRE": _H_PLAIN_EXPIRE, + "CONTRADICTIONS": _h_contradictions, + "DUPLICATES": _H_PLAIN_DUPLICATES, + "CONNECT_ALL": _H_PLAIN_SYS_CONNECT, + "CONSOLIDATE": _H_PLAIN_CONSOLIDATE, + "REEMBED": _H_PLAIN_REEMBED, + "RETAIN": _H_PLAIN_RETAIN, + "EVICT": _H_PLAIN_EVICT, + + # Cron + "CRON_ADD": _h_cron_add, + "CRON_DELETE": _H_Q_CRON_DELETE, + "CRON_ENABLE": _H_Q_CRON_ENABLE, + "CRON_DISABLE": _H_Q_CRON_DISABLE, + "CRON_LIST": _H_PLAIN_CRON_LIST, + "CRON_RUN": _H_Q_CRON_RUN, + + # Evolve (metacognitive rules) + "EVOLVE_LIST": _H_PLAIN_EVOLVE_LIST, + "EVOLVE_SHOW": _H_Q_EVOLVE_SHOW, + "EVOLVE_ENABLE": _H_Q_EVOLVE_ENABLE, + "EVOLVE_DISABLE": _H_Q_EVOLVE_DISABLE, + "EVOLVE_DELETE": _H_Q_EVOLVE_DELETE, + "EVOLVE_HISTORY": _H_PLAIN_EVOLVE_HISTORY, + "EVOLVE_RESET": _H_PLAIN_EVOLVE_RESET, + "EVOLVE_RULE": _H_EVR, } -def _parse_compact_output(cleaned: str) -> CompactTurn: - """Parse v5 
unified verb-positional output. +def _parse_verb_output(cleaned: str) -> ParsedTurn: + """Parse v6 @-prefixed verb-positional output. + + Every op line starts with `@`. Python drops any line that doesn't, + making English drift / reasoning leaks / markdown fences inert at the + parser level. For valid lines, strip the `@`, dispatch the verb to its + handler. @U/@F/@D populate the entities/beliefs/retracts slots for + fact-state wiring; all other verbs render directly to pre-built DSL + lines in turn.statements. - Each non-blank, non-fence line is one verb + positional args. U/F/D - populate the entities/beliefs/retracts slots for fact-state wiring; all - other verbs render directly to pre-built DSL lines in turn.statements. - Unknown verbs and malformed lines are silently dropped. + Tolerant: accepts lowercase verbs, extra whitespace after `@`, trailing + colons on the verb token (seen in reasoning models that mimic chat + prefixes). Unknown verbs drop silently. """ - turn = CompactTurn() + turn = ParsedTurn() for raw_ln in cleaned.splitlines(): ln = raw_ln.strip() if not ln or _FENCE_RE.match(ln): continue - head = ln.split(maxsplit=1)[0] + if not ln.startswith("@"): + continue + payload = ln[1:].lstrip() + if not payload: + continue + head = payload.split(maxsplit=1)[0] verb = head.upper().rstrip(":") - handler = _COMPACT_HANDLERS.get(verb) + handler = _VERB_HANDLERS.get(verb) if handler is None: continue - handler(turn, ln) + handler(turn, payload) return turn def _synthesize_dsl( - turn: CompactTurn, + turn: ParsedTurn, *, msg_id: str, session_id: str, @@ -487,15 +991,15 @@ def _render_known_facts_block(facts: dict[str, FactState], max_facts: int = 40) # Ingestor # -------------------------------------------------------------------- -_DEFAULT_SKILL_PATH = ( - Path(__file__).resolve().parent.parent.parent - / "tools" / "skills" / "graphstore-bonsai-dsl" / "SKILL.md" -) - -_DEFAULT_COMPACT_SKILL_PATH = ( - Path(__file__).resolve().parent.parent.parent - / "tools" 
/ "skills" / "graphstore-bonsai-dsl-compact" / "SKILL.md" -) +# The verb-prefixed prompt lives inside the package so it ships with the +# wheel and does not depend on the tools/skills/ tree being present at runtime. +# - full: covers the complete grammar (ingest + edges + node ops + queries + +# walks + paths + vault + snapshots + sys admin + cron + evolve). +# - lite: only ingest (UPSERT/BELIEF/RETRACT/EDGE) + retrieval (REMEMBER / +# SIMILAR / LEXICAL / ANSWER + walks + paths). Smaller prompt = less +# model confusion when the caller never uses admin ops. +_DEFAULT_PROMPT_PATH = Path(__file__).resolve().parent / "bonsai_dsl_prompt.txt" +_DEFAULT_LITE_PROMPT_PATH = Path(__file__).resolve().parent / "bonsai_dsl_prompt_lite.txt" class BonsaiIngestor: @@ -535,36 +1039,69 @@ def __init__( *, gs: Any | None = None, skill_path: str | Path | None = None, - compact: bool = False, - n_ctx: int = 2048, + n_ctx: int | None = None, + n_batch: int = 512, n_threads: int | None = None, chat_format: str = "qwen", - max_output_tokens: int = 400, + max_output_tokens: int = 256, temperature: float = 0.0, kv_cache_path: str | Path | None = None, + flash_attn: bool = False, ) -> None: self._model_path = Path(model_path) if not self._model_path.exists(): raise FileNotFoundError(f"bonsai model not found: {self._model_path}") self._gs = gs - self._compact = compact - if skill_path: - self._skill_path = Path(skill_path) - else: - self._skill_path = _DEFAULT_COMPACT_SKILL_PATH if compact else _DEFAULT_SKILL_PATH - self._n_ctx = n_ctx - # Compact mode emits ~30 tokens of structured output. Cap lower so - # stray model verbosity doesn't burn decode time. - self._max_output_tokens = max_output_tokens if not compact else min(max_output_tokens, 160) + self._skill_path = Path(skill_path) if skill_path else _DEFAULT_PROMPT_PATH + # Dense user turns can legitimately need 10-15 ops (30-100 tokens). + # Cap high enough to cover that. 
Post-op English drift is inert + # because the parser ignores non-@ lines, so over-provisioning only + # costs wall time on bad turns, never correctness. + self._max_output_tokens = max_output_tokens self._temperature = temperature self._chat_format = chat_format self._n_threads = n_threads + # n_batch defaults to llama.cpp's 512. On CPU, bigger batches saturate + # memory bandwidth and actually slow things down (measured: n_batch=2048 + # was 18% slower overall than 512 on this hardware). Kept exposed as a + # kwarg for GPU callers where bigger batches can help. + self._n_batch = n_batch + self._flash_attn = flash_attn self._skill_text = "" self._skill_fingerprint = "" self._system_prompt = "" self._reload_skill() + # Auto-pick n_ctx based on actual prompt size unless caller pinned + # one explicitly. Full prompt (~1700 tokens) needs 4096; lite prompt + # (~600 tokens) fits 2048 comfortably and halves KV cache RAM + load + # time. Typical user-message budget: 300 tokens (includes KNOWN FACTS + # block when present). 
+ if n_ctx is None: + _USER_MSG_BUDGET = 300 + needed = ( + self._estimate_tokens(self._system_prompt) + + _USER_MSG_BUDGET + + self._max_output_tokens + + self._CTX_HEADROOM + ) + for candidate in (2048, 4096, 8192, 16384, 32768): + if needed <= candidate: + n_ctx = candidate + break + else: + n_ctx = 32768 + _log.info( + "bonsai: auto-picked n_ctx=%d (prompt~%d + user~%d + output=%d + headroom=%d)", + n_ctx, + self._estimate_tokens(self._system_prompt), + _USER_MSG_BUDGET, + self._max_output_tokens, + self._CTX_HEADROOM, + ) + self._n_ctx = n_ctx + self._llm: Any | None = None self._lock = threading.Lock() @@ -685,14 +1222,17 @@ def _ensure_llm(self) -> Any: kwargs: dict[str, Any] = { "model_path": str(self._model_path), "n_ctx": self._n_ctx, + "n_batch": self._n_batch, "chat_format": self._chat_format, + "flash_attn": self._flash_attn, "verbose": False, } if self._n_threads is not None: kwargs["n_threads"] = self._n_threads _log.info( - "bonsai: loading %s n_ctx=%d threads=%s chat_format=%s", - self._model_path.name, self._n_ctx, self._n_threads, self._chat_format, + "bonsai: loading %s n_ctx=%d n_batch=%d flash_attn=%s threads=%s chat_format=%s", + self._model_path.name, self._n_ctx, self._n_batch, + self._flash_attn, self._n_threads, self._chat_format, ) self._llm = Llama(**kwargs) self._try_load_kv_cache(self._llm) @@ -764,14 +1304,10 @@ def ingest( ) -> IngestResult: """Convert `text` to DSL statements and (optionally) execute them. - In full-DSL mode (compact=False) the LLM emits DSL directly; msg_id - and session_id come from the text the caller supplies ("Session s1, - msg m:s1:0, user: ...") so the extra kwargs are unused. - - In compact mode (compact=True) the LLM emits ENTS/BELIEFS/RETRACTS - and Python synthesizes the DSL. The caller must pass msg_id (and - may override session_id / role); these become the identifiers in - the synthesized CREATE NODE / CREATE EDGE statements. + The LLM emits @-verb lines; Python synthesizes the full DSL. 
The + caller must pass `msg_id` (and may override `session_id` / `role`); + these become the identifiers in the synthesized CREATE NODE / + CREATE EDGE statements for mentions-edge wiring. `dry_run=True` returns the DSL without touching the store. """ @@ -779,9 +1315,9 @@ def ingest( raise IngestEmpty("input text is empty or whitespace-only") if not dry_run and self._gs is None: raise ValueError("ingest requires a GraphStore (pass gs=...) or dry_run=True") - if self._compact and not msg_id: + if not msg_id: raise ValueError( - "compact=True ingest requires an explicit msg_id " + "ingest requires an explicit msg_id " "(DSL synthesis needs the exact CREATE NODE id)" ) @@ -844,16 +1380,12 @@ def _ingest_locked( f"raw={raw!r}" ) - if self._compact: - assert msg_id is not None # guarded in ingest() - turn = _parse_compact_output(cleaned) - deduped = _synthesize_dsl( - turn, msg_id=msg_id, session_id=session_id, role=role, text=text, - ) - dup_dropped: list[tuple[str, str]] = [] - else: - raw_lines = _split_lines(cleaned) - deduped, dup_dropped = _dedupe_upserts(raw_lines) + assert msg_id is not None # guarded in ingest() + turn = _parse_verb_output(cleaned) + deduped = _synthesize_dsl( + turn, msg_id=msg_id, session_id=session_id, role=role, text=text, + ) + dup_dropped: list[tuple[str, str]] = [] from graphstore.dsl.parser import parse as _dsl_parse diff --git a/tests/test_bonsai_ingestor.py b/tests/test_bonsai_ingestor.py index dc22b9d..06cc845 100644 --- a/tests/test_bonsai_ingestor.py +++ b/tests/test_bonsai_ingestor.py @@ -13,14 +13,14 @@ from graphstore.bonsai_ingestor import ( BonsaiIngestor, - CompactTurn, + ParsedTurn, FactState, IngestEmpty, IngestOverflow, IngestResult, _dedupe_upserts, _dsl_escape, - _parse_compact_output, + _parse_verb_output, _render_known_facts_block, _scrape_belief_updates, _split_lines, @@ -332,236 +332,317 @@ def test_ingestor_reset_facts_clears_state(tmp_path: Path): # 
-------------------------------------------------------------------- -# Compact mode: parser + DSL synthesis +# Verb parser (English-keyword @-prefix grammar) # -------------------------------------------------------------------- -def test_parse_compact_all_three_verbs(): - out = '''U priya Priya -U openai OpenAI -F color blue -D old''' - turn = _parse_compact_output(out) +def test_parse_all_three_ingest_verbs(): + out = '''@UPSERT priya Priya +@UPSERT openai OpenAI +@BELIEF color blue +@RETRACT old''' + turn = _parse_verb_output(out) assert turn.entities == [("ent:priya", "Priya"), ("ent:openai", "OpenAI")] assert turn.beliefs == [("fact:color", "blue")] assert turn.retracts == ["fact:old"] -def test_parse_compact_empty_output_is_empty_turn(): - turn = _parse_compact_output("") +def test_parse_empty_output_is_empty_turn(): + turn = _parse_verb_output("") assert turn.entities == [] assert turn.beliefs == [] assert turn.retracts == [] -def test_parse_compact_entities_only(): - turn = _parse_compact_output("U kailash Kailash") +def test_parse_entities_only(): + turn = _parse_verb_output("@UPSERT kailash Kailash") assert turn.entities == [("ent:kailash", "Kailash")] assert turn.beliefs == [] assert turn.retracts == [] -def test_parse_compact_multi_word_name_joined_by_whitespace(): +def test_parse_multi_word_name_joined_by_whitespace(): """Rest-of-line is the name; split on first 2 whitespace runs only.""" - turn = _parse_compact_output("U sf San Francisco") + turn = _parse_verb_output("@UPSERT sf San Francisco") assert turn.entities == [("ent:sf", "San Francisco")] -def test_parse_compact_case_insensitive_verbs(): - out = "u priya Priya\nf color blue\nd old" - turn = _parse_compact_output(out) +def test_parse_case_insensitive_verbs(): + out = "@upsert priya Priya\n@belief color blue\n@retract old" + turn = _parse_verb_output(out) assert turn.entities == [("ent:priya", "Priya")] assert turn.beliefs == [("fact:color", "blue")] assert turn.retracts == ["fact:old"] -def 
test_parse_compact_aliases_upsert_assert_retract(): - out = "UPSERT priya Priya\nASSERT color blue\nRETRACT old" - turn = _parse_compact_output(out) - assert turn.entities == [("ent:priya", "Priya")] +def test_parse_assert_alias_maps_to_belief(): + """ASSERT is a grammar keyword; we accept it as an alias for @BELIEF.""" + turn = _parse_verb_output("@ASSERT color blue") assert turn.beliefs == [("fact:color", "blue")] - assert turn.retracts == ["fact:old"] -def test_parse_compact_tolerates_fence_lines(): - out = "```\nU x X\n```" - turn = _parse_compact_output(out) +def test_parse_tolerates_fence_lines(): + out = "```\n@UPSERT x X\n```" + turn = _parse_verb_output(out) assert turn.entities == [("ent:x", "X")] -def test_parse_compact_strips_prefix_if_model_adds_it(): - """Model sometimes emits 'U ent:x X'; we normalize to slug-only.""" - turn = _parse_compact_output('U ent:priya Priya') +def test_parse_strips_prefix_if_model_adds_it(): + """Model sometimes emits '@UPSERT ent:x X'; we normalize to slug-only.""" + turn = _parse_verb_output('@UPSERT ent:priya Priya') assert turn.entities == [("ent:priya", "Priya")] - turn2 = _parse_compact_output('F fact:color blue') + turn2 = _parse_verb_output('@BELIEF fact:color blue') assert turn2.beliefs == [("fact:color", "blue")] -def test_parse_compact_ignores_unknown_verbs(): - out = "U priya Priya\nFOO some garbage\nD old" - turn = _parse_compact_output(out) +def test_parse_ignores_unknown_verbs(): + out = "@UPSERT priya Priya\n@FOO some garbage\n@RETRACT old" + turn = _parse_verb_output(out) assert turn.entities == [("ent:priya", "Priya")] assert turn.retracts == ["fact:old"] -def test_parse_compact_ignores_malformed_short_lines(): +def test_parse_ignores_malformed_short_lines(): """Missing required args -> line dropped, no crash.""" - out = "U justslug\nF onlytopic\nD\n" - turn = _parse_compact_output(out) + out = "@UPSERT justslug\n@BELIEF onlytopic\n@RETRACT\n" + turn = _parse_verb_output(out) assert turn.entities == [] 
assert turn.beliefs == [] assert turn.retracts == [] -def test_parse_compact_strips_quotes_if_present(): +def test_parse_strips_quotes_if_present(): """Model occasionally wraps tokens in quotes; handle both.""" - turn = _parse_compact_output('U "priya" "Priya"') + turn = _parse_verb_output('@UPSERT "priya" "Priya"') assert turn.entities == [("ent:priya", "Priya")] # -------------------------------------------------------------------- -# Compact v5: non-ingest verbs (edges, retrieval, walks, sys/vault) +# @-prefix contract # -------------------------------------------------------------------- -def test_parse_compact_edge_emits_create_edge(): - turn = _parse_compact_output("E ent:priya ent:flipkart works_at") +def test_parse_drops_lines_without_at_prefix(): + """Any line not starting with @ drops silently (English drift inert).""" + out = '''UPSERT priya Priya +Wait, let me think about this. +This is free-form prose. +@UPSERT kailash Kailash''' + turn = _parse_verb_output(out) + assert turn.entities == [("ent:kailash", "Kailash")] + + +def test_parse_accepts_space_after_at(): + """'@ UPSERT priya' still parses (tolerant).""" + turn = _parse_verb_output("@ UPSERT priya Priya") + assert turn.entities == [("ent:priya", "Priya")] + + +def test_parse_bare_at_dropped(): + turn = _parse_verb_output("@\n@UPSERT x X\n@") + assert turn.entities == [("ent:x", "X")] + + +def test_parse_english_drift_after_ops_ignored(): + out = '''@UPSERT priya Priya +Wait - that's not correct. 
Let me reconsider.''' + turn = _parse_verb_output(out) + assert turn.entities == [("ent:priya", "Priya")] + assert turn.statements == [] + + +# -------------------------------------------------------------------- +# Non-ingest verbs (edges, retrieval, walks, sys/vault) +# -------------------------------------------------------------------- + +def test_parse_edge_emits_create_edge(): + turn = _parse_verb_output("@EDGE ent:priya ent:flipkart works_at") assert turn.statements == [ 'CREATE EDGE "ent:priya" -> "ent:flipkart" kind = "works_at"' ] assert turn.entities == [] -def test_parse_compact_edge_needs_three_args(): - turn = _parse_compact_output("E ent:a ent:b") +def test_parse_edge_needs_three_args(): + turn = _parse_verb_output("@EDGE ent:a ent:b") assert turn.statements == [] -def test_parse_compact_remember(): - turn = _parse_compact_output("RM what I said about coffee") +def test_parse_remember(): + turn = _parse_verb_output("@REMEMBER what I said about coffee") assert turn.statements == ['REMEMBER "what I said about coffee" LIMIT 10'] -def test_parse_compact_similar(): - turn = _parse_compact_output("SM joining a startup") +def test_parse_similar(): + turn = _parse_verb_output("@SIMILAR joining a startup") assert turn.statements == ['SIMILAR TO "joining a startup" LIMIT 10'] -def test_parse_compact_lexical(): - turn = _parse_compact_output("LX python parser bug") +def test_parse_lexical(): + turn = _parse_verb_output("@LEXICAL python parser bug") assert turn.statements == ['LEXICAL SEARCH "python parser bug" LIMIT 10'] -def test_parse_compact_answer(): - turn = _parse_compact_output("AQ where does Priya work") +def test_parse_answer(): + turn = _parse_verb_output("@ANSWER where does Priya work") assert turn.statements == ['ANSWER "where does Priya work"'] -def test_parse_compact_recall_walk(): - turn = _parse_compact_output("RL ent:priya") +def test_parse_recall_walk(): + turn = _parse_verb_output("@RECALL ent:priya") assert turn.statements == ['RECALL FROM 
"ent:priya" DEPTH 2'] -def test_parse_compact_traverse_walk(): - turn = _parse_compact_output("TR ent:priya") +def test_parse_traverse_walk(): + turn = _parse_verb_output("@TRAVERSE ent:priya") assert turn.statements == ['TRAVERSE FROM "ent:priya" DEPTH 2'] -def test_parse_compact_ancestors_walk(): - turn = _parse_compact_output("AN fact:favorite_color") +def test_parse_ancestors_walk(): + turn = _parse_verb_output("@ANCESTORS fact:favorite_color") assert turn.statements == ['ANCESTORS OF "fact:favorite_color" DEPTH 3'] -def test_parse_compact_subgraph_walk(): - turn = _parse_compact_output("SG ent:openai") +def test_parse_descendants_walk(): + turn = _parse_verb_output("@DESCENDANTS ent:priya") + assert turn.statements == ['DESCENDANTS OF "ent:priya" DEPTH 3'] + + +def test_parse_subgraph_walk(): + turn = _parse_verb_output("@SUBGRAPH ent:openai") assert turn.statements == ['SUBGRAPH FROM "ent:openai" DEPTH 2'] -def test_parse_compact_sys_snapshot(): - turn = _parse_compact_output("SS") - assert turn.statements == ['SYS SNAPSHOT'] +def test_parse_path_and_shortest(): + assert _parse_verb_output("@PATH ent:a ent:b").statements == [ + 'PATH FROM "ent:a" TO "ent:b" MAX_DEPTH 3' + ] + assert _parse_verb_output("@SHORTEST_PATH ent:a ent:b").statements == [ + 'SHORTEST PATH FROM "ent:a" TO "ent:b"' + ] + assert _parse_verb_output("@COMMON ent:a ent:b").statements == [ + 'COMMON NEIGHBORS OF "ent:a" AND "ent:b"' + ] -def test_parse_compact_sys_compact_verb(): - turn = _parse_compact_output("SC") - assert turn.statements == ['SYS COMPACT'] +def test_parse_snapshot_with_name(): + turn = _parse_verb_output('@SNAPSHOT before-cleanup') + assert turn.statements == ['SYS SNAPSHOT "before-cleanup"'] -def test_parse_compact_sys_health(): - turn = _parse_compact_output("SH") - assert turn.statements == ['SYS HEALTH'] +def test_parse_snapshot_auto_timestamp_when_bare(): + import re + turn = _parse_verb_output("@SNAPSHOT") + assert len(turn.statements) == 1 + assert re.fullmatch( + 
r'SYS SNAPSHOT "snap-\d{8}T\d{6}Z"', + turn.statements[0], + ), turn.statements[0] -def test_parse_compact_sys_stats(): - turn = _parse_compact_output("ST") - assert turn.statements == ['SYS STATS'] +def test_parse_rollback_and_snapshots_list(): + assert _parse_verb_output("@ROLLBACK v1").statements == [ + 'SYS ROLLBACK TO "v1"' + ] + assert _parse_verb_output("@SNAPSHOTS").statements == ['SYS SNAPSHOTS'] -def test_parse_compact_sys_explain(): - turn = _parse_compact_output("SX what I said about coffee") - assert turn.statements == ['SYS EXPLAIN REMEMBER "what I said about coffee"'] +def test_parse_compact_optimize(): + assert _parse_verb_output("@COMPACT").statements == ['SYS OPTIMIZE COMPACT'] + +def test_parse_health_stats_kinds(): + assert _parse_verb_output("@HEALTH").statements == ['SYS HEALTH'] + assert _parse_verb_output("@STATS").statements == ['SYS STATS'] + assert _parse_verb_output("@KINDS").statements == ['SYS KINDS'] -def test_parse_compact_vault_sync(): - turn = _parse_compact_output("VS") - assert turn.statements == ['VAULT SYNC'] + +def test_parse_explain(): + turn = _parse_verb_output("@EXPLAIN what I said about coffee") + assert turn.statements == ['SYS EXPLAIN REMEMBER "what I said about coffee"'] -def test_parse_compact_mixed_ingest_and_query(): - out = '''U priya Priya -U openai OpenAI -RM what I said about coffee''' - turn = _parse_compact_output(out) +def test_parse_mixed_ingest_and_query(): + out = '''@UPSERT priya Priya +@UPSERT openai OpenAI +@REMEMBER what I said about coffee''' + turn = _parse_verb_output(out) assert turn.entities == [("ent:priya", "Priya"), ("ent:openai", "OpenAI")] assert turn.statements == ['REMEMBER "what I said about coffee" LIMIT 10'] -def test_parse_compact_escapes_quotes_in_query_text(): - turn = _parse_compact_output('RM she said "go"') +def test_parse_escapes_quotes_in_query_text(): + turn = _parse_verb_output('@REMEMBER she said "go"') assert turn.statements == ['REMEMBER "she said \\"go\\"" LIMIT 10'] -def 
test_parse_compact_query_verb_without_body_dropped(): - turn = _parse_compact_output("RM \nSM") +def test_parse_query_verb_without_body_dropped(): + turn = _parse_verb_output("@REMEMBER \n@SIMILAR") assert turn.statements == [] -def test_parse_compact_walk_verb_without_anchor_dropped(): - turn = _parse_compact_output("RL\nTR") +def test_parse_walk_verb_without_anchor_dropped(): + turn = _parse_verb_output("@RECALL\n@TRAVERSE") assert turn.statements == [] -def test_parse_compact_plain_verb_ignores_trailing_tokens(): - """SS foo still fires; plain handler ignores the rest of the line.""" - turn = _parse_compact_output("SS ignored") - assert turn.statements == ['SYS SNAPSHOT'] +def test_parse_plain_verb_ignores_trailing_tokens(): + """@HEALTH foo still fires; plain handler ignores the rest of the line.""" + turn = _parse_verb_output("@HEALTH ignored") + assert turn.statements == ['SYS HEALTH'] -def test_parse_compact_long_verb_aliases(): - turn = _parse_compact_output( - "REMEMBER coffee\nSIMILAR tea\nRECALL ent:a\nTRAVERSE ent:b" - ) +def test_parse_edge_escapes_quotes_in_ids(): + """Quote-escape applies inside CREATE EDGE even if ids carry weird chars.""" + turn = _parse_verb_output('@EDGE ent:a ent:b weird"kind') assert turn.statements == [ - 'REMEMBER "coffee" LIMIT 10', - 'SIMILAR TO "tea" LIMIT 10', - 'RECALL FROM "ent:a" DEPTH 2', - 'TRAVERSE FROM "ent:b" DEPTH 2', + 'CREATE EDGE "ent:a" -> "ent:b" kind = "weird\\"kind"' ] -def test_parse_compact_edge_escapes_quotes_in_ids(): - """Quote-escape applies inside CREATE EDGE even if ids carry weird chars.""" - turn = _parse_compact_output('E ent:a ent:b weird"kind') +# -------------------------------------------------------------------- +# Node lifecycle verbs (update/delete/forget/merge/counterfactual) +# -------------------------------------------------------------------- + +def test_parse_update_node(): + turn = _parse_verb_output("@UPDATE_NODE me title senior engineer") assert turn.statements == [ - 'CREATE 
EDGE "ent:a" -> "ent:b" kind = "weird\\"kind"' + 'UPDATE NODE "ent:me" SET title = "senior engineer"' ] +def test_parse_delete_node(): + turn = _parse_verb_output("@DELETE_NODE obsolete") + assert turn.statements == ['DELETE NODE "ent:obsolete"'] + + +def test_parse_forget_node(): + turn = _parse_verb_output("@FORGET old_gym") + assert turn.statements == ['FORGET NODE "ent:old_gym"'] + + +def test_parse_merge_nodes(): + turn = _parse_verb_output("@MERGE maria marie") + assert turn.statements == [ + 'MERGE NODE "ent:maria" INTO "ent:marie"' + ] + + +def test_parse_counterfactual(): + turn = _parse_verb_output("@COUNTERFACTUAL joined_stripe") + assert turn.statements == ['WHAT IF RETRACT "fact:joined_stripe"'] + + +def test_parse_count_nodes_and_edges(): + assert _parse_verb_output("@COUNT_NODES").statements == ['COUNT NODES'] + assert _parse_verb_output("@COUNT_EDGES").statements == ['COUNT EDGES'] + + # -------------------------------------------------------------------- # DSL synthesis (v5 pre-rendered statements) # -------------------------------------------------------------------- def test_synthesize_appends_statements_verbatim(): - turn = CompactTurn( + turn = ParsedTurn( entities=[("ent:x", "X")], statements=['REMEMBER "hello" LIMIT 3', 'SYS STATS'], ) @@ -571,14 +652,14 @@ def test_synthesize_appends_statements_verbatim(): def test_synthesize_statements_only_still_includes_create_node(): - turn = CompactTurn(statements=['REMEMBER "x" LIMIT 10']) + turn = ParsedTurn(statements=['REMEMBER "x" LIMIT 10']) dsl = _synthesize_dsl(turn, msg_id="m:0", session_id="s", role="user", text="x") assert any(d.startswith('CREATE NODE "m:0"') for d in dsl) assert 'REMEMBER "x" LIMIT 10' in dsl -def test_compact_turn_default_statements_empty(): - turn = CompactTurn() +def test_parsed_turn_default_statements_empty(): + turn = ParsedTurn() assert turn.statements == [] @@ -588,7 +669,7 @@ def test_dsl_escape_handles_quote_and_backslash(): def 
test_synthesize_minimal_turn_emits_only_message_node(): - turn = CompactTurn() + turn = ParsedTurn() dsl = _synthesize_dsl(turn, msg_id="m:s1:0", session_id="s1", role="user", text="hi") assert len(dsl) == 1 assert 'CREATE NODE "m:s1:0"' in dsl[0] @@ -596,7 +677,7 @@ def test_synthesize_minimal_turn_emits_only_message_node(): def test_synthesize_with_entities_emits_upsert_plus_matching_edge(): - turn = CompactTurn(entities=[("ent:priya", "Priya"), ("ent:openai", "OpenAI")]) + turn = ParsedTurn(entities=[("ent:priya", "Priya"), ("ent:openai", "OpenAI")]) dsl = _synthesize_dsl(turn, msg_id="m:s1:0", session_id="s1", role="user", text="x") assert len(dsl) == 1 + 2 + 2 assert 'UPSERT NODE "ent:priya"' in dsl[1] @@ -606,7 +687,7 @@ def test_synthesize_with_entities_emits_upsert_plus_matching_edge(): def test_synthesize_dedupes_duplicate_entities(): - turn = CompactTurn(entities=[("ent:x", "X"), ("ent:x", "X")]) + turn = ParsedTurn(entities=[("ent:x", "X"), ("ent:x", "X")]) dsl = _synthesize_dsl(turn, msg_id="m:0", session_id="s", role="user", text="x") upserts = [d for d in dsl if d.startswith("UPSERT")] edges = [d for d in dsl if d.startswith("CREATE EDGE")] @@ -615,7 +696,7 @@ def test_synthesize_dedupes_duplicate_entities(): def test_synthesize_belief_and_retract_use_same_fact_id(): - turn = CompactTurn( + turn = ParsedTurn( beliefs=[("fact:drink", "tea")], retracts=["fact:drink"], ) @@ -627,7 +708,7 @@ def test_synthesize_belief_and_retract_use_same_fact_id(): def test_synthesize_escapes_quotes_in_text_and_name(): - turn = CompactTurn(entities=[("ent:a", 'Alice "Ace"')]) + turn = ParsedTurn(entities=[("ent:a", 'Alice "Ace"')]) dsl = _synthesize_dsl( turn, msg_id="m:0", session_id="s", role="user", text='She said "go".', @@ -639,7 +720,7 @@ def test_synthesize_escapes_quotes_in_text_and_name(): def test_synthesize_all_together_contract(): """End-to-end: messages + entity + belief + retract.""" - turn = CompactTurn( + turn = ParsedTurn( entities=[("ent:priya", 
"Priya")], beliefs=[("fact:color", "green")], retracts=["fact:color"], @@ -652,27 +733,23 @@ def test_synthesize_all_together_contract(): assert kinds == ["CREATE", "UPSERT", "CREATE", "RETRACT", "ASSERT"] -def test_compact_mode_requires_msg_id(tmp_path: Path): +def test_ingest_requires_msg_id(tmp_path: Path): skill = tmp_path / "skill.md" - skill.write_text("compact skill body") + skill.write_text("prompt body") model = tmp_path / "fake.gguf" model.write_bytes(b"") - ing = BonsaiIngestor(model_path=model, skill_path=skill, compact=True) - with pytest.raises(ValueError, match="compact=True ingest requires"): + ing = BonsaiIngestor(model_path=model, skill_path=skill) + with pytest.raises(ValueError, match="ingest requires an explicit msg_id"): ing.ingest("hello", dry_run=True) -def test_compact_mode_defaults_to_compact_skill_path(tmp_path: Path): - """When no skill_path is passed and compact=True, uses the compact default.""" - model = tmp_path / "fake.gguf" - model.write_bytes(b"") - - # Default compact skill path must exist in the repo or this raises; we - # accept that and just assert on the chosen path rather than instantiate. 
- from graphstore.bonsai_ingestor import _DEFAULT_COMPACT_SKILL_PATH, _DEFAULT_SKILL_PATH - assert _DEFAULT_COMPACT_SKILL_PATH != _DEFAULT_SKILL_PATH - assert "compact" in str(_DEFAULT_COMPACT_SKILL_PATH) +def test_default_prompt_path_ships_in_package(tmp_path: Path): + """Default prompt file lives inside the package and contains at least one @-verb.""" + from graphstore.bonsai_ingestor import _DEFAULT_PROMPT_PATH + assert _DEFAULT_PROMPT_PATH.exists() + body = _DEFAULT_PROMPT_PATH.read_text() + assert "@UPSERT" in body and "@REMEMBER" in body # -------------------------------------------------------------------- diff --git a/tools/skills/graphstore-bonsai-dsl-compact/SKILL.md b/tools/skills/graphstore-bonsai-dsl-compact/SKILL.md deleted file mode 100644 index 1f688df..0000000 --- a/tools/skills/graphstore-bonsai-dsl-compact/SKILL.md +++ /dev/null @@ -1,146 +0,0 @@ ---- -name: graphstore-bonsai-dsl-compact -description: Unified verb-positional caveman grammar covering every common GraphStore DSL operation. LLM emits 2-letter verbs + positional args, Python expands to full DSL. ~3-5x fewer output tokens than raw DSL on every path - ingest, query, walk, ops. -compatibility: graphstore >= 0.4.0 -metadata: - author: orkait - version: "5.0" - target_tokens: 900 - mode: unified-positional ---- - -Read the user turn. Output zero or more ops, one per line. No prose, no quotes (unless required inside a query string), no `` tags, no fences. - -Each line: ` [arg2...]`. Multi-word trailing args (names, query text) are allowed; the verb's shape fixes how Python splits the tokens. - -## Ingest (user said something about an entity / themselves) - -``` -U Upsert entity. Python auto-wires mentions edge from msg. -F User's first-person fact ("I", "my"). topic=snake_case. -D Drop a fact (retract). Requires matching known fact. -``` - -## Graph edges (explicit relationships between entities) - -``` -E Create edge with given kind. IDs include their prefix (ent:X or fact:X). 
-``` - -## Semantic retrieval (user asked a question) - -``` -RM REMEMBER (4-signal NL retrieval, default LIMIT 10) -SM SIMILAR TO (vector only, default LIMIT 10) -LX LEXICAL SEARCH (BM25 only, default LIMIT 10) -AQ ANSWER (LLM-answered recall) -``` - -## Structural walks (from a known anchor id) - -``` -RL RECALL FROM anchor DEPTH 2 (spreading activation) -TR TRAVERSE FROM anchor DEPTH 2 (deterministic walk) -AN ANCESTORS OF anchor DEPTH 3 -SG SUBGRAPH FROM anchor DEPTH 2 -``` - -## SYS / vault ops - -``` -SS SYS SNAPSHOT -SC SYS COMPACT -SH SYS HEALTH -ST SYS STATS -SX SYS EXPLAIN REMEMBER (dry-run a retrieval) -VS VAULT SYNC -``` - -## Rules - -- Third-person observations emit `U`, NOT `F`. Beliefs require first-person pronouns. -- Empty output is valid - emit nothing if nothing applies. -- If `### KNOWN FACTS` appears above, reuse those topic names exactly when updating same concept. -- Slugs and topics must be single tokens (lowercase + underscores). Names / values / query text can be multi-word. -- For query verbs, write the question as free text - no quotes, Python adds them. - ---- - -**Input:** "Kailash joined OpenAI." - -**Output:** -``` -U kailash Kailash -U openai OpenAI -``` - ---- - -**Input:** "Priya works at Flipkart since 2023 as a frontend engineer." - -**Output:** -``` -U priya Priya -U flipkart Flipkart -E ent:priya ent:flipkart works_at -``` - ---- - -**Input:** "My favorite color is blue." - -**Output:** -``` -F favorite_color blue -``` - ---- - -**Input (correction; known fact exists):** -``` -### KNOWN FACTS -[fact:favorite_drink] kind="belief" value="coffee" - -user: "Actually I prefer tea now." -``` -**Output:** -``` -D favorite_drink -F favorite_drink tea -``` - ---- - -**Input:** "Remember what I said about coffee." - -**Output:** -``` -RM what I said about coffee -``` - ---- - -**Input:** "Find messages similar to 'joining a startup'." 
- -**Output:** -``` -SM joining a startup -``` - ---- - -**Input:** "How is Priya connected to OpenAI?" - -**Output:** -``` -RL ent:priya -``` - ---- - -**Input:** "Take a snapshot." - -**Output:** -``` -SS -``` From 959124094a67239626b37a575fdaf554185fa571 Mon Sep 17 00:00:00 2001 From: Kailas Mahavarkar <66670953+KailasMahavarkar@users.noreply.github.com> Date: Tue, 21 Apr 2026 03:00:20 +0530 Subject: [PATCH 3/3] fix(bonsai): lite prompt hardening - personal-fact @ANSWER, conditional @RETRACT LongMemEval smoke revealed two real drift patterns in the lite prompt: 1. Spurious @RETRACT on unrelated turns: with KNOWN FACTS present, the model was emitting @RETRACT + @BELIEF even when the new turn was about a different topic entirely. The correction-flow example in the prompt was the attractor: model pattern-matched any new fact to a correction. 2. @RECALL misfire on personal-fact questions: "Which city did I move to last year?" emitted @RECALL location (wrong verb, bare anchor). Model thought the belief topic "location" was a valid walk anchor. Prompt changes: - VERB PICK RULE now distinguishes personal-fact questions ("Where did I ...?", "Which city did I ...?") which route to @ANSWER, from named-entity connection questions which route to @RECALL. - Added explicit rule: walk/path verbs (@RECALL, @TRAVERSE, @ANCESTORS, @DESCENDANTS, @SUBGRAPH, @PATH, @SHORTEST_PATH, @COMMON) REQUIRE a prefixed anchor id (ent:X / fact:X / msg:X). Bare topic names like "location" are not valid anchors. - Added explicit rule: @RETRACT only fires on correction trigger words ("actually", "not anymore", "changed to", "now prefer", "instead"). Unrelated new turns must NOT emit @RETRACT even if related beliefs are in KNOWN FACTS. - New NEGATIVE example showing KNOWN FACTS [fact:location]="Seattle" plus unrelated turn "I bought a new guitar" -> only @BELIEF purchase guitar. No retract. - Two new @ANSWER examples for personal-fact questions: "Which city did I move to last year?" 
and "What is my favorite color?". Verified by re-running tools/scripts style LongMemEval smoke on the fixture: both drift cases now produce correct ops (@BELIEF-only for unrelated turns, @ANSWER for personal-fact questions). Tests: 89/89 pass, no unit-test deltas (pure prompt change). --- src/graphstore/bonsai_dsl_prompt_lite.txt | 30 ++++++++++++++++++----- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/graphstore/bonsai_dsl_prompt_lite.txt b/src/graphstore/bonsai_dsl_prompt_lite.txt index 9a55fbe..42b76c6 100644 --- a/src/graphstore/bonsai_dsl_prompt_lite.txt +++ b/src/graphstore/bonsai_dsl_prompt_lite.txt @@ -3,9 +3,12 @@ Read user turn. Output zero or more ops, one per line. Every op MUST start with VERB PICK RULE (read first!) - Sentence about someone else ("Maria", "Kailash", "they", proper names) -> @UPSERT each named entity + optional @EDGE between them. NEVER @BELIEF. - Sentence about the user ("I", "my", "me") -> @BELIEF topic value. -- User asks a question -> @REMEMBER / @SIMILAR / @LEXICAL / @ANSWER. -- User corrects a prior belief (KNOWN FACTS block present) -> @RETRACT topic + @BELIEF topic new_value. -- User asks about connections -> @RECALL / @TRAVERSE / @ANCESTORS / @DESCENDANTS / @SUBGRAPH / @PATH / @SHORTEST_PATH / @COMMON. +- Question asking for a personal fact ("Where did I ...?", "What is my ...?", "When did I ...?", "Which X did I ...?") -> @ANSWER question. +- Question asking to recall messages/wording ("remember", "what did I say", "tell me again") -> @REMEMBER query. +- Question asking for messages that feel similar -> @SIMILAR query. +- Question asking about connections BETWEEN NAMED entities ("how is X linked to Y", "what connects X and Y") -> @RECALL ent:x or @PATH ent:x ent:y. +- Walk / path verbs REQUIRE prefixed anchor ids (ent:X or fact:X or msg:X). NEVER pass a bare topic or plain word like "location" as an anchor. +- @RETRACT fires ONLY when the user explicitly corrects a prior fact. 
Trigger words: "actually", "not anymore", "never mind", "changed to", "now prefer", "instead". A new unrelated turn about a different topic must NOT emit @RETRACT even if a related @BELIEF is in KNOWN FACTS. VERB TABLE (pick one per line; emit exactly the shape shown) @@ -57,7 +60,7 @@ My dentist is Dr. Chen. I prefer tea to coffee. @BELIEF drink_preference tea -(correction via KNOWN FACTS block) +(correction via KNOWN FACTS - trigger word "Actually") KNOWN FACTS: [fact:lunch_spot] kind="belief" value="cafe_paloma" @@ -65,6 +68,13 @@ user: Actually I go to Cafe Centro for lunch now. @RETRACT lunch_spot @BELIEF lunch_spot Cafe Centro +(NO correction: unrelated turn, do NOT retract prior fact) +KNOWN FACTS: +[fact:location] kind="belief" value="Seattle" + +user: I bought a new guitar yesterday. +@BELIEF purchase guitar + (retrieval: "remember" -> NL recall) Remember what I told you last week. @REMEMBER what I told you last week @@ -73,11 +83,19 @@ Remember what I told you last week. Find messages that feel like the argument we had. @SIMILAR the argument we had -(retrieval: direct question -> synthesized answer) +(retrieval: direct question about named entity -> synthesized answer) Where does Maria work? @ANSWER where does Maria work -(walk: "connected" / "linked" from an anchor) +(retrieval: personal-fact question -> @ANSWER, NOT @RECALL) +Which city did I move to last year? +@ANSWER which city did I move to last year + +(retrieval: personal-fact question -> @ANSWER) +What is my favorite color? +@ANSWER what is my favorite color + +(walk: "connected" / "linked" BETWEEN NAMED entities - note prefixed anchor) How is Nikhil linked to Stripe? @RECALL ent:nikhil