diff --git a/README.md b/README.md index becb5cf8..e05f55c2 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,11 @@ gbrain import ~/notes/ # index your markdown gbrain query "what themes show up across my notes?" ``` +Short note for Roger's M5 + Intel Mac setup: +- both machines keep a repo checkout at `~/Projects/gbrain` +- the active `gbrain` runtime is symlinked to that repo, not edited directly inside Bun's install tree +- run `~/.local/bin/gbrain-update.sh` on either machine to pull, rebuild, run the focused test suite, and verify `gbrain stats` + ``` 3 results (hybrid search, 0.12s): diff --git a/docs/plans/2026-04-18-gbrain-v012-file-by-file-port-checklist.md b/docs/plans/2026-04-18-gbrain-v012-file-by-file-port-checklist.md new file mode 100644 index 00000000..332af09c --- /dev/null +++ b/docs/plans/2026-04-18-gbrain-v012-file-by-file-port-checklist.md @@ -0,0 +1,492 @@ +# GBrain v0.12 Graph Layer — Exact File-by-File Port Checklist + +Date: 2026-04-18 + +## Goal + +Port the useful graph-layer pieces from `upstream/garrytan/link-timeline-extract` onto Roger's current branch without importing the whole v0.12 / Minions / migration surface. + +Current branch: +- `roger/m5-gbrain-hotfixes-2026-04-15` + +Source branch: +- `upstream/garrytan/link-timeline-extract` + +Recommended target branch to create: +- `roger/gbrain-v012-graph-layer-selective-port` + +## High-level strategy + +Do this in two waves. 
+ +### Wave 1 — High-confidence graph layer only +Ship: +- typed graph traversal +- `graph-query` CLI +- self-wiring auto-link on `put_page` +- DB-source backfill extraction +- schema updates required for typed links and timeline dedup + +Do NOT ship yet: +- Minions/jobs queue +- full v0.12 migration orchestrator +- benchmark/docs churn +- backlink-boost ranking changes + +### Wave 2 — Search/heuristic improvements after Wave 1 is stable +Consider porting later: +- richer `inferLinkType` heuristics +- founder-of fix +- optional backlink boost, but only if combined carefully with Roger's canonical exact-ranking logic + +## Non-negotiable preservation rules + +Keep Roger's current local fixes intact: +1. Obsidian wikilink extraction in `src/commands/extract.ts` +2. canonical exact-entity ranking in `src/core/search/hybrid.ts` +3. embedding coercion / NaN prevention in: + - `src/core/utils.ts` + - `src/core/pglite-engine.ts` + - `src/core/postgres-engine.ts` + +If a candidate hunk conflicts with one of those, Roger's current behavior wins unless explicitly reworked. + +## Commit shortlist + +Port now, manually: +- `29006a5` — graph schema/foundation +- `f22dcb2` — auto-link + DB extract + link-extraction library +- `f933b0d` — `graph-query` CLI + +Port after Wave 1 stabilizes: +- `038f1ef` — better `inferLinkType` heuristics +- `cc028a7` — `founder of` fix + +Bring tests from: +- `9520a80` + +Skip for now: +- `dc52464`, `036ed3f`, `675f901` +- all Minions/jobs commits +- benchmark/doc packaging commits + +## Exact file-by-file plan + +### 1. 
`src/core/link-extraction.ts` +Source: +- add from `f22dcb2` +- later refine from `038f1ef` and `cc028a7` + +Action: +- new file, port in full as a starting point + +Why: +- this is the center of the graph-layer extractor: + - `extractEntityRefs` + - `extractPageLinks` + - `inferLinkType` + - `isAutoLinkEnabled` + +Required follow-up BEFORE enabling on Roger's vault: +- extend it to support Obsidian wikilinks, because the upstream file does not appear to include Roger's `[[wikilink]]` handling +- port Roger's current Obsidian logic from `src/commands/extract.ts` into this shared extractor so both: + - `put_page` auto-link + - `extract --source db` + understand Obsidian links + +Keep: +- upstream typed-link inference structure + +Add on top: +- Obsidian wikilink parsing from Roger branch +- slug normalization behavior that worked on Roger's vault + +Tests to pair with it: +- `test/link-extraction.test.ts` +- Roger's current `test/extract.test.ts` Obsidian cases + +### 2. `src/commands/graph-query.ts` +Source: +- add from `f933b0d` + +Action: +- port nearly verbatim + +Why: +- additive new command +- low conflict risk + +Dependency: +- requires `traversePaths()` and `GraphPath` + +Tests: +- `test/graph-query.test.ts` + +### 3. `src/cli.ts` +Source: +- selective lines from `f933b0d` +- maybe tiny pieces from `29006a5` + +Action: +- manual merge only + +Port exactly: +- add `graph-query` to CLI command set +- add dispatch case that imports `./commands/graph-query.ts` +- add help text for `graph-query` + +Do NOT copy wholesale: +- candidate branch also adds `jobs`, `apply-migrations`, `skillpack-check`, and other unrelated command changes + +Preserve: +- Roger's existing local CLI/runtime behavior + +### 4. 
`src/core/types.ts`
+Source:
+- selective pieces from `29006a5`
+
+Action:
+- manual merge
+
+Port:
+- `GraphPath` type
+- any graph-health typing needed for traversal responses if required by engine signatures
+
+Do not pull unrelated type churn unless needed to compile.
+
+### 5. `src/core/engine.ts`
+Source:
+- selective pieces from `29006a5`
+
+Action:
+- manual merge
+
+Port now:
+- `getAllSlugs(): Promise<Set<string>>`
+- `traversePaths(...)`
+- optional: `removeLink(..., linkType?)`
+- optional: `addTimelineEntry(..., opts?)`
+
+Defer if staying minimal:
+- `getBacklinkCounts(...)`
+- `executeRaw(...)`
+
+Reason:
+- `graph-query` needs `traversePaths`
+- DB extract path benefits from `getAllSlugs`
+- typed link reconciliation benefits from link-type-aware remove behavior
+- backlink-count support is mainly for ranking, which should be deferred until after graph-layer stabilization
+
+### 6. `src/core/pglite-engine.ts`
+Source:
+- selective pieces from `29006a5`
+
+Action:
+- manual merge
+
+Port now:
+- `traversePaths(...)`
+- `getAllSlugs()` if missing in current implementation
+- link-type-aware `removeLink(...)` behavior if needed
+- any timeline dedup support required by the updated schema/migrations
+
+Defer initially:
+- `getBacklinkCounts(...)`
+- `executeRaw(...)` unless needed for your chosen extract implementation
+- health-metric expansion (`link_coverage`, `timeline_coverage`, `most_connected`) unless you want to adopt the whole health model now
+
+Preserve explicitly:
+- current `coerceEmbeddingVector` import and usage
+- current `getEmbeddingsByChunkIds(...)` behavior that avoids NaN regressions
+- current `rowToChunk(...)` call patterns that rely on coercion
+
+### 7. 
`src/core/postgres-engine.ts` +Source: +- selective pieces from `29006a5` +- tiny hook from `f22dcb2` + +Action: +- manual merge + +Port now: +- `traversePaths(...)` +- `getAllSlugs()` if missing +- link-type-aware `removeLink(...)` behavior if needed +- timeline-entry dedup support if needed by schema changes + +Defer initially: +- `getBacklinkCounts(...)` +- `executeRaw(...)` unless needed immediately +- broader health-metric expansion + +Preserve explicitly: +- Roger's current embedding handling +- no regression in `getEmbeddingsByChunkIds(...)` + +### 8. `src/core/import-file.ts` +Source: +- selective pieces from `29006a5` +- note: this file is also needed by `f22dcb2` behavior + +Action: +- manual merge + +Port: +- `ParsedPage` result metadata so `put_page` auto-link can reuse parsed content +- `parsedPage` return on both: + - `status='imported'` + - `status='skipped'` + +Preserve: +- Roger's current `noEmbed` behavior + +Reason: +- current branch already has `noEmbed` +- candidate adds `parsedPage`, which is the piece you need + +### 9. `src/core/operations.ts` +Source: +- selective pieces from `f22dcb2` +- small dependencies from `29006a5` + +Action: +- manual merge only + +Port now: +- updated `put_page` description if desired +- `put_page` post-hook that runs auto-link after import +- `runAutoLink(...)` +- imports of: + - `extractPageLinks` + - `isAutoLinkEnabled` +- support for using `parsedPage` from `importFromContent` + +Do NOT port wholesale: +- candidate file also contains unrelated security hardening and upload validation helpers: + - `validateUploadPath` + - `validatePageSlug` + - `validateFilename` +- unless you explicitly want those now, keep them out of this selective graph-layer pass + +Important local adjustment: +- `runAutoLink(...)` must use a link extractor that understands Roger's Obsidian wikilinks before enabling on real content + +### 10. 
`src/commands/extract.ts` +Source: +- selective pieces from `f22dcb2` + +Action: +- manual merge only + +Keep from Roger branch: +- `extractObsidianWikilinks(...)` +- filesystem extraction path +- current slug normalization fixes +- current tests covering Obsidian links and uppercase filenames + +Port from candidate: +- `runExtractCore(...)` +- `--source db` path +- `extractLinksFromDB(...)` +- `extractTimelineFromDB(...)` +- any safe command-line parsing additions needed for DB-source extraction + +Important rule: +- if DB-source extraction uses `link-extraction.ts`, that shared extractor must be Obsidian-aware before you turn it on for the Winston vault + +### 11. `src/schema.sql` +Source: +- selective graph-specific pieces from `29006a5` + +Action: +- manual merge, minimal only + +Port now: +- change links uniqueness from: + - `UNIQUE(from_page_id, to_page_id)` + to: + - `UNIQUE(from_page_id, to_page_id, link_type)` +- add timeline dedup index: + - `(page_id, date, summary)` +- optionally drop the legacy timeline trigger if it is obsolete under the new graph/timeline approach + +Do NOT port: +- `minion_jobs` +- `minion_inbox` +- `minion_attachments` +- pg notify trigger for minion jobs +- related RLS/minion additions tied to the queue system + +### 12. 
`src/core/migrate.ts` +Source: +- do NOT port wholesale from candidate + +Action: +- create new local graph-only migrations on top of Roger's current migration chain + +Reason: +- Roger branch currently stops at version 4 +- candidate branch versions 5/6/7 are Minions-related +- graph layer is renumbered there to 8/9/10 because Minions landed first +- importing candidate migrate file as-is would drag in Minions migrations you do not want + +Recommended local plan: +- add new local migration 5: + - widen links uniqueness to include `link_type` + - deduplicate any conflicting rows first +- add new local migration 6: + - add timeline dedup index + - remove old timeline trigger if safe + +Only add a migration 7 if you need an app-level graph backfill helper, but prefer keeping backfill as a command rather than forced migration. + +### 13. `src/core/pglite-schema.ts` +### 14. `src/core/schema-embedded.ts` +Source: +- do NOT port wholesale from candidate + +Action: +- manually mirror only the graph-specific schema changes from `src/schema.sql` + +Port: +- links uniqueness by `link_type` +- timeline dedup index + +Do NOT port: +- any `minion_*` tables or indexes + +### 15. `src/core/search/hybrid.ts` +Source: +- candidate branch has major changes from `29006a5` + +Action: +- do NOT port in Wave 1 + +Reason: +- candidate branch removes Roger's canonical exact-ranking logic: + - `EXACT_CANONICAL_PATH_BOOST` + - `applyQueryAwareBoosts(...)` +- candidate branch replaces this with backlink boosting +- this is the highest regression risk for Roger's real vault + +Wave 1 rule: +- keep Roger's current `hybrid.ts` unchanged + +Wave 2 optional: +- if graph layer stabilizes, manually add `applyBacklinkBoost(...)` under the existing exact-ranking logic rather than replacing it +- do not adopt candidate ranking logic wholesale + +### 16. 
`src/core/utils.ts` +Source: +- candidate branch is not better than Roger's current version here + +Action: +- keep Roger's current file unchanged + +Reason: +- current branch has `coerceEmbeddingVector(...)` +- candidate branch appears not to preserve the stronger coercion helper + +### 17. Tests to bring over now +Source: +- `9520a80` + +Action: +- port these first or very early + +Bring now: +- `test/extract-db.test.ts` +- `test/graph-query.test.ts` +- `test/link-extraction.test.ts` +- `test/e2e/graph-quality.test.ts` + +Merge carefully if needed: +- `test/pglite-engine.test.ts` + +Keep Roger's current tests: +- `test/extract.test.ts` +- `test/search.test.ts` +- `test/utils.test.ts` +- `test/e2e/search-quality.test.ts` + +## Suggested Wave 1 implementation order + +1. Add tests: +- `test/extract-db.test.ts` +- `test/graph-query.test.ts` +- `test/link-extraction.test.ts` +- `test/e2e/graph-quality.test.ts` + +2. Add new file: +- `src/core/link-extraction.ts` +- immediately patch it to support Obsidian wikilinks before enabling it + +3. Add graph traversal foundations: +- `src/core/types.ts` +- `src/core/engine.ts` +- `src/core/pglite-engine.ts` +- `src/core/postgres-engine.ts` + +4. Add `graph-query`: +- `src/commands/graph-query.ts` +- `src/cli.ts` + +5. Add `importFromContent` parsed-page support: +- `src/core/import-file.ts` + +6. Add auto-link hook and DB extract: +- `src/core/operations.ts` +- `src/commands/extract.ts` + +7. Add graph-only schema migrations: +- `src/schema.sql` +- `src/core/migrate.ts` +- `src/core/pglite-schema.ts` +- `src/core/schema-embedded.ts` + +8. Run both old and new tests + +## Suggested Wave 2 implementation order + +1. Port `038f1ef` heuristic improvements into `src/core/link-extraction.ts` +2. Port `cc028a7` founder-of fix +3. Re-run graph and exact-entity tests +4. 
Only then consider optional backlink boosting in `src/core/search/hybrid.ts` + +## Minimum verification commands + +```bash +cd ~/Projects/gbrain +bun test test/extract.test.ts test/utils.test.ts test/search.test.ts test/e2e/search-quality.test.ts +bun test test/extract-db.test.ts test/graph-query.test.ts test/link-extraction.test.ts test/e2e/graph-quality.test.ts +``` + +Smoke checks after Wave 1: + +```bash +gbrain query "Roger Gimbel" +gbrain query "Rodaco" +gbrain query "SelfGrowth" +gbrain graph-query companies/rodaco --type works_at --direction in +``` + +## Acceptance criteria + +Wave 1 is good enough when: +- Obsidian wikilinks still extract correctly +- exact canonical ranking is unchanged or better +- no NaN scores +- `graph-query` works +- `put_page` auto-link works +- DB-source extract works +- no Minions/jobs code is pulled in + +## Bottom line + +Port the graph layer, not the branch. + +Exact instruction: +- manually merge graph foundations and graph-query +- adapt the shared extractor to Roger's Obsidian reality before enabling auto-link +- keep Roger's ranking and embedding fixes untouched in Wave 1 +- treat backlink boosting as a separate later experiment, not part of the first port diff --git a/docs/plans/2026-04-18-gbrain-v012-graph-layer-selective-port.md b/docs/plans/2026-04-18-gbrain-v012-graph-layer-selective-port.md new file mode 100644 index 00000000..0f2cce34 --- /dev/null +++ b/docs/plans/2026-04-18-gbrain-v012-graph-layer-selective-port.md @@ -0,0 +1,234 @@ +# GBrain v0.12 Selective Port Plan + +Date: 2026-04-18 + +## Goal + +Adopt the useful parts of the GBrain v0.12 graph layer from `upstream/garrytan/link-timeline-extract` without doing a full runtime cutover and without regressing Roger's current local fixes for: +- Obsidian wikilink extraction +- canonical exact-entity ranking +- embedding coercion / NaN score prevention + +## Source branches + +Current branch: +- `roger/m5-gbrain-hotfixes-2026-04-15` + +Candidate upstream branch: 
+- `upstream/garrytan/link-timeline-extract` + +Shared base: +- `b7e3005` (`v0.10.1` era) + +## Decision summary + +Do not wholesale cherry-pick the branch. + +Use a mixed strategy: +- manual port for foundational graph/schema/engine changes that overlap with Roger's hotfixes +- direct cherry-pick or file-copy for mostly additive tests +- skip Minions/jobs/migration/docs churn for now + +## Recommended buckets + +### Bucket A — Port manually, not raw cherry-pick + +These commits are valuable but overlap too heavily with Roger's branch. + +1. `29006a5` +- `feat(schema): graph layer migrations v5/v6/v7 + GraphPath/health types` +- Why valuable: + - adds typed graph path model + - adds engine support for traversal/backlink counts + - foundational schema for graph-query +- Why not cherry-pick directly: + - overlaps with Roger-edited files: + - `src/core/pglite-engine.ts` + - `src/core/postgres-engine.ts` + - `src/core/search/hybrid.ts` + - `src/cli.ts` + - candidate branch appears to remove Roger's canonical exact-ranking logic in `hybrid.ts` +- Manual-port target files: + - `src/core/engine.ts` + - `src/core/types.ts` + - selective parts of `src/core/pglite-engine.ts` + - selective parts of `src/core/postgres-engine.ts` + - selective schema changes from: + - `src/core/pglite-schema.ts` + - `src/core/schema-embedded.ts` + - `src/schema.sql` + - `src/core/migrate.ts` +- Explicit preserve rules: + - keep Roger's `hybrid.ts` canonical exact-ranking behavior + - keep Roger's embedding coercion fixes in utils/engine code + +2. 
`f22dcb2` +- `feat(graph): auto-link on put_page + extract --source db + security hardening` +- Why valuable: + - adds auto-link post-hook on page writes + - adds `link-extraction.ts` + - adds DB-backed extract path +- Why not cherry-pick directly: + - overlaps with Roger-edited `src/commands/extract.ts` + - candidate extract path appears to drop Roger's explicit Obsidian wikilink support +- Manual-port target files: + - mostly take `src/core/link-extraction.ts` + - selectively merge `src/core/operations.ts` + - selectively merge DB-source extract logic from `src/commands/extract.ts` +- Explicit preserve rules: + - keep Roger's `extractObsidianWikilinks` support and tests + - keep Roger's slug normalization behavior where it fixed uppercase/Obsidian issues + +3. `f933b0d` +- `feat(cli): graph-query command + skill updates + v0.10.3 migration file` +- Why valuable: + - adds the actual `gbrain graph-query` CLI +- Why not cherry-pick directly: + - depends on graph traversal support from `29006a5` + - touches `src/cli.ts`, already modified locally +- Manual-port target files: + - `src/commands/graph-query.ts` + - small `src/cli.ts` wiring only + +### Bucket B — Good follow-up logic once Bucket A is stable + +4. `038f1ef` +- `fix(link-extraction): inferLinkType prose precision — type accuracy 70.7% -> 88.5%` +- Use after the graph layer is working. +- Port only the heuristic changes in `src/core/link-extraction.ts` and the related tests. + +5. `cc028a7` +- `fix(link-extraction): "founder of" pattern + benchmark methodology fix → recall jumps to 93%` +- Also a good follow-up once `link-extraction.ts` is in place. +- Port only the heuristic tweak plus related tests. + +### Bucket C — Bring over tests early + +6. `9520a80` +- `test(graph): unit + e2e + 80-page A/B/C benchmark for graph layer` +- Mostly additive and high-value. 
+- Safest pieces to bring first: + - `test/extract-db.test.ts` + - `test/graph-query.test.ts` + - `test/link-extraction.test.ts` + - `test/e2e/graph-quality.test.ts` +- Leave benchmark mega-file optional if it slows iteration. +- `test/pglite-engine.test.ts` will likely need manual merge because Roger already has local edits elsewhere in engine behavior. + +### Bucket D — Skip for now + +Do not port these yet: +- `dc52464` / `036ed3f` + - migration/orchestrator renumbering and v0.12 packaging +- `7d61134`, `ddcd35a`, `06e0888`, `675f901` + - benchmark/doc packaging only +- `26a7203`, `d861336`, `53d63b6` and other Minions/jobs-related changes + - too large and orthogonal to the graph-layer goal + +## Port order + +### Task 1: Create a new integration branch +- Branch from current Roger branch +- Suggested name: + - `roger/gbrain-v012-graph-layer-selective-port` + +### Task 2: Add tests first +- Bring in additive graph tests from `9520a80` +- Keep them failing initially if features are missing +- Prefer to land: + - `test/extract-db.test.ts` + - `test/graph-query.test.ts` + - `test/link-extraction.test.ts` + - `test/e2e/graph-quality.test.ts` + +### Task 3: Port graph foundations from `29006a5` +- Manually merge schema/types/engine support +- Avoid replacing Roger's local ranking and embedding fixes +- Required outcome: + - traversal path type exists + - engine traversal exists + - backlink count support exists + +### Task 4: Port graph wiring from `f22dcb2` +- Add `src/core/link-extraction.ts` +- Add `put_page` auto-link hook selectively +- Add DB-source extract path selectively +- Keep Roger's Obsidian wikilink extraction path intact + +### Task 5: Port `graph-query` from `f933b0d` +- Add `src/commands/graph-query.ts` +- Wire `src/cli.ts` + +### Task 6: Improve type inference with `038f1ef` and `cc028a7` +- Port only the heuristic logic and matching tests + +### Task 7: Verify against Roger's real concerns +Minimum checks after each stage: +- no NaN scores +- 
exact canonical ranking still prefers summary/status/readme/index pages +- Obsidian wikilinks still extract correctly +- graph-query returns useful typed traversals + +## Files likely to need careful manual merge + +Highest conflict risk: +- `src/commands/extract.ts` +- `src/core/operations.ts` +- `src/core/pglite-engine.ts` +- `src/core/postgres-engine.ts` +- `src/core/search/hybrid.ts` +- `src/core/utils.ts` +- `src/cli.ts` + +Likely additive / safer: +- `src/commands/graph-query.ts` +- `src/core/link-extraction.ts` +- `test/extract-db.test.ts` +- `test/graph-query.test.ts` +- `test/link-extraction.test.ts` +- `test/e2e/graph-quality.test.ts` + +## Verification commands + +Run after each major integration step: + +```bash +cd ~/Projects/gbrain +bun test test/extract.test.ts test/utils.test.ts test/search.test.ts test/e2e/search-quality.test.ts +``` + +After graph port pieces land, also run: + +```bash +cd ~/Projects/gbrain +bun test test/extract-db.test.ts test/graph-query.test.ts test/link-extraction.test.ts test/e2e/graph-quality.test.ts +``` + +And verify live-query behavior in staging: + +```bash +gbrain query "Roger Gimbel" +gbrain query "Rodaco" +gbrain query "SelfGrowth" +gbrain graph-query companies/rodaco --type works_at --direction in +``` + +## Acceptance criteria + +A successful selective port preserves all of Roger's current wins while adding the v0.12 graph value: +- Obsidian wikilinks still extract correctly +- canonical exact entity search still prefers summary/status/readme/index pages +- no NaN score regressions +- `gbrain graph-query` works +- auto-link on page write works +- typed link inference works for at least the main relation types +- Minions/jobs code is not pulled into the runtime yet + +## Recommendation + +Best path: port a narrow graph layer, not the whole branch. 
+ +Specifically: +- manually port `29006a5`, `f22dcb2`, `f933b0d` +- then selectively port heuristic improvements from `038f1ef` and `cc028a7` +- use tests from `9520a80` to keep the integration honest diff --git a/scripts/update-runtime.sh b/scripts/update-runtime.sh new file mode 100755 index 00000000..29de39f3 --- /dev/null +++ b/scripts/update-runtime.sh @@ -0,0 +1,107 @@ +#!/bin/bash +set -euo pipefail + +REPO_DIR="${GBRAIN_REPO_DIR:-$HOME/Projects/gbrain}" +BUN_INSTALL="${BUN_INSTALL:-$HOME/.bun}" +RUNTIME_DIR="$BUN_INSTALL/install/global/node_modules/gbrain" +BIN_LINK="$BUN_INSTALL/bin/gbrain" +BACKUP_ROOT="${GBRAIN_BACKUP_ROOT:-$HOME/.gbrain/runtime-backups}" +TEST_CMD=(bun test test/extract.test.ts test/utils.test.ts test/search.test.ts test/e2e/search-quality.test.ts) + +export BUN_INSTALL +export PATH="$BUN_INSTALL/bin:$HOME/.nvm/versions/node/v24.4.0/bin:/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:$PATH" + +need_cmd() { + command -v "$1" >/dev/null 2>&1 || { + echo "Missing required command: $1" >&2 + exit 1 + } +} + +need_cmd git +need_cmd python3 +need_cmd bun + +[ -d "$REPO_DIR/.git" ] || { + echo "Repo not found: $REPO_DIR" >&2 + exit 1 +} + +cd "$REPO_DIR" + +if [ -n "$(git status --porcelain)" ]; then + echo "Refusing to update with a dirty repo: $REPO_DIR" >&2 + git status --short >&2 || true + exit 1 +fi + +BRANCH="$(git branch --show-current)" +PREV_REV="$(git rev-parse HEAD)" +mkdir -p "$(dirname "$RUNTIME_DIR")" "$(dirname "$BIN_LINK")" "$BACKUP_ROOT" + +echo "Repo: $REPO_DIR" +echo "Branch: $BRANCH" +echo "Starting rev: $PREV_REV" + +git fetch --all --prune +git pull --ff-only +bun install --frozen-lockfile +chmod +x src/cli.ts + +if [ -e "$RUNTIME_DIR" ] && [ ! 
-L "$RUNTIME_DIR" ]; then + TS="$(date +%Y%m%d-%H%M%S)" + BACKUP_DIR="$BACKUP_ROOT/gbrain-global-pre-relink-$TS" + mv "$RUNTIME_DIR" "$BACKUP_DIR" + echo "Backed up existing runtime dir to: $BACKUP_DIR" +fi + +if [ -L "$RUNTIME_DIR" ]; then + CURRENT_TARGET="$(python3 - <<'PY' +import os +p = os.path.join(os.environ['BUN_INSTALL'], 'install/global/node_modules/gbrain') +print(os.path.realpath(os.path.expanduser(p))) +PY +)" + if [ "$CURRENT_TARGET" != "$REPO_DIR" ]; then + rm "$RUNTIME_DIR" + ln -s "$REPO_DIR" "$RUNTIME_DIR" + fi +else + ln -s "$REPO_DIR" "$RUNTIME_DIR" +fi + +ln -sf "$RUNTIME_DIR/src/cli.ts" "$BIN_LINK" + +rollback() { + echo "Verification failed. Rolling back to $PREV_REV" >&2 + git reset --hard "$PREV_REV" >&2 + bun install --frozen-lockfile >&2 + chmod +x src/cli.ts >&2 || true +} + +if ! "${TEST_CMD[@]}"; then + rollback + exit 1 +fi + +if ! "$BIN_LINK" --help >/dev/null; then + rollback + exit 1 +fi + +if ! "$BIN_LINK" stats >/dev/null; then + rollback + exit 1 +fi + +FINAL_REV="$(git rev-parse HEAD)" +REALPATH_NOW="$(python3 - <<'PY' +import os +p = os.path.join(os.environ['BUN_INSTALL'], 'bin/gbrain') +print(os.path.realpath(os.path.expanduser(p))) +PY +)" + +echo "Updated rev: $FINAL_REV" +echo "Active runtime: $REALPATH_NOW" +echo "Verification: OK" diff --git a/src/cli.ts b/src/cli.ts old mode 100644 new mode 100755 index 33f149f4..cf92d600 --- a/src/cli.ts +++ b/src/cli.ts @@ -18,7 +18,7 @@ for (const op of operations) { } // CLI-only commands that bypass the operation layer -const CLI_ONLY = new Set(['init', 'upgrade', 'post-upgrade', 'check-update', 'integrations', 'publish', 'check-backlinks', 'lint', 'report', 'import', 'export', 'files', 'embed', 'serve', 'call', 'config', 'doctor', 'migrate', 'eval', 'sync', 'extract', 'features', 'autopilot']); +const CLI_ONLY = new Set(['init', 'upgrade', 'post-upgrade', 'check-update', 'integrations', 'publish', 'check-backlinks', 'lint', 'report', 'import', 'export', 'files', 'embed', 'serve', 
'call', 'config', 'doctor', 'migrate', 'eval', 'sync', 'extract', 'features', 'autopilot', 'graph-query']); async function main() { const args = process.argv.slice(2); @@ -367,6 +367,11 @@ async function handleCliOnly(command: string, args: string[]) { await runAutopilot(engine, args); return; // autopilot doesn't disconnect (long-running) } + case 'graph-query': { + const { runGraphQuery } = await import('./commands/graph-query.ts'); + await runGraphQuery(engine, args); + break; + } } } finally { if (command !== 'serve') await engine.disconnect(); @@ -454,6 +459,8 @@ LINKS unlink Remove link backlinks Incoming links graph [--depth N] Traverse link graph + graph-query [--type T] Edge-based traversal with type/direction filters + [--depth N] [--direction in|out|both] TAGS tags List tags diff --git a/src/commands/extract.ts b/src/commands/extract.ts index 442a0de7..6afd1467 100644 --- a/src/commands/extract.ts +++ b/src/commands/extract.ts @@ -10,7 +10,10 @@ import { readFileSync, readdirSync, lstatSync, existsSync } from 'fs'; import { join, relative, dirname } from 'path'; import type { BrainEngine } from '../core/engine.ts'; +import type { PageType } from '../core/types.ts'; import { parseMarkdown } from '../core/markdown.ts'; +import { slugifyPath } from '../core/sync.ts'; +import { extractPageLinks, parseTimelineEntries } from '../core/link-extraction.ts'; // --- Types --- @@ -35,6 +38,12 @@ interface ExtractResult { pages_processed: number; } +function normalizeDateKey(date: unknown): string { + if (typeof date === 'string') return date.slice(0, 10); + if (date instanceof Date) return date.toISOString().slice(0, 10); + return String(date).slice(0, 10); +} + // --- Shared walker --- export function walkMarkdownFiles(dir: string): { path: string; relPath: string }[] { @@ -71,6 +80,26 @@ export function extractMarkdownLinks(content: string): { name: string; relTarget return results; } +/** Extract Obsidian-style wikilinks like [[slug]] or [[slug|Alias]] */ +export 
function extractObsidianWikilinks(content: string): { name: string; target: string }[] { + const results: { name: string; target: string }[] = []; + const pattern = /\[\[([^\]]+)\]\]/g; + let match; + while ((match = pattern.exec(content)) !== null) { + const raw = match[1].replace(/\\\|/g, '|').trim(); + const pipeIdx = raw.indexOf('|'); + const label = pipeIdx >= 0 ? raw.slice(pipeIdx + 1).trim() : raw; + const target = (pipeIdx >= 0 ? raw.slice(0, pipeIdx) : raw) + .split('#')[0] + .split('^')[0] + .trim() + .replace(/\.md$/i, ''); + if (!target) continue; + results.push({ name: label || target, target }); + } + return results; +} + /** Infer link type from directory structure */ function inferLinkType(fromDir: string, toDir: string, frontmatter?: Record): string { const from = fromDir.split('/')[0]; @@ -123,12 +152,12 @@ export function extractLinksFromFile( content: string, relPath: string, allSlugs: Set, ): ExtractedLink[] { const links: ExtractedLink[] = []; - const slug = relPath.replace('.md', ''); + const slug = slugifyPath(relPath); const fileDir = dirname(relPath); const fm = parseFrontmatterFromContent(content, relPath); for (const { name, relTarget } of extractMarkdownLinks(content)) { - const resolved = join(fileDir, relTarget).replace('.md', ''); + const resolved = slugifyPath(join(fileDir, relTarget)); if (allSlugs.has(resolved)) { links.push({ from_slug: slug, to_slug: resolved, @@ -138,6 +167,17 @@ export function extractLinksFromFile( } } + for (const { name, target } of extractObsidianWikilinks(content)) { + const resolved = slugifyPath(target.includes('/') ? 
target : join(fileDir, target)); + if (allSlugs.has(resolved)) { + links.push({ + from_slug: slug, to_slug: resolved, + link_type: inferLinkType(fileDir, dirname(resolved), fm), + context: `wikilink: [[${name}]]`, + }); + } + } + links.push(...extractFrontmatterLinks(slug, fm)); return links; } @@ -178,30 +218,62 @@ export async function runExtract(engine: BrainEngine, args: string[]) { const subcommand = args[0]; const dirIdx = args.indexOf('--dir'); const brainDir = (dirIdx >= 0 && dirIdx + 1 < args.length) ? args[dirIdx + 1] : '.'; + const sourceIdx = args.indexOf('--source'); + const source = (sourceIdx >= 0 && sourceIdx + 1 < args.length) ? args[sourceIdx + 1] : 'fs'; + const typeIdx = args.indexOf('--type'); + const typeFilter = (typeIdx >= 0 && typeIdx + 1 < args.length) ? (args[typeIdx + 1] as PageType) : undefined; + const sinceIdx = args.indexOf('--since'); + const since = (sinceIdx >= 0 && sinceIdx + 1 < args.length) ? args[sinceIdx + 1] : undefined; const dryRun = args.includes('--dry-run'); const jsonMode = args.includes('--json'); if (!subcommand || !['links', 'timeline', 'all'].includes(subcommand)) { - console.error('Usage: gbrain extract [--dir ] [--dry-run] [--json]'); + console.error('Usage: gbrain extract [--source fs|db] [--dir ] [--dry-run] [--json] [--type T] [--since DATE]'); process.exit(1); } - if (!existsSync(brainDir)) { + if (source !== 'fs' && source !== 'db') { + console.error(`Invalid --source: ${source}. Must be 'fs' or 'db'.`); + process.exit(1); + } + + if (since !== undefined) { + const sinceMs = new Date(since).getTime(); + if (!Number.isFinite(sinceMs)) { + console.error(`Invalid --since date: "${since}". 
Must be a parseable date.`); + process.exit(1); + } + } + + if (source === 'fs' && !existsSync(brainDir)) { console.error(`Directory not found: ${brainDir}`); process.exit(1); } const result: ExtractResult = { links_created: 0, timeline_entries_created: 0, pages_processed: 0 }; - if (subcommand === 'links' || subcommand === 'all') { - const r = await extractLinksFromDir(engine, brainDir, dryRun, jsonMode); - result.links_created = r.created; - result.pages_processed = r.pages; - } - if (subcommand === 'timeline' || subcommand === 'all') { - const r = await extractTimelineFromDir(engine, brainDir, dryRun, jsonMode); - result.timeline_entries_created = r.created; - result.pages_processed = Math.max(result.pages_processed, r.pages); + if (source === 'db') { + if (subcommand === 'links' || subcommand === 'all') { + const r = await extractLinksFromDB(engine, dryRun, jsonMode, typeFilter, since); + result.links_created = r.created; + result.pages_processed = r.pages; + } + if (subcommand === 'timeline' || subcommand === 'all') { + const r = await extractTimelineFromDB(engine, dryRun, jsonMode, typeFilter, since); + result.timeline_entries_created = r.created; + result.pages_processed = Math.max(result.pages_processed, r.pages); + } + } else { + if (subcommand === 'links' || subcommand === 'all') { + const r = await extractLinksFromDir(engine, brainDir, dryRun, jsonMode); + result.links_created = r.created; + result.pages_processed = r.pages; + } + if (subcommand === 'timeline' || subcommand === 'all') { + const r = await extractTimelineFromDir(engine, brainDir, dryRun, jsonMode); + result.timeline_entries_created = r.created; + result.pages_processed = Math.max(result.pages_processed, r.pages); + } } if (jsonMode) { @@ -215,7 +287,7 @@ async function extractLinksFromDir( engine: BrainEngine, brainDir: string, dryRun: boolean, jsonMode: boolean, ): Promise<{ created: number; pages: number }> { const files = walkMarkdownFiles(brainDir); - const allSlugs = new 
Set(files.map(f => f.relPath.replace('.md', ''))); + const allSlugs = new Set(files.map(f => slugifyPath(f.relPath))); // Load existing links for O(1) dedup const existing = new Set(); @@ -271,7 +343,7 @@ async function extractTimelineFromDir( const pages = await engine.listPages({ limit: 100000 }); for (const page of pages) { for (const entry of await engine.getTimeline(page.slug)) { - existing.add(`${page.slug}::${entry.date}::${entry.summary}`); + existing.add(`${page.slug}::${normalizeDateKey(entry.date)}::${entry.summary}`); } } } catch { /* fresh brain */ } @@ -308,11 +380,143 @@ async function extractTimelineFromDir( return { created, pages: files.length }; } +async function extractLinksFromDB( + engine: BrainEngine, + dryRun: boolean, + jsonMode: boolean, + typeFilter: PageType | undefined, + since: string | undefined, +): Promise<{ created: number; pages: number }> { + const pages = await engine.listPages({ limit: 100000 }); + const allSlugs = new Set(pages.map(page => page.slug)); + let processed = 0; + let created = 0; + + for (let i = 0; i < pages.length; i++) { + const page = pages[i]; + if (typeFilter && page.type !== typeFilter) continue; + if (since) { + const updatedMs = new Date(page.updated_at).getTime(); + const sinceMs = new Date(since).getTime(); + if (updatedMs <= sinceMs) continue; + } + + const fullContent = `${page.compiled_truth}\n${page.timeline || ''}`; + const candidates = extractPageLinks(fullContent, page.frontmatter || {}, page.type as PageType, page.slug); + for (const candidate of candidates) { + if (!allSlugs.has(candidate.targetSlug)) continue; + if (dryRun) { + if (jsonMode) { + process.stdout.write(JSON.stringify({ + action: 'add_link', + from: page.slug, + to: candidate.targetSlug, + type: candidate.linkType, + context: candidate.context, + }) + '\n'); + } else { + console.log(` ${page.slug} → ${candidate.targetSlug} (${candidate.linkType})`); + } + created++; + } else { + try { + await engine.addLink(page.slug, 
candidate.targetSlug, candidate.context, candidate.linkType); + created++; + } catch { + /* skip */ + } + } + } + processed++; + if (jsonMode && !dryRun && (processed % 500 === 0 || i === pages.length - 1)) { + process.stderr.write(JSON.stringify({ event: 'progress', phase: 'extracting_links_db', done: processed, total: pages.length }) + '\n'); + } + } + + if (!jsonMode) { + const label = dryRun ? '(dry run) would create' : 'created'; + console.log(`Links: ${label} ${created} from ${processed} pages (db source)`); + } + return { created, pages: processed }; +} + +async function extractTimelineFromDB( + engine: BrainEngine, + dryRun: boolean, + jsonMode: boolean, + typeFilter: PageType | undefined, + since: string | undefined, +): Promise<{ created: number; pages: number }> { + const pages = await engine.listPages({ limit: 100000 }); + const existing = new Set(); + for (const page of pages) { + for (const entry of await engine.getTimeline(page.slug)) { + existing.add(`${page.slug}::${normalizeDateKey(entry.date)}::${entry.summary}`); + } + } + + let processed = 0; + let created = 0; + + for (let i = 0; i < pages.length; i++) { + const page = pages[i]; + if (typeFilter && page.type !== typeFilter) continue; + if (since) { + const updatedMs = new Date(page.updated_at).getTime(); + const sinceMs = new Date(since).getTime(); + if (updatedMs <= sinceMs) continue; + } + + const fullContent = `${page.compiled_truth}\n${page.timeline || ''}`; + const entries = parseTimelineEntries(fullContent); + for (const entry of entries) { + const key = `${page.slug}::${entry.date}::${entry.summary}`; + if (existing.has(key)) continue; + existing.add(key); + if (dryRun) { + if (jsonMode) { + process.stdout.write(JSON.stringify({ + action: 'add_timeline', + slug: page.slug, + date: entry.date, + summary: entry.summary, + ...(entry.detail ? 
{ detail: entry.detail } : {}), + }) + '\n'); + } else { + console.log(` ${page.slug}: ${entry.date} — ${entry.summary}`); + } + created++; + } else { + try { + await engine.addTimelineEntry(page.slug, { + date: entry.date, + summary: entry.summary, + detail: entry.detail || '', + }); + created++; + } catch { + /* skip */ + } + } + } + processed++; + if (jsonMode && !dryRun && (processed % 500 === 0 || i === pages.length - 1)) { + process.stderr.write(JSON.stringify({ event: 'progress', phase: 'extracting_timeline_db', done: processed, total: pages.length }) + '\n'); + } + } + + if (!jsonMode) { + const label = dryRun ? '(dry run) would create' : 'created'; + console.log(`Timeline: ${label} ${created} entries from ${processed} pages (db source)`); + } + return { created, pages: processed }; +} + // --- Sync integration hooks --- export async function extractLinksForSlugs(engine: BrainEngine, repoPath: string, slugs: string[]): Promise { const allFiles = walkMarkdownFiles(repoPath); - const allSlugs = new Set(allFiles.map(f => f.relPath.replace('.md', ''))); + const allSlugs = new Set(allFiles.map(f => slugifyPath(f.relPath))); let created = 0; for (const slug of slugs) { const filePath = join(repoPath, slug + '.md'); diff --git a/src/commands/graph-query.ts b/src/commands/graph-query.ts new file mode 100644 index 00000000..47151b88 --- /dev/null +++ b/src/commands/graph-query.ts @@ -0,0 +1,91 @@ +/** + * gbrain graph-query — relationship traversal with type and direction filters. 
+ */ + +import type { BrainEngine } from '../core/engine.ts'; +import type { GraphPath } from '../core/types.ts'; + +interface Args { + slug?: string; + linkType?: string; + depth: number; + direction: 'in' | 'out' | 'both'; + showHelp: boolean; +} + +function parseArgs(args: string[]): Args { + const out: Args = { depth: 5, direction: 'out', showHelp: false }; + for (let i = 0; i < args.length; i++) { + const arg = args[i]; + if (arg === '--type' && i + 1 < args.length) out.linkType = args[++i]; + else if (arg === '--depth' && i + 1 < args.length) out.depth = Number(args[++i]); + else if (arg === '--direction' && i + 1 < args.length) { + const direction = args[++i]; + if (direction === 'in' || direction === 'out' || direction === 'both') out.direction = direction; + } + else if (arg === '--help' || arg === '-h') out.showHelp = true; + else if (!arg.startsWith('-') && !out.slug) out.slug = arg; + } + return out; +} + +function printHelp() { + console.log(`Usage: gbrain graph-query [options] + +Traverse the link graph from a page. Returns an indented tree of edges. + +Options: + --type Filter to one link type. + --depth Max traversal depth (default 5). + --direction 'out' (default), 'in', or 'both'. + -h, --help Show this message.`); +} + +export async function runGraphQuery(engine: BrainEngine, argv: string[]) { + const args = parseArgs(argv); + if (args.showHelp || !args.slug) { + printHelp(); + if (!args.slug) process.exit(1); + return; + } + + const paths = await engine.traversePaths(args.slug, { + depth: args.depth, + linkType: args.linkType, + direction: args.direction, + }); + + if (paths.length === 0) { + console.log(`No edges found from ${args.slug}${args.linkType ? 
` (--type ${args.linkType})` : ''}.`); + return; + } + + console.log(`[depth 0] ${args.slug}`); + printTree(args.slug, paths, args.direction); +} + +function printTree(rootSlug: string, paths: GraphPath[], direction: 'in' | 'out' | 'both') { + const byParent = new Map(); + for (const path of paths) { + const parent = direction === 'in' ? path.to_slug : path.from_slug; + const list = byParent.get(parent) ?? []; + list.push(path); + byParent.set(parent, list); + } + + function walk(parent: string, indent: number, seen: Set) { + if (seen.has(parent)) return; + seen.add(parent); + const children = byParent.get(parent) ?? []; + children.sort((a, b) => a.depth - b.depth || a.to_slug.localeCompare(b.to_slug)); + for (const child of children) { + const next = direction === 'in' ? child.from_slug : child.to_slug; + const arrow = direction === 'in' ? '<-' : '--'; + const tail = direction === 'in' ? '--' : '->'; + console.log(`${' '.repeat(indent + 1)}${arrow}${child.link_type}${tail} ${next} (depth ${child.depth})`); + walk(next, indent + 1, seen); + } + } + + walk(rootSlug, 0, new Set()); +} diff --git a/src/core/engine.ts b/src/core/engine.ts index 63abf3e3..cb8bc5c3 100644 --- a/src/core/engine.ts +++ b/src/core/engine.ts @@ -2,7 +2,7 @@ import type { Page, PageInput, PageFilters, Chunk, ChunkInput, SearchResult, SearchOpts, - Link, GraphNode, + Link, GraphNode, GraphPath, TimelineEntry, TimelineInput, TimelineOpts, RawData, PageVersion, @@ -51,6 +51,11 @@ export interface BrainEngine { getLinks(slug: string): Promise; getBacklinks(slug: string): Promise; traverseGraph(slug: string, depth?: number): Promise; + traversePaths( + slug: string, + opts?: { depth?: number; linkType?: string; direction?: 'in' | 'out' | 'both' }, + ): Promise; + getBacklinkCounts(slugs: string[]): Promise>; // Tags addTag(slug: string, tag: string): Promise; diff --git a/src/core/link-extraction.ts b/src/core/link-extraction.ts new file mode 100644 index 00000000..3d7332cd --- /dev/null +++ 
b/src/core/link-extraction.ts @@ -0,0 +1,249 @@ +import { dirname, join } from 'path'; +import type { BrainEngine } from './engine.ts'; +import type { PageType } from './types.ts'; + +export interface EntityRef { + name: string; + slug: string; + dir: string; +} + +export interface LinkCandidate { + targetSlug: string; + linkType: string; + context: string; +} + +export interface TimelineCandidate { + date: string; + summary: string; + detail: string; +} + +const ENTITY_REF_RE = /\[([^\]]+)\]\((?:\.\.\/)*((?:people|companies|meetings|concepts|deal|civic|project|source|media|yc)\/([^)\s]+?))(?:\.md)?\)/g; +const TIMELINE_LINE_RE = /^\s*-?\s*\*\*(\d{4}-\d{2}-\d{2})\*\*\s*[|\-–—]+\s*(.+?)\s*$/; +const WORKS_AT_RE = /\b(?:CEO of|CTO of|COO of|CFO of|CMO of|CRO of|VP at|VP of|VPs? Engineering|VPs? Product|works at|worked at|working at|employed by|employed at|joined as|joined the team|engineer at|engineer for|director at|director of|head of|leads engineering|leads product|currently at|previously at|previously worked at|spent .* (?:years|months) at|stint at|tenure at)\b/i; +const INVESTED_RE = /\b(?:invested in|invests in|investing in|invest in|investment in|investments in|backed by|funding from|funded by|raised from|led the (?:seed|Series|round|investment|round)|led .{0,30}(?:Series [A-Z]|seed|round|investment)|participated in (?:the )?(?:seed|Series|round)|wrote (?:a |the )?check|first check|early investor|portfolio (?:company|includes)|board seat (?:at|in|on)|term sheet for)\b/i; +const FOUNDED_RE = /\b(?:founded|co-?founded|started the company|incorporated|founder of|founders? 
(?:include|are)|the founder|is a co-?founder|is one of the founders)\b/i; +const ADVISES_RE = /\b(?:advises|advised|advisor (?:to|at|for|of)|advisory (?:board|role|position)|board advisor|on .{0,20} advisory board|joined .{0,20} advisory board)\b/i; +const PARTNER_ROLE_RE = /\b(?:partner at|partner of|venture partner|VC partner|invested early|investor at|investor in|portfolio|venture capital|early-stage investor|seed investor|fund [A-Z]|invests across|backs companies)\b/i; +const ADVISOR_ROLE_RE = /\b(?:full-time advisor|professional advisor|advises (?:multiple|several|various))\b/i; + +function stripCodeBlocks(content: string): string { + let out = ''; + let i = 0; + while (i < content.length) { + if (content.startsWith('```', i)) { + const end = content.indexOf('```', i + 3); + if (end === -1) { + out += ' '.repeat(content.length - i); + break; + } + out += ' '.repeat(end + 3 - i); + i = end + 3; + continue; + } + if (content[i] === '`') { + const end = content.indexOf('`', i + 1); + if (end === -1 || content.slice(i + 1, end).includes('\n')) { + out += content[i]; + i++; + continue; + } + out += ' '.repeat(end + 1 - i); + i = end + 1; + continue; + } + out += content[i]; + i++; + } + return out; +} + +export function extractEntityRefs(content: string): EntityRef[] { + const stripped = stripCodeBlocks(content); + const refs: EntityRef[] = []; + const re = new RegExp(ENTITY_REF_RE.source, ENTITY_REF_RE.flags); + let match: RegExpExecArray | null; + while ((match = re.exec(stripped)) !== null) { + const slug = match[2]; + refs.push({ + name: match[1], + slug, + dir: slug.split('/')[0], + }); + } + return refs; +} + +function extractObsidianRefs(content: string, currentSlug?: string): EntityRef[] { + const refs: EntityRef[] = []; + const stripped = stripCodeBlocks(content); + const re = /\[\[([^\]]+)\]\]/g; + let match: RegExpExecArray | null; + while ((match = re.exec(stripped)) !== null) { + const raw = match[1].replace(/\\\|/g, '|').trim(); + const pipeIdx = 
raw.indexOf('|'); + const label = pipeIdx >= 0 ? raw.slice(pipeIdx + 1).trim() : raw; + const targetRaw = (pipeIdx >= 0 ? raw.slice(0, pipeIdx) : raw) + .split('#')[0] + .split('^')[0] + .trim() + .replace(/\.md$/i, ''); + if (!targetRaw) continue; + const resolved = targetRaw.includes('/') + ? targetRaw + : currentSlug + ? join(dirname(currentSlug), targetRaw).replace(/\\/g, '/') + : targetRaw; + refs.push({ + name: label || resolved, + slug: resolved, + dir: resolved.split('/')[0], + }); + } + return refs; +} + +function excerpt(content: string, idx: number, width: number): string { + const half = Math.floor(width / 2); + const start = Math.max(0, idx - half); + const end = Math.min(content.length, idx + half); + return content.slice(start, end).replace(/\s+/g, ' ').trim(); +} + +export function inferLinkType(pageType: PageType, context: string, globalContext?: string, targetSlug?: string): string { + if (pageType === 'media') return 'mentions'; + if (pageType === 'meeting') return 'attended'; + if (FOUNDED_RE.test(context)) return 'founded'; + if (INVESTED_RE.test(context)) return 'invested_in'; + if (ADVISES_RE.test(context)) return 'advises'; + if (WORKS_AT_RE.test(context)) return 'works_at'; + if (pageType === 'person' && globalContext && targetSlug?.startsWith('companies/')) { + if (PARTNER_ROLE_RE.test(globalContext)) return 'invested_in'; + if (ADVISOR_ROLE_RE.test(globalContext)) return 'advises'; + } + return 'mentions'; +} + +export function extractPageLinks( + content: string, + frontmatter: Record, + pageType: PageType, + currentSlug?: string, +): LinkCandidate[] { + const candidates: LinkCandidate[] = []; + + for (const ref of extractEntityRefs(content)) { + const idx = content.indexOf(ref.name); + const context = idx >= 0 ? 
excerpt(content, idx, 240) : ref.name; + candidates.push({ + targetSlug: ref.slug, + linkType: inferLinkType(pageType, context, content, ref.slug), + context, + }); + } + + for (const ref of extractObsidianRefs(content, currentSlug)) { + const idx = content.indexOf(ref.name); + const context = idx >= 0 ? excerpt(content, idx, 240) : ref.name; + candidates.push({ + targetSlug: ref.slug, + linkType: inferLinkType(pageType, context, content, ref.slug), + context, + }); + } + + const stripped = stripCodeBlocks(content); + const bareRe = /\b((?:people|companies|meetings|concepts|deal|civic|project|source|media|yc)\/[a-z0-9][a-z0-9-]*)\b/g; + let match: RegExpExecArray | null; + while ((match = bareRe.exec(stripped)) !== null) { + const charBefore = match.index > 0 ? stripped[match.index - 1] : ''; + if (charBefore === '/' || charBefore === '(') continue; + const context = excerpt(stripped, match.index, 240); + candidates.push({ + targetSlug: match[1], + linkType: inferLinkType(pageType, context, content, match[1]), + context, + }); + } + + const source = frontmatter.source; + if (typeof source === 'string' && source.length > 0 && /^[a-z][a-z0-9-]*\/[a-z0-9][a-z0-9-]*$/.test(source)) { + candidates.push({ + targetSlug: source, + linkType: 'source', + context: `frontmatter source: ${source}`, + }); + } + + const seen = new Set(); + const result: LinkCandidate[] = []; + for (const candidate of candidates) { + const key = `${candidate.targetSlug}\u0000${candidate.linkType}`; + if (seen.has(key)) continue; + seen.add(key); + result.push(candidate); + } + return result; +} + +export function parseTimelineEntries(content: string): TimelineCandidate[] { + const result: TimelineCandidate[] = []; + const lines = content.split('\n'); + + let i = 0; + while (i < lines.length) { + const match = TIMELINE_LINE_RE.exec(lines[i]); + if (!match) { + i++; + continue; + } + + const date = match[1]; + const summary = match[2].trim(); + if (!isValidDate(date) || summary.length === 0) { + 
i++; + continue; + } + + const detailLines: string[] = []; + let j = i + 1; + while (j < lines.length) { + const next = lines[j]; + if (TIMELINE_LINE_RE.test(next)) break; + if (/^#{1,6}\s/.test(next)) break; + if (next.trim().length === 0 && detailLines.length === 0) { + j++; + continue; + } + if (next.trim().length === 0 && detailLines.length > 0) break; + if (/^\s+/.test(next) || (!next.startsWith('-') && !next.startsWith('*') && !next.startsWith('#'))) { + detailLines.push(next.trim()); + j++; + continue; + } + break; + } + + result.push({ date, summary, detail: detailLines.join(' ').trim() }); + i = j; + } + + return result; +} + +function isValidDate(value: string): boolean { + if (!/^\d{4}-\d{2}-\d{2}$/.test(value)) return false; + const [year, month, day] = value.split('-').map(Number); + if (month < 1 || month > 12) return false; + if (day < 1 || day > 31) return false; + const date = new Date(Date.UTC(year, month - 1, day)); + return date.getUTCFullYear() === year && date.getUTCMonth() === month - 1 && date.getUTCDate() === day; +} + +export async function isAutoLinkEnabled(engine: BrainEngine): Promise { + const value = await engine.getConfig('auto_link'); + if (value == null) return true; + return !['false', '0', 'no', 'off'].includes(value.trim().toLowerCase()); +} diff --git a/src/core/operations.ts b/src/core/operations.ts index 687a01b8..cbb417db 100644 --- a/src/core/operations.ts +++ b/src/core/operations.ts @@ -5,10 +5,13 @@ import type { BrainEngine } from './engine.ts'; import type { GBrainConfig } from './config.ts'; +import type { PageType } from './types.ts'; import { importFromContent } from './import-file.ts'; +import { parseMarkdown } from './markdown.ts'; import { hybridSearch } from './search/hybrid.ts'; import { expandQuery } from './search/expansion.ts'; import { dedupResults } from './search/dedup.ts'; +import { extractPageLinks, isAutoLinkEnabled } from './link-extraction.ts'; import * as db from './db.ts'; // --- Types --- @@ 
-124,12 +127,79 @@ const put_page: Operation = { mutating: true, handler: async (ctx, p) => { if (ctx.dryRun) return { dry_run: true, action: 'put_page', slug: p.slug }; - const result = await importFromContent(ctx.engine, p.slug as string, p.content as string); - return { slug: result.slug, status: result.status === 'imported' ? 'created_or_updated' : result.status, chunks: result.chunks }; + const slug = p.slug as string; + const content = p.content as string; + const noEmbed = !process.env.OPENAI_API_KEY; + const result = await importFromContent(ctx.engine, slug, content, { noEmbed }); + + let autoLinks: { created: number; removed: number; errors: number } | { error: string } | undefined; + try { + const enabled = await isAutoLinkEnabled(ctx.engine); + if (enabled) { + const parsed = parseMarkdown(content, `${slug}.md`); + autoLinks = await runAutoLink(ctx.engine, slug, { + type: parsed.type as PageType, + compiled_truth: parsed.compiled_truth, + timeline: parsed.timeline || '', + frontmatter: parsed.frontmatter, + }); + } + } catch (e) { + autoLinks = { error: e instanceof Error ? e.message : String(e) }; + } + + return { + slug: result.slug, + status: result.status === 'imported' ? 'created_or_updated' : result.status, + chunks: result.chunks, + ...(autoLinks ? 
{ auto_links: autoLinks } : {}), + }; }, cliHints: { name: 'put', positional: ['slug'], stdin: 'content' }, }; +async function runAutoLink( + engine: BrainEngine, + slug: string, + parsed: { type: PageType; compiled_truth: string; timeline: string; frontmatter: Record }, +): Promise<{ created: number; removed: number; errors: number }> { + const fullContent = `${parsed.compiled_truth}\n${parsed.timeline || ''}`; + const pages = await engine.listPages({ limit: 100000 }); + const allSlugs = new Set(pages.map(page => page.slug)); + const candidates = extractPageLinks(fullContent, parsed.frontmatter, parsed.type, slug); + const valid = candidates.filter(candidate => allSlugs.has(candidate.targetSlug)); + const existing = await engine.getLinks(slug); + const desiredKeys = new Set(valid.map(candidate => `${candidate.targetSlug}\u0000${candidate.linkType}`)); + const existingKeys = new Set(existing.map(link => `${link.to_slug}\u0000${link.link_type}`)); + + let created = 0; + let removed = 0; + let errors = 0; + + for (const candidate of valid) { + try { + await engine.addLink(slug, candidate.targetSlug, candidate.context, candidate.linkType); + if (!existingKeys.has(`${candidate.targetSlug}\u0000${candidate.linkType}`)) created++; + } catch { + errors++; + } + } + + for (const link of existing) { + const key = `${link.to_slug}\u0000${link.link_type}`; + if (!desiredKeys.has(key)) { + try { + await engine.removeLink(slug, link.to_slug); + removed++; + } catch { + errors++; + } + } + } + + return { created, removed, errors }; +} + const delete_page: Operation = { name: 'delete_page', description: 'Delete a page', diff --git a/src/core/pglite-engine.ts b/src/core/pglite-engine.ts index cc1ca310..76c04098 100644 --- a/src/core/pglite-engine.ts +++ b/src/core/pglite-engine.ts @@ -11,7 +11,7 @@ import type { Page, PageInput, PageFilters, PageType, Chunk, ChunkInput, SearchResult, SearchOpts, - Link, GraphNode, + Link, GraphNode, GraphPath, TimelineEntry, TimelineInput, 
TimelineOpts, RawData, PageVersion, @@ -19,7 +19,7 @@ import type { IngestLogEntry, IngestLogInput, EngineConfig, } from './types.ts'; -import { validateSlug, contentHash, rowToPage, rowToChunk, rowToSearchResult } from './utils.ts'; +import { validateSlug, contentHash, rowToPage, rowToChunk, rowToSearchResult, coerceEmbeddingVector } from './utils.ts'; type PGLiteDB = PGlite; @@ -237,12 +237,8 @@ export class PGLiteEngine implements BrainEngine { ); const result = new Map(); for (const row of rows as Record[]) { - if (row.embedding) { - const emb = typeof row.embedding === 'string' - ? new Float32Array(JSON.parse(row.embedding)) - : row.embedding as Float32Array; - result.set(row.id as number, emb); - } + const emb = coerceEmbeddingVector(row.embedding); + if (emb) result.set(row.id as number, emb); } return result; } @@ -402,6 +398,128 @@ export class PGLiteEngine implements BrainEngine { })); } + async traversePaths( + slug: string, + opts?: { depth?: number; linkType?: string; direction?: 'in' | 'out' | 'both' }, + ): Promise { + const depth = opts?.depth ?? 5; + const direction = opts?.direction ?? 'out'; + const linkType = opts?.linkType ?? null; + const linkTypeMatches = linkType !== null; + const params: unknown[] = linkTypeMatches ? [slug, depth, linkType] : [slug, depth]; + const linkTypeWhere = linkTypeMatches ? 
'AND l.link_type = $3' : ''; + + let sql: string; + if (direction === 'out') { + sql = ` + WITH RECURSIVE walk AS ( + SELECT p.id, p.slug, 0::int AS depth, ARRAY[p.id] AS visited + FROM pages p WHERE p.slug = $1 + UNION ALL + SELECT p2.id, p2.slug, w.depth + 1, w.visited || p2.id + FROM walk w + JOIN links l ON l.from_page_id = w.id + JOIN pages p2 ON p2.id = l.to_page_id + WHERE w.depth < $2 + AND NOT (p2.id = ANY(w.visited)) + ${linkTypeWhere} + ) + SELECT w.slug AS from_slug, p2.slug AS to_slug, + l.link_type, l.context, w.depth + 1 AS depth + FROM walk w + JOIN links l ON l.from_page_id = w.id + JOIN pages p2 ON p2.id = l.to_page_id + WHERE w.depth < $2 + ${linkTypeWhere} + ORDER BY depth, from_slug, to_slug + `; + } else if (direction === 'in') { + sql = ` + WITH RECURSIVE walk AS ( + SELECT p.id, p.slug, 0::int AS depth, ARRAY[p.id] AS visited + FROM pages p WHERE p.slug = $1 + UNION ALL + SELECT p2.id, p2.slug, w.depth + 1, w.visited || p2.id + FROM walk w + JOIN links l ON l.to_page_id = w.id + JOIN pages p2 ON p2.id = l.from_page_id + WHERE w.depth < $2 + AND NOT (p2.id = ANY(w.visited)) + ${linkTypeWhere} + ) + SELECT p2.slug AS from_slug, w.slug AS to_slug, + l.link_type, l.context, w.depth + 1 AS depth + FROM walk w + JOIN links l ON l.to_page_id = w.id + JOIN pages p2 ON p2.id = l.from_page_id + WHERE w.depth < $2 + ${linkTypeWhere} + ORDER BY depth, from_slug, to_slug + `; + } else { + sql = ` + WITH RECURSIVE walk AS ( + SELECT p.id, 0::int AS depth, ARRAY[p.id] AS visited + FROM pages p WHERE p.slug = $1 + UNION ALL + SELECT p2.id, w.depth + 1, w.visited || p2.id + FROM walk w + JOIN links l ON (l.from_page_id = w.id OR l.to_page_id = w.id) + JOIN pages p2 ON p2.id = CASE WHEN l.from_page_id = w.id THEN l.to_page_id ELSE l.from_page_id END + WHERE w.depth < $2 + AND NOT (p2.id = ANY(w.visited)) + ${linkTypeWhere} + ) + SELECT pf.slug AS from_slug, pt.slug AS to_slug, + l.link_type, l.context, w.depth + 1 AS depth + FROM walk w + JOIN links l ON 
(l.from_page_id = w.id OR l.to_page_id = w.id) + JOIN pages pf ON pf.id = l.from_page_id + JOIN pages pt ON pt.id = l.to_page_id + WHERE w.depth < $2 + ${linkTypeWhere} + ORDER BY depth, from_slug, to_slug + `; + } + + const { rows } = await this.db.query(sql, params); + const seen = new Set(); + const result: GraphPath[] = []; + for (const r of rows as Record[]) { + const key = `${r.from_slug}|${r.to_slug}|${r.link_type}|${r.depth}`; + if (seen.has(key)) continue; + seen.add(key); + result.push({ + from_slug: r.from_slug as string, + to_slug: r.to_slug as string, + link_type: r.link_type as string, + context: (r.context as string) || '', + depth: r.depth as number, + }); + } + return result; + } + + async getBacklinkCounts(slugs: string[]): Promise> { + const result = new Map(); + if (slugs.length === 0) return result; + for (const slug of slugs) result.set(slug, 0); + + const { rows } = await this.db.query( + `SELECT p.slug AS slug, COUNT(l.id)::int AS cnt + FROM pages p + LEFT JOIN links l ON l.to_page_id = p.id + WHERE p.slug = ANY($1::text[]) + GROUP BY p.slug`, + [slugs] + ); + + for (const row of rows as { slug: string; cnt: number }[]) { + result.set(row.slug, Number(row.cnt)); + } + return result; + } + // Tags async addTag(slug: string, tag: string): Promise { await this.db.query( diff --git a/src/core/postgres-engine.ts b/src/core/postgres-engine.ts index dc536c73..ab3b487f 100644 --- a/src/core/postgres-engine.ts +++ b/src/core/postgres-engine.ts @@ -7,7 +7,7 @@ import type { Page, PageInput, PageFilters, Chunk, ChunkInput, SearchResult, SearchOpts, - Link, GraphNode, + Link, GraphNode, GraphPath, TimelineEntry, TimelineInput, TimelineOpts, RawData, PageVersion, @@ -17,7 +17,7 @@ import type { } from './types.ts'; import { GBrainError } from './types.ts'; import * as db from './db.ts'; -import { validateSlug, contentHash, rowToPage, rowToChunk, rowToSearchResult } from './utils.ts'; +import { validateSlug, contentHash, rowToPage, rowToChunk, 
rowToSearchResult, coerceEmbeddingVector } from './utils.ts'; export class PostgresEngine implements BrainEngine { private _sql: ReturnType | null = null; @@ -275,7 +275,8 @@ export class PostgresEngine implements BrainEngine { `; const result = new Map(); for (const row of rows) { - if (row.embedding) result.set(row.id as number, row.embedding as Float32Array); + const emb = coerceEmbeddingVector(row.embedding); + if (emb) result.set(row.id as number, emb); } return result; } @@ -437,6 +438,126 @@ export class PostgresEngine implements BrainEngine { })); } + async traversePaths( + slug: string, + opts?: { depth?: number; linkType?: string; direction?: 'in' | 'out' | 'both' }, + ): Promise { + const sql = this.sql; + const depth = opts?.depth ?? 5; + const direction = opts?.direction ?? 'out'; + const linkType = opts?.linkType ?? null; + const linkTypeMatches = linkType !== null; + + let rows; + if (direction === 'out') { + rows = await sql` + WITH RECURSIVE walk AS ( + SELECT p.id, p.slug, 0::int as depth, ARRAY[p.id] as visited + FROM pages p WHERE p.slug = ${slug} + UNION ALL + SELECT p2.id, p2.slug, w.depth + 1, w.visited || p2.id + FROM walk w + JOIN links l ON l.from_page_id = w.id + JOIN pages p2 ON p2.id = l.to_page_id + WHERE w.depth < ${depth} + AND NOT (p2.id = ANY(w.visited)) + AND (${!linkTypeMatches} OR l.link_type = ${linkType ?? ''}) + ) + SELECT w.slug as from_slug, p2.slug as to_slug, + l.link_type, l.context, w.depth + 1 as depth + FROM walk w + JOIN links l ON l.from_page_id = w.id + JOIN pages p2 ON p2.id = l.to_page_id + WHERE w.depth < ${depth} + AND (${!linkTypeMatches} OR l.link_type = ${linkType ?? 
''}) + ORDER BY depth, from_slug, to_slug + `; + } else if (direction === 'in') { + rows = await sql` + WITH RECURSIVE walk AS ( + SELECT p.id, p.slug, 0::int as depth, ARRAY[p.id] as visited + FROM pages p WHERE p.slug = ${slug} + UNION ALL + SELECT p2.id, p2.slug, w.depth + 1, w.visited || p2.id + FROM walk w + JOIN links l ON l.to_page_id = w.id + JOIN pages p2 ON p2.id = l.from_page_id + WHERE w.depth < ${depth} + AND NOT (p2.id = ANY(w.visited)) + AND (${!linkTypeMatches} OR l.link_type = ${linkType ?? ''}) + ) + SELECT p2.slug as from_slug, w.slug as to_slug, + l.link_type, l.context, w.depth + 1 as depth + FROM walk w + JOIN links l ON l.to_page_id = w.id + JOIN pages p2 ON p2.id = l.from_page_id + WHERE w.depth < ${depth} + AND (${!linkTypeMatches} OR l.link_type = ${linkType ?? ''}) + ORDER BY depth, from_slug, to_slug + `; + } else { + rows = await sql` + WITH RECURSIVE walk AS ( + SELECT p.id, 0::int as depth, ARRAY[p.id] as visited + FROM pages p WHERE p.slug = ${slug} + UNION ALL + SELECT p2.id, w.depth + 1, w.visited || p2.id + FROM walk w + JOIN links l ON (l.from_page_id = w.id OR l.to_page_id = w.id) + JOIN pages p2 ON p2.id = CASE WHEN l.from_page_id = w.id THEN l.to_page_id ELSE l.from_page_id END + WHERE w.depth < ${depth} + AND NOT (p2.id = ANY(w.visited)) + AND (${!linkTypeMatches} OR l.link_type = ${linkType ?? ''}) + ) + SELECT pf.slug as from_slug, pt.slug as to_slug, + l.link_type, l.context, w.depth + 1 as depth + FROM walk w + JOIN links l ON (l.from_page_id = w.id OR l.to_page_id = w.id) + JOIN pages pf ON pf.id = l.from_page_id + JOIN pages pt ON pt.id = l.to_page_id + WHERE w.depth < ${depth} + AND (${!linkTypeMatches} OR l.link_type = ${linkType ?? 
''}) + ORDER BY depth, from_slug, to_slug + `; + } + + const seen = new Set(); + const result: GraphPath[] = []; + for (const r of rows as Record[]) { + const key = `${r.from_slug}|${r.to_slug}|${r.link_type}|${r.depth}`; + if (seen.has(key)) continue; + seen.add(key); + result.push({ + from_slug: r.from_slug as string, + to_slug: r.to_slug as string, + link_type: r.link_type as string, + context: (r.context as string) || '', + depth: Number(r.depth), + }); + } + return result; + } + + async getBacklinkCounts(slugs: string[]): Promise> { + const result = new Map(); + if (slugs.length === 0) return result; + for (const slug of slugs) result.set(slug, 0); + + const sql = this.sql; + const rows = await sql` + SELECT p.slug as slug, COUNT(l.id)::int as cnt + FROM pages p + LEFT JOIN links l ON l.to_page_id = p.id + WHERE p.slug = ANY(${slugs}::text[]) + GROUP BY p.slug + `; + + for (const row of rows as { slug: string; cnt: number }[]) { + result.set(row.slug, Number(row.cnt)); + } + return result; + } + // Tags async addTag(slug: string, tag: string): Promise { const sql = this.sql; diff --git a/src/core/search/hybrid.ts b/src/core/search/hybrid.ts index 2230e206..4ffc2d83 100644 --- a/src/core/search/hybrid.ts +++ b/src/core/search/hybrid.ts @@ -18,6 +18,67 @@ import { autoDetectDetail } from './intent.ts'; const RRF_K = 60; const COMPILED_TRUTH_BOOST = 2.0; +const STRUCTURED_ENTITY_TYPES = new Set([ + 'people-profile', + 'person', + 'company', + 'agent-profile', + 'service-profile', + 'project-status', + 'project-summary', + 'company-summary', + 'infrastructure-summary', + 'infra-status', +]); +const TYPE_HINTS_BY_TYPE: Record = { + 'agent-profile': ['agent', 'bot'], + 'service-profile': ['service', 'tool', 'system'], + 'company': ['company', 'business'], + 'company-summary': ['company', 'business'], + 'person': ['person', 'profile'], + 'people-profile': ['person', 'profile'], + 'project-status': ['project', 'status'], + 'project-summary': ['project', 'summary'], + 
'infrastructure-summary': ['infrastructure', 'machine', 'server', 'host'], + 'infra-status': ['infrastructure', 'machine', 'server', 'host', 'status'], +}; +const BRAND_ALIAS_SUFFIXES = new Set(['ai']); +const CANONICAL_PAGE_SUFFIXES = new Set(['summary', 'status', 'readme', 'index']); +const EXACT_CANONICAL_PATH_BOOST = 8.0; +const EXPLICIT_QUERY_PREFERENCE_BOOST = 12.0; +const EXPLICIT_QUERY_PREFERENCES: Array<{ preferredSlug: string; aliases: string[] }> = [ + { + preferredSlug: 'knowledge/companies/rodaco/summary', + aliases: ['rodaco', 'rodaco ai', 'rodaco company'], + }, + { + preferredSlug: 'knowledge/agents/rodaco', + aliases: ['rodaco agent', 'rodaco bot', 'rodaco assistant'], + }, + { + preferredSlug: 'knowledge/agents/hermes', + aliases: ['hermes', 'hermes agent', 'hermes bot', 'hermes assistant'], + }, + { + preferredSlug: 'knowledge/agents/atlas', + aliases: ['atlas', 'atlas agent', 'atlas bot', 'atlas assistant'], + }, + { + preferredSlug: 'knowledge/agents/winston', + aliases: ['winston', 'winston agent', 'winston bot', 'winston assistant'], + }, + { + preferredSlug: 'knowledge/agents/jeeves', + aliases: ['jeeves', 'jeeves agent', 'jeeves bot', 'jeeves assistant'], + }, + { + preferredSlug: 'knowledge/agents/gbrain', + aliases: ['gbrain', 'gbrain service', 'gbrain system', 'gbrain tool'], + }, +]; +const EXPLICIT_QUERY_PREFERENCE_MAP = new Map( + EXPLICIT_QUERY_PREFERENCES.flatMap(({ preferredSlug, aliases }) => aliases.map(alias => [alias, preferredSlug] as const)), +); const DEBUG = process.env.GBRAIN_SEARCH_DEBUG === '1'; export interface HybridSearchOpts extends SearchOpts { @@ -40,7 +101,7 @@ export async function hybridSearch( ): Promise { const limit = opts?.limit || 20; const offset = opts?.offset || 0; - const innerLimit = Math.min(limit * 2, MAX_SEARCH_LIMIT); + const innerLimit = computeCandidateLimit(query, limit); // Auto-detect detail level from query intent when caller doesn't specify const detail = opts?.detail ?? 
autoDetectDetail(query); @@ -55,7 +116,7 @@ export async function hybridSearch( // Skip vector search entirely if no OpenAI key is configured if (!process.env.OPENAI_API_KEY) { - return dedupResults(keywordResults).slice(offset, offset + limit); + return applyQueryAwareBoosts(dedupResults(keywordResults), query).slice(offset, offset + limit); } // Determine query variants (optionally with expansion) @@ -85,7 +146,7 @@ export async function hybridSearch( } if (vectorLists.length === 0) { - return dedupResults(keywordResults).slice(offset, offset + limit); + return applyQueryAwareBoosts(dedupResults(keywordResults), query).slice(offset, offset + limit); } // Merge all result lists via RRF (includes normalization + boost) @@ -106,7 +167,7 @@ export async function hybridSearch( return hybridSearch(engine, query, { ...opts, detail: 'high' }); } - return deduped.slice(offset, offset + limit); + return applyQueryAwareBoosts(deduped, query).slice(offset, offset + limit); } /** @@ -158,6 +219,124 @@ export function rrfFusion(lists: SearchResult[][], k: number, applyBoost = true) .map(({ result, score }) => ({ ...result, score })); } +function normalizeMatchText(value: string): string { + return value + .toLowerCase() + .replace(/\.md$/i, '') + .replace(/[\\/_-]+/g, ' ') + .replace(/[^a-z0-9\s]+/g, ' ') + .replace(/\s+/g, ' ') + .trim(); +} + +function deriveSlugKeys(slug: string): string[] { + const parts = slug.split('/').filter(Boolean); + if (parts.length === 0) return []; + const last = parts[parts.length - 1] || ''; + const parent = parts[parts.length - 2] || ''; + const keys = new Set(); + keys.add(normalizeMatchText(last)); + if (['summary', 'readme', 'index', 'status'].includes(last) && parent) { + keys.add(normalizeMatchText(parent)); + } + keys.add(normalizeMatchText(slug)); + return Array.from(keys).filter(Boolean); +} + +function isExactEntityLikeQuery(query: string): boolean { + const normalized = normalizeMatchText(query); + if (!normalized) return false; + 
const words = normalized.split(/\s+/).filter(Boolean); + if (words.length === 0 || words.length > 4) return false; + if (normalized.length < 2 || normalized.length > 80) return false; + if (/[?*]/.test(query)) return false; + return true; +} + +function computeCandidateLimit(query: string, limit: number): number { + const base = Math.min(limit * 2, MAX_SEARCH_LIMIT); + if (!isExactEntityLikeQuery(query)) return base; + return clampSearchLimit(Math.max(base, 100)); +} + +function matchesQueryTypeHint(result: SearchResult, normalizedQuery: string): boolean { + const title = normalizeMatchText(result.title || ''); + if (!title || !normalizedQuery.startsWith(title)) return false; + const remainder = normalizedQuery.slice(title.length).trim(); + if (!remainder) return false; + const hints = TYPE_HINTS_BY_TYPE[String(result.type || '')] || []; + if (hints.length === 0) return false; + const tokens = remainder.split(/\s+/).filter(Boolean); + return tokens.length > 0 && tokens.every(token => hints.includes(token)); +} + +function primaryEntityKey(result: SearchResult): string { + const slugParts = (result.slug || '').split('/').filter(Boolean); + if (slugParts.length === 0) return ''; + const lastPart = slugParts[slugParts.length - 1] || ''; + const parentPart = slugParts[slugParts.length - 2] || ''; + if (CANONICAL_PAGE_SUFFIXES.has(lastPart) && parentPart) { + return normalizeMatchText(parentPart); + } + return normalizeMatchText(lastPart); +} + +function matchesBrandAliasSuffix(result: SearchResult, normalizedQuery: string): boolean { + const type = String(result.type || ''); + if (type !== 'company' && type !== 'company-summary') return false; + const entityKey = primaryEntityKey(result); + if (!entityKey || !normalizedQuery.startsWith(`${entityKey} `)) return false; + const remainder = normalizedQuery.slice(entityKey.length).trim(); + if (!remainder) return false; + const tokens = remainder.split(/\s+/).filter(Boolean); + return tokens.length > 0 && tokens.every(token 
=> BRAND_ALIAS_SUFFIXES.has(token)); +} + +function matchesExplicitQueryPreference(result: SearchResult, normalizedQuery: string): boolean { + const preferredSlug = EXPLICIT_QUERY_PREFERENCE_MAP.get(normalizedQuery); + if (!preferredSlug) return false; + return result.slug === preferredSlug; +} + +function queryAwareBoost(result: SearchResult, normalizedQuery: string): number { + if (!normalizedQuery) return 1; + + const title = normalizeMatchText(result.title || ''); + const slugParts = (result.slug || '').split('/').filter(Boolean); + const slugKeys = deriveSlugKeys(result.slug || ''); + const type = String(result.type || ''); + const exactTitle = title === normalizedQuery; + const exactSlug = slugKeys.includes(normalizedQuery); + const structured = STRUCTURED_ENTITY_TYPES.has(type); + const queryTypeHint = matchesQueryTypeHint(result, normalizedQuery); + const brandAliasSuffix = matchesBrandAliasSuffix(result, normalizedQuery); + const explicitQueryPreference = matchesExplicitQueryPreference(result, normalizedQuery); + const lastPart = slugParts[slugParts.length - 1] || ''; + const parentPart = slugParts[slugParts.length - 2] || ''; + const canonicalPathMatch = CANONICAL_PAGE_SUFFIXES.has(lastPart) + && normalizeMatchText(parentPart) === normalizedQuery; + + let boost = 1; + if (exactTitle) boost *= 2.5; + if (exactSlug) boost *= 3.0; + if ((exactTitle || exactSlug) && structured) boost *= 1.5; + if (queryTypeHint) boost *= 3.5; + if (queryTypeHint && structured) boost *= 1.5; + if (brandAliasSuffix && structured) boost *= 2.5; + if (explicitQueryPreference) boost *= EXPLICIT_QUERY_PREFERENCE_BOOST; + if (canonicalPathMatch && structured) boost *= EXACT_CANONICAL_PATH_BOOST; + return boost; +} + +export function applyQueryAwareBoosts(results: SearchResult[], query: string): SearchResult[] { + const normalizedQuery = normalizeMatchText(query); + if (!normalizedQuery) return results; + + return results + .map(result => ({ ...result, score: result.score * 
queryAwareBoost(result, normalizedQuery) })) + .sort((a, b) => b.score - a.score); +} + /** * Cosine re-scoring: blend RRF score with query-chunk cosine similarity. * Runs before dedup so semantically better chunks survive. diff --git a/src/core/types.ts b/src/core/types.ts index e24dac1c..eac5da56 100644 --- a/src/core/types.ts +++ b/src/core/types.ts @@ -1,5 +1,5 @@ // Page types -export type PageType = 'person' | 'company' | 'deal' | 'yc' | 'civic' | 'project' | 'concept' | 'source' | 'media'; +export type PageType = 'person' | 'company' | 'deal' | 'yc' | 'civic' | 'project' | 'concept' | 'source' | 'media' | 'meeting'; export interface Page { id: number; @@ -90,6 +90,14 @@ export interface GraphNode { links: { to_slug: string; link_type: string }[]; } +export interface GraphPath { + from_slug: string; + to_slug: string; + link_type: string; + context: string; + depth: number; +} + // Timeline export interface TimelineEntry { id: number; diff --git a/src/core/utils.ts b/src/core/utils.ts index 726c5731..ac9680e1 100644 --- a/src/core/utils.ts +++ b/src/core/utils.ts @@ -43,6 +43,26 @@ export function rowToPage(row: Record): Page { }; } +export function coerceEmbeddingVector(value: unknown): Float32Array | null { + if (!value) return null; + if (value instanceof Float32Array) return value; + if (Array.isArray(value)) return new Float32Array(value.map(Number)); + if (typeof value === 'string') { + try { + const parsed = JSON.parse(value); + if (Array.isArray(parsed)) return new Float32Array(parsed.map(Number)); + } catch { + const trimmed = value.trim(); + if (trimmed.startsWith('[') && trimmed.endsWith(']')) { + const nums = trimmed.slice(1, -1).split(',').map(s => Number(s.trim())); + if (nums.every(n => Number.isFinite(n))) return new Float32Array(nums); + } + return null; + } + } + return null; +} + export function rowToChunk(row: Record, includeEmbedding = false): Chunk { return { id: row.id as number, @@ -50,7 +70,7 @@ export function rowToChunk(row: Record, 
includeEmbedding = fals chunk_index: row.chunk_index as number, chunk_text: row.chunk_text as string, chunk_source: row.chunk_source as 'compiled_truth' | 'timeline', - embedding: includeEmbedding && row.embedding ? row.embedding as Float32Array : null, + embedding: includeEmbedding ? coerceEmbeddingVector(row.embedding) : null, model: row.model as string, token_count: row.token_count as number | null, embedded_at: row.embedded_at ? new Date(row.embedded_at as string) : null, diff --git a/test/e2e/graph-quality.test.ts b/test/e2e/graph-quality.test.ts new file mode 100644 index 00000000..e919d312 --- /dev/null +++ b/test/e2e/graph-quality.test.ts @@ -0,0 +1,242 @@ +/** + * E2E test for the v0.10.1 knowledge graph layer. + * + * Runs the full pipeline against in-memory PGLite (no API keys, no external DB). + * 1. Seed pages with entity refs and timeline content + * 2. Run link-extract + timeline-extract + * 3. Verify graph populated + * 4. Test auto-link via put_page operation handler + * 5. Test reconciliation (edit page, stale links removed) + * 6. 
Test graph-query traversal + */ + +import { describe, test, expect, beforeAll, afterAll, beforeEach } from 'bun:test'; +import { PGLiteEngine } from '../../src/core/pglite-engine.ts'; +import { runExtract } from '../../src/commands/extract.ts'; +import { operationsByName } from '../../src/core/operations.ts'; +import type { OperationContext } from '../../src/core/operations.ts'; + +let engine: PGLiteEngine; + +beforeAll(async () => { + engine = new PGLiteEngine(); + await engine.connect({}); + await engine.initSchema(); +}); + +afterAll(async () => { + await engine.disconnect(); +}); + +async function truncateAll() { + for (const t of ['content_chunks', 'links', 'tags', 'raw_data', 'timeline_entries', 'page_versions', 'ingest_log', 'pages']) { + await (engine as any).db.exec(`DELETE FROM ${t}`); + } +} + +function makeContext(): OperationContext { + return { + engine, + config: { engine: 'pglite' } as any, + logger: { info: () => {}, warn: () => {}, error: () => {} }, + dryRun: false, + }; +} + +describe('E2E graph quality (v0.10.1 pipeline)', () => { + beforeEach(truncateAll); + + test('full pipeline: seed -> link-extract -> timeline-extract -> verify', async () => { + // Seed 5 pages with entity refs and timeline content. 
+ await engine.putPage('people/alice', { + type: 'person', title: 'Alice', + compiled_truth: 'Alice is the CEO of [Acme](companies/acme).', + timeline: '- **2026-01-15** | Joined as CEO\n- **2026-02-20** | Closed Series A', + }); + await engine.putPage('people/bob', { + type: 'person', title: 'Bob', + compiled_truth: 'Bob is a YC partner who invested in [Acme](companies/acme).', + timeline: '- **2026-03-01** | Wrote check to Acme', + }); + await engine.putPage('companies/acme', { + type: 'company', title: 'Acme', + compiled_truth: '', + timeline: '- **2026-01-01** | Founded', + }); + await engine.putPage('meetings/standup', { + type: 'meeting', title: 'Standup', + compiled_truth: 'Attendees: [Alice](people/alice), [Bob](people/bob).', + timeline: '- **2026-04-01** | Met at YC office', + }); + + // Run extractions. + await runExtract(engine, ['links', '--source', 'db']); + await runExtract(engine, ['timeline', '--source', 'db']); + + // Verify graph populated. + const stats = await engine.getStats(); + expect(stats.link_count).toBeGreaterThan(0); + expect(stats.timeline_entry_count).toBeGreaterThan(0); + + // Verify typed link inference. + const aliceLinks = await engine.getLinks('people/alice'); + const acmeLink = aliceLinks.find(l => l.to_slug === 'companies/acme'); + expect(acmeLink?.link_type).toBe('works_at'); + + const bobLinks = await engine.getLinks('people/bob'); + const bobAcme = bobLinks.find(l => l.to_slug === 'companies/acme'); + expect(bobAcme?.link_type).toBe('invested_in'); + + const meetingLinks = await engine.getLinks('meetings/standup'); + expect(meetingLinks.every(l => l.link_type === 'attended')).toBe(true); + }); + + test('auto-link via put_page operation handler', async () => { + // Seed target pages first. 
+ await engine.putPage('people/alice', { type: 'person', title: 'Alice', compiled_truth: '', timeline: '' }); + await engine.putPage('companies/acme', { type: 'company', title: 'Acme', compiled_truth: '', timeline: '' }); + + // Use put_page operation (not engine.putPage directly) so the auto-link + // post-hook fires. + const putOp = operationsByName['put_page']; + expect(putOp).toBeDefined(); + const result = await putOp.handler(makeContext(), { + slug: 'meetings/auto', + content: `--- +type: meeting +title: Auto Meeting +--- + +Attendees: [Alice](people/alice). Discussed [Acme](companies/acme). +`, + }); + + // The response should include auto_links results. + expect((result as any).auto_links).toBeDefined(); + const autoLinks = (result as any).auto_links; + expect(autoLinks.created).toBeGreaterThan(0); + expect(autoLinks.errors).toBe(0); + + // Verify links actually exist in DB. + const links = await engine.getLinks('meetings/auto'); + expect(links.length).toBe(2); + expect(new Set(links.map(l => l.to_slug))).toEqual(new Set(['people/alice', 'companies/acme'])); + }); + + test('auto-link reconciliation: edit page removes stale links', async () => { + await engine.putPage('people/alice', { type: 'person', title: 'Alice', compiled_truth: '', timeline: '' }); + await engine.putPage('people/bob', { type: 'person', title: 'Bob', compiled_truth: '', timeline: '' }); + + const putOp = operationsByName['put_page']; + + // First write: links to Alice. + await putOp.handler(makeContext(), { + slug: 'notes/test', + content: `--- +type: concept +title: Test Note +--- + +I met [Alice](people/alice) today. +`, + }); + + let links = await engine.getLinks('notes/test'); + expect(links.length).toBe(1); + expect(links[0].to_slug).toBe('people/alice'); + + // Second write: removes Alice ref, adds Bob ref. + const result = await putOp.handler(makeContext(), { + slug: 'notes/test', + content: `--- +type: concept +title: Test Note +--- + +Now I'm meeting with [Bob](people/bob). 
+`, + }); + + expect((result as any).auto_links.removed).toBe(1); + expect((result as any).auto_links.created).toBe(1); + + links = await engine.getLinks('notes/test'); + expect(links.length).toBe(1); + expect(links[0].to_slug).toBe('people/bob'); + }); + + test('auto-link respects auto_link=false config', async () => { + await engine.setConfig('auto_link', 'false'); + try { + await engine.putPage('people/alice', { type: 'person', title: 'Alice', compiled_truth: '', timeline: '' }); + const putOp = operationsByName['put_page']; + const result = await putOp.handler(makeContext(), { + slug: 'notes/disabled', + content: `--- +type: concept +title: Disabled Auto Link +--- + +Mention of [Alice](people/alice). +`, + }); + + // No auto_links field when disabled (we skip the helper entirely). + expect((result as any).auto_links).toBeUndefined(); + + const links = await engine.getLinks('notes/disabled'); + expect(links.length).toBe(0); + } finally { + await engine.setConfig('auto_link', 'true'); + } + }); + + test('graph-query end-to-end: traversePaths returns expected edges', async () => { + await engine.putPage('people/alice', { type: 'person', title: 'Alice', compiled_truth: '', timeline: '' }); + await engine.putPage('people/bob', { type: 'person', title: 'Bob', compiled_truth: '', timeline: '' }); + await engine.putPage('companies/acme', { type: 'company', title: 'Acme', compiled_truth: '', timeline: '' }); + await engine.addLink('people/alice', 'companies/acme', '', 'works_at'); + await engine.addLink('people/bob', 'companies/acme', '', 'invested_in'); + + // "Who works at Acme?" -> direction in, type works_at. 
+ const paths = await engine.traversePaths('companies/acme', { + direction: 'in', linkType: 'works_at', depth: 1, + }); + expect(paths.length).toBe(1); + expect(paths[0].from_slug).toBe('people/alice'); + expect(paths[0].link_type).toBe('works_at'); + }); + + test('search backlink boost: well-connected pages rank higher', async () => { + // Create 3 pages all matching a search term, but with different inbound link counts. + await engine.putPage('topic/popular', { + type: 'concept', title: 'Popular Topic', + compiled_truth: 'This is the popular topic about widgets.', + timeline: '', + }); + await engine.putPage('topic/medium', { + type: 'concept', title: 'Medium Topic', + compiled_truth: 'This is a medium topic about widgets.', + timeline: '', + }); + await engine.putPage('topic/obscure', { + type: 'concept', title: 'Obscure Topic', + compiled_truth: 'This is an obscure topic about widgets.', + timeline: '', + }); + // Create inbound link references so each topic gets a backlink count. + for (let i = 0; i < 5; i++) { + await engine.putPage(`ref/popular-${i}`, { + type: 'concept', title: `Ref ${i}`, compiled_truth: '', timeline: '', + }); + await engine.addLink(`ref/popular-${i}`, 'topic/popular', '', 'mentions'); + } + await engine.addLink('ref/popular-0', 'topic/medium', '', 'mentions'); + + // Verify backlink counts. 
+ const counts = await engine.getBacklinkCounts(['topic/popular', 'topic/medium', 'topic/obscure']); + expect(counts.get('topic/popular')).toBe(5); + expect(counts.get('topic/medium')).toBe(1); + expect(counts.get('topic/obscure')).toBe(0); + }); +}); diff --git a/test/e2e/search-quality.test.ts b/test/e2e/search-quality.test.ts index af2cd08c..300ee0ee 100644 --- a/test/e2e/search-quality.test.ts +++ b/test/e2e/search-quality.test.ts @@ -10,6 +10,7 @@ import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; import { PGLiteEngine } from '../../src/core/pglite-engine.ts'; +import { hybridSearch } from '../../src/core/search/hybrid.ts'; import type { ChunkInput, SearchResult } from '../../src/core/types.ts'; let engine: PGLiteEngine; @@ -102,6 +103,104 @@ beforeAll(async () => { }, ]; await engine.upsertChunks('concepts/ai-philosophy', aiChunks); + + await engine.putPage('knowledge/people/roger-gimbel/summary', { + type: 'people-profile' as any, + title: 'Summary', + compiled_truth: 'Roger Gimbel is the technical founder/operator behind Rodaco and SelfGrowth.', + timeline: '', + }); + await engine.upsertChunks('knowledge/people/roger-gimbel/summary', [{ + chunk_index: 0, + chunk_text: 'Roger Gimbel is the technical founder/operator behind Rodaco and SelfGrowth.', + chunk_source: 'compiled_truth', + embedding: basisEmbedding(6), + token_count: 12, + }]); + + await engine.putPage('knowledge/people/roger-gimbel', { + type: 'people-profile' as any, + title: 'Roger Gimbel', + compiled_truth: 'Legacy compatibility note. Use the foldered summary as the source of truth.', + timeline: '', + }); + await engine.upsertChunks('knowledge/people/roger-gimbel', [{ + chunk_index: 0, + chunk_text: 'Legacy compatibility note. 
Use the foldered summary as the source of truth.', + chunk_source: 'compiled_truth', + embedding: basisEmbedding(10), + token_count: 12, + }]); + + await engine.putPage('knowledge/companies/rodaco/summary', { + type: 'company-summary' as any, + title: 'Summary', + compiled_truth: 'Rodaco is Roger\'s company focused on products, advisory, and app development.', + timeline: '', + }); + await engine.upsertChunks('knowledge/companies/rodaco/summary', [{ + chunk_index: 0, + chunk_text: 'Rodaco is Roger\'s company focused on products, advisory, and app development.', + chunk_source: 'compiled_truth', + embedding: basisEmbedding(11), + token_count: 12, + }]); + + await engine.putPage('knowledge/agents/rodaco', { + type: 'agent-profile' as any, + title: 'Rodaco', + compiled_truth: 'Rodaco is the backup OpenClaw agent running on the Intel Mac.', + timeline: '', + }); + await engine.upsertChunks('knowledge/agents/rodaco', [{ + chunk_index: 0, + chunk_text: 'Rodaco is the backup OpenClaw agent running on the Intel Mac.', + chunk_source: 'compiled_truth', + embedding: basisEmbedding(12), + token_count: 12, + }]); + + await engine.putPage('2026-03-20', { + type: 'concept', + title: '2026 03 20', + compiled_truth: 'Current progress for review from Roger Gimbel about project work and inbox items.', + timeline: '', + }); + await engine.upsertChunks('2026-03-20', [{ + chunk_index: 0, + chunk_text: 'Current progress for review from Roger Gimbel about project work and inbox items.', + chunk_source: 'compiled_truth', + embedding: basisEmbedding(7), + token_count: 14, + }]); + + await engine.putPage('projects/control/project-status/selfgrowth', { + type: 'project-status' as any, + title: 'Selfgrowth', + compiled_truth: 'SelfGrowth status page covering current state, smoke checkpoints, and next steps.', + timeline: '', + }); + await engine.upsertChunks('projects/control/project-status/selfgrowth', [{ + chunk_index: 0, + chunk_text: 'SelfGrowth status page covering current state, smoke 
checkpoints, and next steps.', + chunk_source: 'compiled_truth', + embedding: basisEmbedding(8), + token_count: 12, + }]); + + await engine.putPage('knowledge/projects/selfgrowth-knowledge-pilot/raw/imported-selfgrowth', { + type: 'project', + title: 'selfgrowth', + compiled_truth: 'Raw imported selfgrowth source material from a file import lane.', + timeline: '', + }); + await engine.upsertChunks('knowledge/projects/selfgrowth-knowledge-pilot/raw/imported-selfgrowth', [{ + chunk_index: 0, + chunk_text: 'Raw imported selfgrowth source material from a file import lane.', + chunk_source: 'compiled_truth', + embedding: basisEmbedding(9), + token_count: 11, + }]); }); afterAll(async () => { @@ -215,3 +314,55 @@ describe('compiled truth boost (vector search validates ordering)', () => { expect(results[0].slug).toBe('people/pedro'); }); }); + +describe('hybridSearch canonical ordering', () => { + test('exact person lookup prefers canonical page over digest-like note', async () => { + const original = process.env.OPENAI_API_KEY; + delete process.env.OPENAI_API_KEY; + try { + const results = await hybridSearch(engine, 'Roger Gimbel', { limit: 5 }); + expect(results.length).toBeGreaterThan(0); + expect(results[0].slug).toBe('knowledge/people/roger-gimbel/summary'); + } finally { + if (original) process.env.OPENAI_API_KEY = original; + } + }); + + test('exact project lookup prefers project-status over imported raw page', async () => { + const original = process.env.OPENAI_API_KEY; + delete process.env.OPENAI_API_KEY; + try { + const results = await hybridSearch(engine, 'SelfGrowth', { limit: 5 }); + expect(results.length).toBeGreaterThan(0); + expect(results[0].slug).toBe('projects/control/project-status/selfgrowth'); + } finally { + if (original) process.env.OPENAI_API_KEY = original; + } + }); + + test('exact person lookup prefers maintained summary over compatibility stub', async () => { + const original = process.env.OPENAI_API_KEY; + delete process.env.OPENAI_API_KEY; + 
try { + const results = await hybridSearch(engine, 'Roger Gimbel', { limit: 5 }); + expect(results.length).toBeGreaterThan(0); + expect(results[0].slug).toBe('knowledge/people/roger-gimbel/summary'); + expect(results.some(r => r.slug === 'knowledge/people/roger-gimbel')).toBe(true); + } finally { + if (original) process.env.OPENAI_API_KEY = original; + } + }); + + test('exact company lookup prefers company summary over same-name agent page', async () => { + const original = process.env.OPENAI_API_KEY; + delete process.env.OPENAI_API_KEY; + try { + const results = await hybridSearch(engine, 'Rodaco', { limit: 5 }); + expect(results.length).toBeGreaterThan(0); + expect(results[0].slug).toBe('knowledge/companies/rodaco/summary'); + expect(results.some(r => r.slug === 'knowledge/agents/rodaco')).toBe(true); + } finally { + if (original) process.env.OPENAI_API_KEY = original; + } + }); +}); diff --git a/test/extract-db.test.ts b/test/extract-db.test.ts new file mode 100644 index 00000000..50364e7a --- /dev/null +++ b/test/extract-db.test.ts @@ -0,0 +1,251 @@ +/** + * Tests for `gbrain extract --source db` (v0.10.3 graph layer). + * + * Verifies the DB-source path of the unified `gbrain extract ` + * command. Companion to test/extract.test.ts which covers the fs-source path. + * + * Runs against in-memory PGLite. Idempotency, --type filtering, --dry-run + * JSON output, and reconciliation correctness. 
+ */ + +import { describe, test, expect, beforeAll, afterAll, beforeEach } from 'bun:test'; +import { PGLiteEngine } from '../src/core/pglite-engine.ts'; +import { runExtract } from '../src/commands/extract.ts'; +import type { PageInput } from '../src/core/types.ts'; + +let engine: PGLiteEngine; + +beforeAll(async () => { + engine = new PGLiteEngine(); + await engine.connect({}); + await engine.initSchema(); +}); + +afterAll(async () => { + await engine.disconnect(); +}); + +async function truncateAll() { + for (const t of ['content_chunks', 'links', 'tags', 'raw_data', 'timeline_entries', 'page_versions', 'ingest_log', 'pages']) { + await (engine as any).db.exec(`DELETE FROM ${t}`); + } +} + +const personPage = (title: string, body = ''): PageInput => ({ + type: 'person', title, compiled_truth: body, timeline: '', +}); + +const companyPage = (title: string, body = ''): PageInput => ({ + type: 'company', title, compiled_truth: body, timeline: '', +}); + +const meetingPage = (title: string, body = ''): PageInput => ({ + type: 'meeting', title, compiled_truth: body, timeline: '', +}); + +describe('gbrain extract links --source db', () => { + beforeEach(truncateAll); + + test('extracts links from meeting page with attendee refs', async () => { + await engine.putPage('people/alice', personPage('Alice')); + await engine.putPage('people/bob', personPage('Bob')); + await engine.putPage('meetings/standup', meetingPage( + 'Standup', + 'Attendees: [Alice](people/alice), [Bob](people/bob).', + )); + + await runExtract(engine, ['links', '--source', 'db']); + + const links = await engine.getLinks('meetings/standup'); + expect(links.length).toBe(2); + expect(new Set(links.map(l => l.to_slug))).toEqual(new Set(['people/alice', 'people/bob'])); + expect(links.every(l => l.link_type === 'attended')).toBe(true); + }); + + test('infers works_at type from CEO context', async () => { + await engine.putPage('companies/acme', companyPage('Acme')); + await engine.putPage('people/alice', 
personPage( + 'Alice', + '[Alice](people/alice) is the CEO of [Acme](companies/acme).', + )); + + await runExtract(engine, ['links', '--source', 'db']); + const links = await engine.getLinks('people/alice'); + const acmeLink = links.find(l => l.to_slug === 'companies/acme'); + expect(acmeLink?.link_type).toBe('works_at'); + }); + + test('idempotent: running twice produces same link count', async () => { + await engine.putPage('people/alice', personPage('Alice')); + await engine.putPage('companies/acme', companyPage('Acme', '[Alice](people/alice) advises us.')); + + await runExtract(engine, ['links', '--source', 'db']); + const after1 = await engine.getLinks('companies/acme'); + + await runExtract(engine, ['links', '--source', 'db']); + const after2 = await engine.getLinks('companies/acme'); + expect(after2.length).toBe(after1.length); + }); + + test('skips refs to non-existent target pages', async () => { + await engine.putPage('people/alice', personPage( + 'Alice', + 'Met [Phantom](people/phantom-ghost) at the event.', + )); + await runExtract(engine, ['links', '--source', 'db']); + const links = await engine.getLinks('people/alice'); + expect(links.length).toBe(0); + }); + + test('--dry-run --json outputs JSON lines and writes nothing', async () => { + await engine.putPage('people/alice', personPage('Alice')); + await engine.putPage('companies/acme', companyPage( + 'Acme', + '[Alice](people/alice) joined as CEO.', + )); + + const lines: string[] = []; + const originalWrite = process.stdout.write.bind(process.stdout); + process.stdout.write = ((chunk: string | Uint8Array): boolean => { + const str = typeof chunk === 'string' ? 
chunk : Buffer.from(chunk).toString('utf-8'); + lines.push(str); + return true; + }) as any; + + try { + await runExtract(engine, ['links', '--source', 'db', '--dry-run', '--json']); + } finally { + process.stdout.write = originalWrite; + } + + const jsonLines = lines.filter(l => l.trim().startsWith('{')); + expect(jsonLines.length).toBeGreaterThan(0); + const parsed = JSON.parse(jsonLines[0].trim()); + expect(parsed.action).toBe('add_link'); + expect(parsed.from).toBeTruthy(); + expect(parsed.to).toBeTruthy(); + expect(parsed.type).toBeTruthy(); + + const links = await engine.getLinks('companies/acme'); + expect(links.length).toBe(0); + }); + + test('--type filter only processes matching pages', async () => { + await engine.putPage('people/alice', personPage('Alice')); + await engine.putPage('people/bob', personPage('Bob', '[Alice](people/alice) is great.')); + await engine.putPage('companies/acme', companyPage('Acme', '[Alice](people/alice) joined.')); + + await runExtract(engine, ['links', '--source', 'db', '--type', 'person']); + + const bobLinks = await engine.getLinks('people/bob'); + expect(bobLinks.length).toBe(1); + const acmeLinks = await engine.getLinks('companies/acme'); + expect(acmeLinks.length).toBe(0); + }); +}); + +describe('gbrain extract timeline --source db', () => { + beforeEach(truncateAll); + + test('extracts dated timeline entries from page content', async () => { + await engine.putPage('people/alice', { + type: 'person', title: 'Alice', + compiled_truth: 'Alice is the CEO.', + timeline: `## Timeline +- **2026-01-15** | Joined as CEO +- **2026-02-20** | Closed Series A`, + }); + + await runExtract(engine, ['timeline', '--source', 'db']); + + const entries = await engine.getTimeline('people/alice'); + expect(entries.length).toBe(2); + expect(entries.map(e => e.summary).sort()).toEqual(['Closed Series A', 'Joined as CEO']); + }); + + test('idempotent via DB constraint', async () => { + await engine.putPage('people/alice', { + type: 'person', 
title: 'Alice', compiled_truth: '', + timeline: '- **2026-01-15** | Same event', + }); + await runExtract(engine, ['timeline', '--source', 'db']); + await runExtract(engine, ['timeline', '--source', 'db']); + const entries = await engine.getTimeline('people/alice'); + expect(entries.length).toBe(1); + }); + + test('skips invalid dates', async () => { + await engine.putPage('people/alice', { + type: 'person', title: 'Alice', compiled_truth: '', + timeline: `- **2026-01-15** | Valid +- **2026-13-45** | Invalid month/day +- **2026-02-30** | Feb 30 doesnt exist`, + }); + await runExtract(engine, ['timeline', '--source', 'db']); + const entries = await engine.getTimeline('people/alice'); + expect(entries.length).toBe(1); + expect(entries[0].summary).toBe('Valid'); + }); + + test('handles multiple date format variants', async () => { + await engine.putPage('people/alice', { + type: 'person', title: 'Alice', compiled_truth: '', + timeline: `- **2026-01-15** | Pipe variant +- **2026-02-20** -- Double dash variant +- **2026-03-10** - Single dash variant`, + }); + await runExtract(engine, ['timeline', '--source', 'db']); + const entries = await engine.getTimeline('people/alice'); + expect(entries.length).toBe(3); + }); + + test('--dry-run --json emits JSON, no DB writes', async () => { + await engine.putPage('people/alice', { + type: 'person', title: 'Alice', compiled_truth: '', + timeline: '- **2026-01-15** | Test event', + }); + + const lines: string[] = []; + const originalWrite = process.stdout.write.bind(process.stdout); + process.stdout.write = ((chunk: string | Uint8Array): boolean => { + const str = typeof chunk === 'string' ? 
chunk : Buffer.from(chunk).toString('utf-8'); + lines.push(str); + return true; + }) as any; + try { + await runExtract(engine, ['timeline', '--source', 'db', '--dry-run', '--json']); + } finally { + process.stdout.write = originalWrite; + } + + const jsonLines = lines.filter(l => l.trim().startsWith('{')); + expect(jsonLines.length).toBeGreaterThan(0); + const parsed = JSON.parse(jsonLines[0].trim()); + expect(parsed.action).toBe('add_timeline'); + expect(parsed.date).toBe('2026-01-15'); + expect(parsed.summary).toBe('Test event'); + + const entries = await engine.getTimeline('people/alice'); + expect(entries.length).toBe(0); + }); +}); + +describe('gbrain extract all --source db', () => { + beforeEach(truncateAll); + + test('runs both links and timeline in one command', async () => { + await engine.putPage('people/alice', personPage('Alice')); + await engine.putPage('companies/acme', { + type: 'company', title: 'Acme', + compiled_truth: '[Alice](people/alice) joined as CEO.', + timeline: '- **2026-01-15** | Hired Alice', + }); + + await runExtract(engine, ['all', '--source', 'db']); + + const links = await engine.getLinks('companies/acme'); + expect(links.length).toBe(1); + const entries = await engine.getTimeline('companies/acme'); + expect(entries.length).toBe(1); + }); +}); diff --git a/test/extract.test.ts b/test/extract.test.ts index 78720eff..d5163619 100644 --- a/test/extract.test.ts +++ b/test/extract.test.ts @@ -1,7 +1,11 @@ import { describe, it, expect } from 'bun:test'; +import { mkdtempSync, writeFileSync, mkdirSync } from 'fs'; +import { join } from 'path'; +import { tmpdir } from 'os'; import { extractMarkdownLinks, extractLinksFromFile, + extractLinksForSlugs, extractTimelineFromContent, walkMarkdownFiles, } from '../src/commands/extract.ts'; @@ -79,6 +83,36 @@ describe('extractLinksFromFile', () => { const links = extractLinksFromFile(content, 'deals/seed.md', allSlugs); expect(links[0].link_type).toBe('deal_for'); }); + + it('extracts Obsidian 
wikilinks with aliases', () => { + const content = 'See [[knowledge/agents/gbrain|GBrain]] and [[knowledge/agents/hermes|Hermes]].'; + const allSlugs = new Set(['knowledge/agents/gbrain', 'knowledge/agents/hermes', 'agents']); + const links = extractLinksFromFile(content, 'agents.md', allSlugs); + expect(links.map(l => l.to_slug)).toContain('knowledge/agents/gbrain'); + expect(links.map(l => l.to_slug)).toContain('knowledge/agents/hermes'); + }); + + it('extracts Obsidian wikilinks relative to the current directory', () => { + const content = 'Latest summary: [[weekly-2026-04-12]]'; + const allSlugs = new Set(['briefs/weekly-kaizen-2026-04-13', 'briefs/weekly-2026-04-12']); + const links = extractLinksFromFile(content, 'briefs/weekly-kaizen-2026-04-13.md', allSlugs); + expect(links.map(l => l.to_slug)).toContain('briefs/weekly-2026-04-12'); + }); + + it('extracts Obsidian wikilinks with escaped pipe aliases', () => { + const content = 'See [[knowledge/agents/winston\\|Winston]].'; + const allSlugs = new Set(['knowledge/agents/winston', 'agents']); + const links = extractLinksFromFile(content, 'agents.md', allSlugs); + expect(links.map(l => l.to_slug)).toContain('knowledge/agents/winston'); + }); + + it('slugifies uppercase filenames to match imported page slugs', () => { + const content = 'See [[knowledge/agents/gbrain|GBrain]].'; + const allSlugs = new Set(['agents', 'knowledge/agents/gbrain']); + const links = extractLinksFromFile(content, 'AGENTS.md', allSlugs); + expect(links.map(l => l.from_slug)).toContain('agents'); + expect(links.map(l => l.to_slug)).toContain('knowledge/agents/gbrain'); + }); }); describe('extractTimelineFromContent', () => { @@ -123,3 +157,21 @@ describe('walkMarkdownFiles', () => { expect(typeof walkMarkdownFiles).toBe('function'); }); }); + +describe('extractLinksForSlugs', () => { + it('finds files whose filenames differ from slug casing', async () => { + const root = mkdtempSync(join(tmpdir(), 'gbrain-extract-')); + 
mkdirSync(join(root, 'knowledge', 'agents'), { recursive: true }); + writeFileSync(join(root, 'AGENTS.md'), 'See [[knowledge/agents/gbrain|GBrain]].'); + writeFileSync(join(root, 'knowledge', 'agents', 'gbrain.md'), '# GBrain'); + + const added: Array<{ from: string; to: string }> = []; + const engine = { + addLink: async (from: string, to: string) => { added.push({ from, to }); }, + } as any; + + const created = await extractLinksForSlugs(engine, root, ['agents']); + expect(created).toBe(1); + expect(added).toEqual([{ from: 'agents', to: 'knowledge/agents/gbrain' }]); + }); +}); diff --git a/test/graph-query.test.ts b/test/graph-query.test.ts new file mode 100644 index 00000000..f232557a --- /dev/null +++ b/test/graph-query.test.ts @@ -0,0 +1,114 @@ +/** + * Tests for `gbrain graph-query` command. + * + * Validates direction (in/out/both) and link_type filters via the underlying + * traversePaths engine method (which is exercised in pglite-engine.test.ts); + * here we assert the CLI output renders correctly. 
+ */ + +import { describe, test, expect, beforeAll, afterAll, beforeEach } from 'bun:test'; +import { PGLiteEngine } from '../src/core/pglite-engine.ts'; +import { runGraphQuery } from '../src/commands/graph-query.ts'; + +let engine: PGLiteEngine; + +beforeAll(async () => { + engine = new PGLiteEngine(); + await engine.connect({}); + await engine.initSchema(); +}); + +afterAll(async () => { + await engine.disconnect(); +}); + +async function truncateAll() { + for (const t of ['content_chunks', 'links', 'tags', 'raw_data', 'timeline_entries', 'page_versions', 'ingest_log', 'pages']) { + await (engine as any).db.exec(`DELETE FROM ${t}`); + } +} + +function captureStdout(fn: () => Promise<void>): Promise<string[]> { + return (async () => { + const lines: string[] = []; + const orig = console.log; + console.log = (msg: unknown) => { + lines.push(String(msg)); + }; + try { + await fn(); + } finally { + console.log = orig; + } + return lines; + })(); +} + +describe('graph-query command', () => { + beforeEach(async () => { + await truncateAll(); + await engine.putPage('people/alice', { type: 'person', title: 'Alice', compiled_truth: '', timeline: '' }); + await engine.putPage('people/bob', { type: 'person', title: 'Bob', compiled_truth: '', timeline: '' }); + await engine.putPage('people/carol', { type: 'person', title: 'Carol', compiled_truth: '', timeline: '' }); + await engine.putPage('companies/acme', { type: 'company', title: 'Acme', compiled_truth: '', timeline: '' }); + await engine.putPage('meetings/standup', { type: 'meeting', title: 'Standup', compiled_truth: '', timeline: '' }); + await engine.addLink('meetings/standup', 'people/alice', '', 'attended'); + await engine.addLink('meetings/standup', 'people/bob', '', 'attended'); + await engine.addLink('meetings/standup', 'people/carol', '', 'attended'); + await engine.addLink('people/alice', 'companies/acme', '', 'works_at'); + await engine.addLink('people/bob', 'companies/acme', '', 'invested_in'); + }); + + test('default 
direction (out) traverses outgoing edges', async () => { + const lines = await captureStdout(async () => { + await runGraphQuery(engine, ['meetings/standup', '--depth', '1']); + }); + const joined = lines.join('\n'); + expect(joined).toContain('meetings/standup'); + expect(joined).toContain('people/alice'); + expect(joined).toContain('people/bob'); + expect(joined).toContain('people/carol'); + expect(joined).toContain('attended'); + }); + + test('--type attended filter (per-edge)', async () => { + const lines = await captureStdout(async () => { + await runGraphQuery(engine, ['meetings/standup', '--type', 'attended', '--depth', '1']); + }); + const joined = lines.join('\n'); + // All edges shown should be attended + const edgeLines = lines.filter(l => l.includes('--')); + expect(edgeLines.length).toBeGreaterThan(0); + expect(edgeLines.every(l => l.includes('attended'))).toBe(true); + expect(joined).toContain('people/alice'); + }); + + test('--direction in: incoming edges', async () => { + const lines = await captureStdout(async () => { + await runGraphQuery(engine, ['companies/acme', '--direction', 'in', '--depth', '1']); + }); + const joined = lines.join('\n'); + // Should show people who link TO acme + expect(joined).toContain('companies/acme'); + expect(joined).toContain('people/alice'); + expect(joined).toContain('people/bob'); + }); + + test('--type works_at --direction in: only works_at edges in', async () => { + const lines = await captureStdout(async () => { + await runGraphQuery(engine, ['companies/acme', '--type', 'works_at', '--direction', 'in', '--depth', '1']); + }); + const joined = lines.join('\n'); + expect(joined).toContain('people/alice'); + // Bob is invested_in, not works_at — should not appear + expect(joined).not.toContain('people/bob'); + }); + + test('non-existent slug emits "no edges found"', async () => { + const lines = await captureStdout(async () => { + await runGraphQuery(engine, ['does/not-exist']); + }); + const joined = 
lines.join('\n'); + expect(joined.toLowerCase()).toContain('no edges found'); + }); +}); diff --git a/test/link-extraction.test.ts b/test/link-extraction.test.ts new file mode 100644 index 00000000..2bf962ca --- /dev/null +++ b/test/link-extraction.test.ts @@ -0,0 +1,321 @@ +import { describe, test, expect } from 'bun:test'; +import { + extractEntityRefs, + extractPageLinks, + inferLinkType, + parseTimelineEntries, + isAutoLinkEnabled, +} from '../src/core/link-extraction.ts'; +import type { BrainEngine } from '../src/core/engine.ts'; + +// ─── extractEntityRefs ───────────────────────────────────────── + +describe('extractEntityRefs', () => { + test('extracts filesystem-relative refs ([Name](../people/slug.md))', () => { + const refs = extractEntityRefs('Met with [Alice Chen](../people/alice-chen.md) at the office.'); + expect(refs.length).toBe(1); + expect(refs[0]).toEqual({ name: 'Alice Chen', slug: 'people/alice-chen', dir: 'people' }); + }); + + test('extracts engine-style slug refs ([Name](people/slug))', () => { + const refs = extractEntityRefs('See [Alice Chen](people/alice-chen) for context.'); + expect(refs.length).toBe(1); + expect(refs[0]).toEqual({ name: 'Alice Chen', slug: 'people/alice-chen', dir: 'people' }); + }); + + test('extracts company refs', () => { + const refs = extractEntityRefs('We invested in [Acme AI](companies/acme-ai).'); + expect(refs.length).toBe(1); + expect(refs[0].dir).toBe('companies'); + expect(refs[0].slug).toBe('companies/acme-ai'); + }); + + test('extracts multiple refs in same content', () => { + const refs = extractEntityRefs('[Alice](people/alice) and [Bob](people/bob) met at [Acme](companies/acme).'); + expect(refs.length).toBe(3); + expect(refs.map(r => r.slug)).toEqual(['people/alice', 'people/bob', 'companies/acme']); + }); + + test('handles ../../ deep paths', () => { + const refs = extractEntityRefs('[Alice](../../people/alice.md)'); + expect(refs.length).toBe(1); + expect(refs[0].slug).toBe('people/alice'); + }); + 
+ test('handles unicode names', () => { + const refs = extractEntityRefs('Met [Héctor García](people/hector-garcia)'); + expect(refs.length).toBe(1); + expect(refs[0].name).toBe('Héctor García'); + }); + + test('returns empty array on no matches', () => { + expect(extractEntityRefs('No links here.')).toEqual([]); + }); + + test('skips malformed markdown (unclosed bracket)', () => { + expect(extractEntityRefs('[Alice(people/alice)')).toEqual([]); + }); + + test('skips non-entity dirs (notes/, ideas/ stay if added later but are accepted now)', () => { + // Current regex targets entity dirs explicitly. Notes/ shouldn't match. + const refs = extractEntityRefs('See [random](notes/random).'); + expect(refs).toEqual([]); + }); + + test('extracts meeting refs', () => { + const refs = extractEntityRefs('See [Standup](meetings/2026-01-15-standup).'); + expect(refs.length).toBe(1); + expect(refs[0].dir).toBe('meetings'); + }); +}); + +// ─── extractPageLinks ────────────────────────────────────────── + +describe('extractPageLinks', () => { + test('returns LinkCandidate[] with inferred types', () => { + const candidates = extractPageLinks( + '[Alice](people/alice) is the CEO of Acme.', + {}, + 'concept', + ); + expect(candidates.length).toBeGreaterThan(0); + const aliceLink = candidates.find(c => c.targetSlug === 'people/alice'); + expect(aliceLink).toBeDefined(); + expect(aliceLink!.linkType).toBe('works_at'); + }); + + test('dedups multiple mentions of same entity (within-page dedup)', () => { + const content = '[Alice](people/alice) said this. 
Later, [Alice](people/alice) said that.'; + const candidates = extractPageLinks(content, {}, 'concept'); + const aliceLinks = candidates.filter(c => c.targetSlug === 'people/alice'); + expect(aliceLinks.length).toBe(1); + }); + + test('extracts frontmatter source as source-type link', () => { + const candidates = extractPageLinks('Some content.', { source: 'meetings/2026-01-15' }, 'person'); + const sourceLink = candidates.find(c => c.linkType === 'source'); + expect(sourceLink).toBeDefined(); + expect(sourceLink!.targetSlug).toBe('meetings/2026-01-15'); + }); + + test('extracts bare slug references in text', () => { + const candidates = extractPageLinks('See companies/acme for details.', {}, 'concept'); + const acme = candidates.find(c => c.targetSlug === 'companies/acme'); + expect(acme).toBeDefined(); + }); + + test('returns empty when no refs found', () => { + expect(extractPageLinks('Plain text with no links.', {}, 'concept')).toEqual([]); + }); + + test('meeting page references default to attended type', () => { + const candidates = extractPageLinks('Attendees: [Alice](people/alice), [Bob](people/bob).', {}, 'meeting'); + const aliceLink = candidates.find(c => c.targetSlug === 'people/alice'); + expect(aliceLink!.linkType).toBe('attended'); + }); + + test('person-page role prior biases company refs to invested_in when prose names partner role', () => { + const candidates = extractPageLinks( + 'Wendy is a partner at Founders Fund. 
Her portfolio includes [Cipher Labs](companies/cipher-labs).', + {}, + 'person', + ); + const cipher = candidates.find(c => c.targetSlug === 'companies/cipher-labs'); + expect(cipher?.linkType).toBe('invested_in'); + }); +}); + +// ─── inferLinkType ───────────────────────────────────────────── + +describe('inferLinkType', () => { + test('meeting + person ref -> attended', () => { + expect(inferLinkType('meeting', 'Attendees: Alice')).toBe('attended'); + }); + + test('CEO of -> works_at', () => { + expect(inferLinkType('person', 'Alice is CEO of Acme.')).toBe('works_at'); + }); + + test('VP at -> works_at', () => { + expect(inferLinkType('person', 'Bob, VP at Stripe, said.')).toBe('works_at'); + }); + + test('invested in -> invested_in', () => { + expect(inferLinkType('person', 'YC invested in Acme.')).toBe('invested_in'); + }); + + test('founded -> founded', () => { + expect(inferLinkType('person', 'Alice founded NovaPay.')).toBe('founded'); + }); + + test('co-founded -> founded', () => { + expect(inferLinkType('person', 'Bob co-founded Beta Health.')).toBe('founded'); + }); + + test('founder noun-form variants -> founded', () => { + expect(inferLinkType('person', 'Carol Wilson is the founder of Anchor.')).toBe('founded'); + expect(inferLinkType('person', 'The founders are Alice and Bob.')).toBe('founded'); + expect(inferLinkType('person', 'Dana is one of the founders of Northstar.')).toBe('founded'); + }); + + test('advises -> advises', () => { + expect(inferLinkType('person', 'Emily advises Acme on go-to-market.')).toBe('advises'); + }); + + test('"board member" alone is too ambiguous (investors also hold board seats) -> mentions', () => { + // Tightened in v0.10.4 after BrainBench rich-prose surfaced that partner + // bios ("She sits on the boards of [portfolio company]") were classified + // as advises. Generic board language now requires explicit advisor/advise + // rooting to count. 
+ expect(inferLinkType('person', 'Jane is a board member at Beta Health.')).toBe('mentions'); + }); + + test('explicit advisor language -> advises', () => { + expect(inferLinkType('person', 'Jane is an advisor to Beta Health.')).toBe('advises'); + expect(inferLinkType('person', 'Joined the advisory board at Beta Health.')).toBe('advises'); + }); + + test('investment narrative variants -> invested_in', () => { + expect(inferLinkType('person', 'Wendy led the Series A for Cipher Labs.')).toBe('invested_in'); + expect(inferLinkType('person', 'Bob is an early investor in Acme.')).toBe('invested_in'); + expect(inferLinkType('person', 'She invests in fintech startups.')).toBe('invested_in'); + expect(inferLinkType('person', 'Acme is a portfolio company of Founders Fund.')).toBe('invested_in'); + expect(inferLinkType('person', 'Sequoia led the seed round for Vox.')).toBe('invested_in'); + }); + + test('default -> mentions', () => { + expect(inferLinkType('person', 'Random context with no relationship verbs.')).toBe('mentions'); + }); + + test('precedence: founded beats works_at', () => { + // "founded" appears first in regex precedence + expect(inferLinkType('person', 'Alice founded Acme and is the CEO of it.')).toBe('founded'); + }); + + test('media page -> mentions (not attended)', () => { + expect(inferLinkType('media', 'Alice attended the workshop.')).toBe('mentions'); + }); +}); + +// ─── parseTimelineEntries ────────────────────────────────────── + +describe('parseTimelineEntries', () => { + test('parses standard format: - **YYYY-MM-DD** | summary', () => { + const entries = parseTimelineEntries('- **2026-01-15** | Met with Alice'); + expect(entries.length).toBe(1); + expect(entries[0]).toEqual({ date: '2026-01-15', summary: 'Met with Alice', detail: '' }); + }); + + test('parses dash variant: - **YYYY-MM-DD** -- summary', () => { + const entries = parseTimelineEntries('- **2026-01-15** -- Met with Bob'); + expect(entries.length).toBe(1); + 
expect(entries[0].summary).toBe('Met with Bob'); + }); + + test('parses single dash: - **YYYY-MM-DD** - summary', () => { + const entries = parseTimelineEntries('- **2026-01-15** - Met with Carol'); + expect(entries.length).toBe(1); + expect(entries[0].summary).toBe('Met with Carol'); + }); + + test('parses without leading dash: **YYYY-MM-DD** | summary', () => { + const entries = parseTimelineEntries('**2026-01-15** | Standalone entry'); + expect(entries.length).toBe(1); + }); + + test('parses multiple entries', () => { + const content = `## Timeline +- **2026-01-15** | First event +- **2026-02-20** | Second event +- **2026-03-10** | Third event`; + const entries = parseTimelineEntries(content); + expect(entries.length).toBe(3); + expect(entries.map(e => e.date)).toEqual(['2026-01-15', '2026-02-20', '2026-03-10']); + }); + + test('skips invalid dates (2026-13-45)', () => { + const entries = parseTimelineEntries('- **2026-13-45** | Bad date'); + expect(entries.length).toBe(0); + }); + + test('skips invalid dates (2026-02-30)', () => { + const entries = parseTimelineEntries('- **2026-02-30** | Feb 30 doesnt exist'); + expect(entries.length).toBe(0); + }); + + test('returns empty when no timeline lines found', () => { + expect(parseTimelineEntries('Just some plain text.')).toEqual([]); + }); + + test('handles mixed content (timeline lines interspersed with prose)', () => { + const content = `Some intro paragraph. + +- **2026-01-15** | An event happened + +More prose here. + +- **2026-02-20** | Another event`; + const entries = parseTimelineEntries(content); + expect(entries.length).toBe(2); + }); +}); + +// ─── isAutoLinkEnabled ───────────────────────────────────────── + +function makeFakeEngine(configMap: Map<string, string>): BrainEngine { + return { + getConfig: async (key: string) => configMap.get(key) ?? 
null, + } as unknown as BrainEngine; +} + +describe('isAutoLinkEnabled', () => { + test('null/undefined -> true (default on)', async () => { + const engine = makeFakeEngine(new Map()); + expect(await isAutoLinkEnabled(engine)).toBe(true); + }); + + test('"false" -> false', async () => { + const engine = makeFakeEngine(new Map([['auto_link', 'false']])); + expect(await isAutoLinkEnabled(engine)).toBe(false); + }); + + test('"FALSE" (case-insensitive) -> false', async () => { + const engine = makeFakeEngine(new Map([['auto_link', 'FALSE']])); + expect(await isAutoLinkEnabled(engine)).toBe(false); + }); + + test('"0" -> false', async () => { + const engine = makeFakeEngine(new Map([['auto_link', '0']])); + expect(await isAutoLinkEnabled(engine)).toBe(false); + }); + + test('"no" -> false', async () => { + const engine = makeFakeEngine(new Map([['auto_link', 'no']])); + expect(await isAutoLinkEnabled(engine)).toBe(false); + }); + + test('"off" -> false', async () => { + const engine = makeFakeEngine(new Map([['auto_link', 'off']])); + expect(await isAutoLinkEnabled(engine)).toBe(false); + }); + + test('"true" -> true', async () => { + const engine = makeFakeEngine(new Map([['auto_link', 'true']])); + expect(await isAutoLinkEnabled(engine)).toBe(true); + }); + + test('"1" -> true', async () => { + const engine = makeFakeEngine(new Map([['auto_link', '1']])); + expect(await isAutoLinkEnabled(engine)).toBe(true); + }); + + test('whitespace and case: " False " -> false', async () => { + const engine = makeFakeEngine(new Map([['auto_link', ' False ']])); + expect(await isAutoLinkEnabled(engine)).toBe(false); + }); + + test('garbage value -> true (fail-safe to default)', async () => { + const engine = makeFakeEngine(new Map([['auto_link', 'garbage']])); + expect(await isAutoLinkEnabled(engine)).toBe(true); + }); +}); diff --git a/test/search.test.ts b/test/search.test.ts index 7e212584..d4eed5dd 100644 --- a/test/search.test.ts +++ b/test/search.test.ts @@ -4,7 +4,7 @@ */ 
import { describe, test, expect } from 'bun:test'; -import { rrfFusion, cosineSimilarity } from '../src/core/search/hybrid.ts'; +import { rrfFusion, cosineSimilarity, applyQueryAwareBoosts, hybridSearch } from '../src/core/search/hybrid.ts'; import type { SearchResult } from '../src/core/types.ts'; function makeResult(overrides: Partial = {}): SearchResult { @@ -109,6 +109,267 @@ describe('rrfFusion', () => { }); }); +describe('applyQueryAwareBoosts', () => { + test('prefers maintained summary pages over compatibility stubs for exact entity queries', () => { + const compatibility = makeResult({ + slug: 'knowledge/people/roger-gimbel', + title: 'Roger Gimbel', + type: 'people-profile' as any, + chunk_text: 'Legacy compatibility note.', + score: 1.0, + }); + const summary = makeResult({ + slug: 'knowledge/people/roger-gimbel/summary', + title: 'Summary', + type: 'people-profile' as any, + chunk_text: '# Roger Gimbel', + score: 0.7, + }); + const boosted = applyQueryAwareBoosts([compatibility, summary], 'Roger Gimbel'); + expect(boosted[0].slug).toBe('knowledge/people/roger-gimbel/summary'); + expect(boosted[0].score).toBeGreaterThan(boosted[1].score); + }); + + test('prefers company summary pages over same-name agent pages for exact company queries', () => { + const agent = makeResult({ + slug: 'knowledge/agents/rodaco', + title: 'Rodaco', + type: 'agent-profile' as any, + chunk_text: 'Backup agent on Intel Mac.', + score: 1.0, + }); + const company = makeResult({ + slug: 'knowledge/companies/rodaco/summary', + title: 'Summary', + type: 'company-summary' as any, + chunk_text: '# Rodaco', + score: 0.7, + }); + const boosted = applyQueryAwareBoosts([agent, company], 'Rodaco'); + expect(boosted[0].slug).toBe('knowledge/companies/rodaco/summary'); + expect(boosted[0].score).toBeGreaterThan(boosted[1].score); + }); + + test('promotes canonical entity pages above digest noise for exact-name queries', () => { + const digest = makeResult({ + slug: '2026-03-20', + title: 
'2026 03 20', + type: 'concept', + chunk_text: 'Current progress for review from Roger Gimbel', + score: 1.0, + }); + const canonical = makeResult({ + slug: 'knowledge/people/roger-gimbel/summary', + title: 'Summary', + type: 'people-profile' as any, + chunk_text: '# Roger Gimbel', + score: 0.7, + }); + const boosted = applyQueryAwareBoosts([digest, canonical], 'Roger Gimbel'); + expect(boosted[0].slug).toBe('knowledge/people/roger-gimbel/summary'); + expect(boosted[0].score).toBeGreaterThan(boosted[1].score); + }); + + test('promotes project-status pages when slug matches the query', () => { + const pilot = makeResult({ + slug: 'knowledge/projects/selfgrowth-knowledge-pilot/readme', + title: 'SelfGrowth Knowledge Pilot', + type: 'project', + score: 1.0, + }); + const status = makeResult({ + slug: 'projects/control/project-status/selfgrowth', + title: 'Selfgrowth', + type: 'project-status' as any, + score: 0.8, + }); + const boosted = applyQueryAwareBoosts([pilot, status], 'SelfGrowth'); + expect(boosted[0].slug).toBe('projects/control/project-status/selfgrowth'); + }); + + test('demotes raw imported pages below structured canonical pages for exact entity queries', () => { + const rawImport = makeResult({ + slug: 'knowledge/projects/selfgrowth-knowledge-pilot/raw/imported-selfgrowth', + title: 'selfgrowth', + type: 'project', + score: 1.0, + }); + const status = makeResult({ + slug: 'projects/control/project-status/selfgrowth', + title: 'Selfgrowth', + type: 'project-status' as any, + score: 0.8, + }); + const boosted = applyQueryAwareBoosts([rawImport, status], 'SelfGrowth'); + expect(boosted[0].slug).toBe('projects/control/project-status/selfgrowth'); + }); + + test('prefers canonical agent pages when the query adds an explicit type hint', () => { + const article = makeResult({ + slug: 'clippings/how-to-build-smart-stress-tested-openclaw-hermes-agents-for-under-30', + title: 'How to Build Smart, Stress-Tested OpenClaw + Hermes Agents for Under $30', + type: 
'concept', + score: 1.0, + }); + const agent = makeResult({ + slug: 'knowledge/agents/hermes', + title: 'Hermes', + type: 'agent-profile' as any, + chunk_text: '# Hermes', + score: 0.75, + }); + const boosted = applyQueryAwareBoosts([article, agent], 'Hermes Agent'); + expect(boosted[0].slug).toBe('knowledge/agents/hermes'); + }); + + test('prefers canonical company summaries when the query adds a brand-style ai suffix', () => { + const noisy = makeResult({ + slug: 'claude-memory/feedback_obsidian_vaults', + title: 'Feedback Obsidian Vaults', + type: 'feedback', + chunk_text: 'Winston and Rodaco have separate Obsidian vaults. Two AI agents with different identities.', + score: 1.0, + }); + const company = makeResult({ + slug: 'knowledge/companies/rodaco/summary', + title: 'Summary', + type: 'company-summary' as any, + chunk_text: '# Rodaco', + score: 0.7, + }); + const boosted = applyQueryAwareBoosts([noisy, company], 'Rodaco AI'); + expect(boosted[0].slug).toBe('knowledge/companies/rodaco/summary'); + }); + + test('does not treat unspaced prefixes as ai suffix aliases', () => { + const noisy = makeResult({ + slug: 'knowledge/companies/openai/summary', + title: 'Summary', + type: 'company-summary' as any, + chunk_text: '# OpenAI', + score: 1.0, + }); + const wrongPrefix = makeResult({ + slug: 'knowledge/companies/open/summary', + title: 'Summary', + type: 'company-summary' as any, + chunk_text: '# Open', + score: 0.7, + }); + const boosted = applyQueryAwareBoosts([noisy, wrongPrefix], 'OpenAI'); + expect(boosted[0].slug).toBe('knowledge/companies/openai/summary'); + }); + + test('prefers the explicit company canonical page for ambiguous company disambiguators', () => { + const agent = makeResult({ + slug: 'knowledge/agents/rodaco', + title: 'Rodaco', + type: 'agent-profile' as any, + chunk_text: '# Rodaco', + score: 1.0, + }); + const company = makeResult({ + slug: 'knowledge/companies/rodaco/summary', + title: 'Summary', + type: 'company-summary' as any, + 
chunk_text: '# Rodaco', + score: 0.2, + }); + const boosted = applyQueryAwareBoosts([agent, company], 'Rodaco Company'); + expect(boosted[0].slug).toBe('knowledge/companies/rodaco/summary'); + }); + + test('prefers the explicit agent canonical page for ambiguous agent disambiguators', () => { + const company = makeResult({ + slug: 'knowledge/companies/rodaco/summary', + title: 'Summary', + type: 'company-summary' as any, + chunk_text: '# Rodaco', + score: 1.0, + }); + const agent = makeResult({ + slug: 'knowledge/agents/rodaco', + title: 'Rodaco', + type: 'agent-profile' as any, + chunk_text: '# Rodaco', + score: 0.15, + }); + const boosted = applyQueryAwareBoosts([company, agent], 'Rodaco Agent'); + expect(boosted[0].slug).toBe('knowledge/agents/rodaco'); + }); + + test('prefers the explicit Hermes agent canonical page for assistant-style aliases', () => { + const installNote = makeResult({ + slug: 'claude-memory/project_hermes_m5', + title: 'Project Hermes M5', + type: 'project', + chunk_text: 'Hermes Agent on the M5 Mac.', + score: 1.0, + }); + const agent = makeResult({ + slug: 'knowledge/agents/hermes', + title: 'Hermes', + type: 'agent-profile' as any, + chunk_text: '# Hermes', + score: 0.15, + }); + const boosted = applyQueryAwareBoosts([installNote, agent], 'Hermes Assistant'); + expect(boosted[0].slug).toBe('knowledge/agents/hermes'); + }); +}); + +describe('hybridSearch exact-query candidate rescue', () => { + test('widens the keyword candidate pool so exact canonical company pages are not missed behind noisy lexical matches', async () => { + const originalKey = process.env.OPENAI_API_KEY; + delete process.env.OPENAI_API_KEY; + try { + const dataset: SearchResult[] = [ + makeResult({ + slug: 'knowledge/agents/rodaco', + title: 'Rodaco', + type: 'agent-profile' as any, + chunk_text: 'Backup agent on Intel Mac.', + chunk_source: 'compiled_truth', + chunk_id: 1, + score: 0.710937, + }), + ]; + + for (let i = 0; i < 56; i++) { + dataset.push(makeResult({ + 
slug: `noise/page-${i}`, + title: `Noise ${i}`, + type: 'project', + chunk_text: `Rodaco mention ${i}`, + chunk_source: 'compiled_truth', + chunk_id: i + 2, + score: 0.35 - i * 0.001, + })); + } + + dataset.push(makeResult({ + slug: 'knowledge/companies/rodaco/summary', + title: 'Summary', + type: 'company-summary' as any, + chunk_text: '# Rodaco', + chunk_source: 'compiled_truth', + chunk_id: 1000, + score: 0.33098254, + })); + + const engine = { + searchKeyword: async (_query: string, opts?: { limit?: number }) => dataset.slice(0, opts?.limit ?? dataset.length), + } as any; + + const results = await hybridSearch(engine, 'Rodaco', { limit: 10, expansion: false, detail: 'medium' }); + expect(results[0].slug).toBe('knowledge/companies/rodaco/summary'); + } finally { + if (originalKey === undefined) delete process.env.OPENAI_API_KEY; + else process.env.OPENAI_API_KEY = originalKey; + } + }); +}); + describe('cosineSimilarity', () => { test('identical vectors return 1.0', () => { const v = new Float32Array([1, 2, 3]); diff --git a/test/utils.test.ts b/test/utils.test.ts index c11d5725..63889428 100644 --- a/test/utils.test.ts +++ b/test/utils.test.ts @@ -1,5 +1,5 @@ import { describe, test, expect } from 'bun:test'; -import { validateSlug, contentHash, rowToPage, rowToChunk, rowToSearchResult } from '../src/core/utils.ts'; +import { validateSlug, contentHash, rowToPage, rowToChunk, rowToSearchResult, coerceEmbeddingVector } from '../src/core/utils.ts'; describe('validateSlug', () => { test('accepts valid slugs', () => { @@ -79,6 +79,32 @@ describe('rowToPage', () => { }); }); +describe('coerceEmbeddingVector', () => { + test('returns Float32Array unchanged', () => { + const emb = new Float32Array([0.1, 0.2, 0.3]); + expect(coerceEmbeddingVector(emb)).toBe(emb); + }); + + test('parses postgres vector strings into Float32Array', () => { + const emb = coerceEmbeddingVector('[0.1,0.2,0.3]'); + expect(emb).toBeInstanceOf(Float32Array); + expect(emb?.[0]).toBeCloseTo(0.1, 
6); + expect(emb?.[1]).toBeCloseTo(0.2, 6); + expect(emb?.[2]).toBeCloseTo(0.3, 6); + }); + + test('parses JSON array strings into Float32Array', () => { + const emb = coerceEmbeddingVector('[1, 2, 3]'); + expect(emb).toBeInstanceOf(Float32Array); + expect(Array.from(emb || [])).toEqual([1, 2, 3]); + }); + + test('returns null for invalid embedding values', () => { + expect(coerceEmbeddingVector('not-a-vector')).toBeNull(); + expect(coerceEmbeddingVector(null)).toBeNull(); + }); +}); + describe('rowToChunk', () => { test('nulls embedding by default', () => { const chunk = rowToChunk({ @@ -98,6 +124,18 @@ describe('rowToChunk', () => { }, true); expect(chunk.embedding).not.toBeNull(); }); + + test('coerces string embeddings when requested', () => { + const chunk = rowToChunk({ + id: 1, page_id: 1, chunk_index: 0, chunk_text: 'text', + chunk_source: 'compiled_truth', embedding: '[0.1,0.2,0.3]', + model: 'test', token_count: 5, embedded_at: '2024-01-01', + }, true); + expect(chunk.embedding).toBeInstanceOf(Float32Array); + expect(chunk.embedding?.[0]).toBeCloseTo(0.1, 6); + expect(chunk.embedding?.[1]).toBeCloseTo(0.2, 6); + expect(chunk.embedding?.[2]).toBeCloseTo(0.3, 6); + }); }); describe('rowToSearchResult', () => {