diff --git a/.claude/settings.json b/.claude/settings.json
new file mode 100644
index 0000000..4a3391e
--- /dev/null
+++ b/.claude/settings.json
@@ -0,0 +1,15 @@
+{
+  "hooks": {
+    "PostToolUse": [
+      {
+        "matcher": "Edit|Write",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "python3 scripts/check_docs.py"
+          }
+        ]
+      }
+    ]
+  }
+}
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 0000000..7c73e1e
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,17 @@
+name: Lint
+
+on:
+  pull_request:
+  push:
+    branches: [master, main]
+
+jobs:
+  check-docs:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - name: Check CLAUDE.md is in sync
+        run: python3 scripts/check_docs.py
diff --git a/CLAUDE.md b/CLAUDE.md
index 180e120..a297f1d 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -45,11 +45,14 @@ memex/
 ├── memex/
 │   ├── __init__.py
 │   ├── schema.py              # Pydantic models — KnowledgeRecord, ExtractionResult
-│   ├── extractor.py           # LLM extraction pipeline (LiteLLM + Instructor)
+│   ├── extractor.py           # LLM extraction pipeline (anthropic + Instructor)
+│   ├── structural.py          # Structural file detection — categorize_file, is_structural_change (no LLM deps)
 │   ├── writer.py              # Renders KnowledgeRecord to .md and commits it
 │   ├── action.py              # GitHub Action entry point — reads env vars, orchestrates
 │   ├── adr.py                 # ADR parser — find_adr_files, parse_adr, index_adrs
-│   ├── cli.py                 # Click CLI — `memex index` and `memex query`
+│   ├── cli.py                 # Click CLI — `memex configure/init/update/index/query`
+│   ├── config.py              # API key resolution — load_api_key, save_api_key, CONFIG_FILE
+│   ├── nudge.py               # Low-confidence nudge comment — should_nudge, post_nudge_comment
 │   ├── init.py                # `memex init` — bootstrap from repo scan
 │   └── update.py              # `memex update` — incremental extraction from git history
 ├── tests/
@@ -79,7 +82,7 @@ The index cache lives at:
 |---|---|---|
 | Language | Python 3.12+ | Best LLM ecosystem |
 | LLM — extraction | `claude-sonnet-4-6` via `anthropic` SDK | Best structured output quality |
-| LLM — embeddings | `voyage-3-lite` via `anthropic` SDK | Same SDK, same API key, no OpenAI account |
+| LLM — embeddings | `fastembed` (`BAAI/bge-small-en-v1.5`) | Local, no API key required, no data leaves machine during queries |
 | Structured output | `instructor` + `pydantic` | Guaranteed schema compliance, auto-retry |
 | Vector search | `numpy` cosine similarity over `index.json` | No database needed at MVP scale (<5k records) |
 | CLI | `click` | Standard, simple |
@@ -203,12 +206,27 @@ without updating the indexer and CLI accordingly.
 ## CLI behaviour
 
 ```bash
-memex index      # embed all .md files in knowledge/, write to .memex/index.json
+memex configure                             # store ANTHROPIC_API_KEY to ~/.memex/config.json (prompts interactively)
+memex init                                  # bootstrap: scan repo for ADRs + extract from recent git history
+memex update                                # incremental: extract from git history since last run
+memex index                                 # embed all .md files in knowledge/, write to .memex/index.json
 memex query "why did we move off MongoDB"   # cosine similarity search, top 3 results
+memex query --min-score 0.5 "..."           # broaden search by lowering the relevance threshold
+memex query --expand "vague question"       # rewrite query via Claude Haiku before embedding
 ```
 
-`memex index` should be incremental — only embed files not already in `index.json`.
-Do not re-embed records that haven't changed.
+`memex index` should be incremental — only embed files whose content has changed since
+the last run. Change detection uses a SHA256 hash of the cleaned embed text (title +
+context + decision + alternatives + constraints, with YAML frontmatter and markdown
+noise stripped). The hash is stored as `content_hash` in each index entry. Entries
+without a `content_hash` (legacy entries) are always re-embedded.
+
+`memex query` options:
+- `--top N` — show top N results (default 3)
+- `--min-score F` — hide results below this similarity threshold (default 0.70); shows
+  a "no relevant results" message with a suggested lower threshold when nothing passes
+- `--expand` — opt-in: calls Claude Haiku to rewrite the query into richer search
+  phrases before embedding; useful for short or vague queries
 
 `memex query` output format:
 ```
@@ -258,6 +276,25 @@ of the PR's knowledge record.
 
 ---
 
+## Doc sync rules
+
+`scripts/check_docs.py` runs automatically after every file edit (via `.claude/settings.json`
+hook) and on every PR (via `.github/workflows/lint.yml`). It will fail loudly if CLAUDE.md
+drifts from the code.
+
+When you make any of the changes below, update CLAUDE.md **in the same commit**:
+
+| What changed | What to update in CLAUDE.md |
+|---|---|
+| New/removed/renamed `.py` in `memex/` | File structure section |
+| New/removed `@cli.command()` in `cli.py` | CLI behaviour section |
+| `model=` string in `extractor.py` or `init.py` | Tech stack table + decisions section |
+| New dependency in `pyproject.toml` | Tech stack table |
+| New `os.environ["VAR"]` in `action.py` | Environment variables table |
+| New frontmatter field in `writer.py` | Markdown output format section |
+
+---
+
 ## Agent rules — cross-cutting concerns
 
 Before implementing any feature that touches knowledge record creation, extraction logic,
@@ -303,7 +340,7 @@ is harder to debug than a slightly larger PR.
 
 | Variable | Source | Description |
 |---|---|---|
-| `ANTHROPIC_API_KEY` | GitHub Secret | Required. Anthropic API key for Claude + Voyage |
+| `ANTHROPIC_API_KEY` | GitHub Secret | Required. Anthropic API key for Claude extraction |
 | `PR_TITLE` | `github.event.pull_request.title` | PR title |
 | `PR_BODY` | `github.event.pull_request.body` | PR description |
 | `PR_URL` | `github.event.pull_request.html_url` | Full URL to PR |
@@ -324,11 +361,15 @@ is harder to debug than a slightly larger PR.
 - Do not make real LLM calls in tests — mock the `instructor` client
 - Use `pytest` and `pytest-mock`
 
+**Running tests — requires Python 3.12+** (the codebase uses `|` union syntax and other 3.10+ features):
+
 ```bash
-pip install -e ".[dev]"
-pytest tests/ -v
+python3 -m pytest tests/ -v
 ```
 
+No virtualenv or `pip install` needed — dependencies are already installed globally on this machine.
+To run a single file: `python3 -m pytest tests/test_action.py -v`
+
 ---
 
 ## Running tests — agent instructions
@@ -338,12 +379,12 @@ declaring the task complete. Use `pytest` with `-v` for readable output.
 
 | Module changed | Command |
 |---|---|
-| `memex/init.py` | `pytest tests/test_init.py -v` |
-| `memex/update.py` | `pytest tests/test_update.py -v` |
-| `memex/action.py` | `pytest tests/test_action.py tests/test_nudge.py -v` |
-| `memex/nudge.py` | `pytest tests/test_nudge.py -v` |
-| `memex/extractor.py` or `memex/writer.py` | `pytest tests/ -v` |
-| Any other change | `pytest tests/ -v` |
+| `memex/init.py` | `python3 -m pytest tests/test_init.py -v` |
+| `memex/update.py` | `python3 -m pytest tests/test_update.py -v` |
+| `memex/action.py` | `python3 -m pytest tests/test_action.py tests/test_nudge.py -v` |
+| `memex/nudge.py` | `python3 -m pytest tests/test_nudge.py -v` |
+| `memex/extractor.py` or `memex/writer.py` | `python3 -m pytest tests/ -v` |
+| Any other change | `python3 -m pytest tests/ -v` |
 
 Always run at minimum the tests for the module you changed. Run `pytest tests/ -v`
 if your change touches multiple modules or has cross-cutting effects.
@@ -401,18 +442,14 @@ explain why. Do not silently make a different choice.
 
 ---
 
-## What to build next (in order)
-
-If you are picking up this project fresh, work in this sequence:
+## Current state
 
-1. `memex/schema.py` — define `KnowledgeRecord` and `ExtractionResult`
-2. `memex/extractor.py` — `is_low_signal()` + `extract()` with Instructor
-3. `memex/writer.py` — `render_markdown()` + `write_record()`
-4. `memex/action.py` — wire everything together, handle env vars and nudge comment
-5. `.github/workflows/memex.yml` — the Action definition
-6. `memex/cli.py` — `memex index` and `memex query`
-7. `tests/` — unit tests for each module with mocked LLM calls
-8. `memex/adr.py` — ADR parser; wire into `init`, `index --include-adrs`, and `action.py` ✅
-9. `README.md` — installation instructions, one-minute quickstart
+The MVP (Phase 1) is fully implemented and tested. All modules exist and all four core
+features are working: GitHub Action, CLI, ADR parser, and low-confidence nudge.
 
-Do not start step N+1 until step N has tests passing.
\ No newline at end of file
+**Phase 2 work** (not yet started — requires explicit instruction before any of this is built):
+- Web UI / dashboard
+- Cross-repo search
+- Slack integration
+- Cloud backend / hosted service
+- Enterprise features (SSO, audit log, etc.)
\ No newline at end of file
diff --git a/README.md b/README.md
index 92d7a05..4b6a1d4 100644
--- a/README.md
+++ b/README.md
@@ -10,12 +10,13 @@ $ memex query "why did we move off MongoDB"
 Results for: why did we move off MongoDB
 ──────────────────────────────────────────────────────────────────────
 
-  1. Migrate billing store to PostgreSQL                     [0.91] ●
-     Unbounded schema flexibility was causing silent data corruption
-     in the billing pipeline. MongoDB's lack of enforced schema...
-     knowledge/decisions/2024-11-14-migrate-billing-store-to-postgresql.md
+  #1  Migrate billing store to PostgreSQL  [0.91]
+      ✅ High confidence
+      Unbounded schema flexibility was causing silent data corruption
+      in the billing pipeline. — Migrated billing store to PostgreSQL.
+      knowledge/decisions/2024-11-14-migrate-billing-store-to-postgresql.md
 
-  2. ...
+  #2  ...
 ```
 
 ---
@@ -80,8 +81,10 @@ git push
 ### 5. Index and query
 
 ```bash
-memex index          # embed all knowledge files (incremental — skips unchanged files)
+memex index          # embed all knowledge files (incremental — only re-embeds changed files)
 memex query "why did we switch from SQS to Redis"
+memex query --min-score 0.5 "why did we switch from SQS to Redis"   # broaden results
+memex query --expand "why did we switch from SQS to Redis"          # AI-expanded search
 ```
 
 ---
@@ -270,10 +273,64 @@ memex index                Embed knowledge files and write vectors to .memex/ind
   --include-adrs           Parse ADR files before embedding (safe to run repeatedly)
 memex query QUESTION       Semantic search over indexed knowledge
   --top N                  Return top N results (default: 3)
+  --min-score F            Hide results below this similarity score (default: 0.70)
+  --expand                 Rewrite query via Claude Haiku before searching
 ```
 
 ---
 
+## Querying your knowledge
+
+`memex query` runs a local semantic search — no data leaves your machine.
+
+```bash
+memex query "why did we move off MongoDB"
+```
+
+Each result shows a similarity score `[0.00–1.00]`, a confidence badge, the decision excerpt, and the file path.
+
+### Result relevance
+
+By default, results below a **0.70 similarity score** are hidden. This threshold exists to avoid surfacing clearly unrelated records. If you get "no relevant results found", you have two options:
+
+**Lower the threshold** — cast a wider net:
+```bash
+memex query --min-score 0.5 "why did we move off MongoDB"
+```
+
+**Expand the query** — let Claude Haiku rephrase your question into richer search terms before embedding:
+```bash
+memex query --expand "how did we store v1 projects"
+```
+
+`--expand` is useful when your query is short or vague and you're not getting results you expect. It adds ~1 second of latency (one Haiku API call) and requires your `ANTHROPIC_API_KEY` to be set.
+
+Example of what expansion does:
+```
+Input:  "how did we store v1 projects"
+
+Expanded: how did we store v1 projects, v1 project storage architecture,
+          legacy v1 project storage system, v1 project data persistence
+          implementation, historical v1 project storage format,
+          v1 project repository structure
+```
+
+The expanded string is embedded as a single query — the broader vocabulary increases the chance of matching records that describe the same concept using different words.
+
+### Rationale badges in results
+
+Each result shows a **rationale badge** — a measure of how well-documented the original decision was in its source PR or ADR. This is independent of the similarity score (how well it matched your query).
+
+| Badge | Meaning |
+|---|---|
+| `✅ Rationale: well-documented` | Clear reasoning captured — safe to rely on |
+| `💡 Rationale: partial` | Some reasoning present — verify if critical |
+| `⚠️  Rationale: limited` | Little reasoning in source — treat as a hint, not a fact |
+
+A record can have a high similarity score (very relevant to your query) but limited rationale (the original PR didn't explain why). These two dimensions are independent.
+
+---
+
 ## How extraction works
 
 Memex uses [Claude](https://www.anthropic.com/claude) (`claude-sonnet-4-6`) with [Instructor](https://github.com/jxnl/instructor) for structured extraction, guaranteeing schema compliance with automatic retries.
diff --git a/knowledge/decisions/2026-04-04-1058-adr0001-use-of-fastembed.md b/knowledge/decisions/2026-04-04-1058-adr0001-use-of-fastembed.md
index aa0ff2c..57165b3 100644
--- a/knowledge/decisions/2026-04-04-1058-adr0001-use-of-fastembed.md
+++ b/knowledge/decisions/2026-04-04-1058-adr0001-use-of-fastembed.md
@@ -1,5 +1,5 @@
 ---
-title: "ADR-0001: Use of fastembed"
+title: "ADR-0001: Use of fastembed with BAAI/bge-small-en-v1.5 for local embeddings"
 date: 2026-04-04
 author: "adr"
 source: "docs/adr/0001-use-fastembed.md"
@@ -8,28 +8,49 @@ confidence: 0.85
 tags: ["adr"]
 ---
 
-# ADR-0001: Use of fastembed
+# ADR-0001: Use of fastembed with BAAI/bge-small-en-v1.5 for local embeddings
 
 ## Context
 
-We needed to create embedings to query the knowledge
+Memex needs to embed knowledge records so users can run semantic search (`memex query`)
+locally. The embedding solution must not require a second API key, must not send document
+content to an external service, and must work in a CLI context where startup time and
+install size matter. Knowledge records contain potentially sensitive internal engineering
+decisions, so data privacy is a hard constraint.
 
 ## Decision
 
-We will use fastembed. because of pricing and 
+Use fastembed with the `BAAI/bge-small-en-v1.5` model for all embedding operations.
+fastembed runs the model locally in ONNX format (CPU only, no GPU required), downloads
+the model once (~130 MB), and returns 384-dimensional vectors. No data leaves the machine
+during indexing or querying.
 
 ## Alternatives considered
 
-voyage.ai, but is from the mongoDB ecosystem, and I privilege relational databases in general
-chroma db
+- **VoyageAI** (`voyage-3` and similar) — ruled out because embeddings are computed on
+  VoyageAI's servers, meaning every `memex index` and `memex query` call sends document
+  content to an external API. This is a privacy concern for internal engineering knowledge.
+  Also charges per token. Quality is generally higher (larger dimensional space, better
+  benchmarks) but the trade-off was not acceptable given the privacy and cost constraints.
+- **sentence-transformers** — runs locally but requires PyTorch as a dependency, which
+  significantly increases install size and startup time. fastembed uses ONNX format instead,
+  which is faster on CPU and has minimal dependencies.
+- **ChromaDB / hosted vector databases** — introduce an external persistence layer and
+  server dependency, contradicting the local-first, no-database design principle.
 
 ## Constraints
 
-- I must learn more about embedings
+- No API key should be required for semantic search — only extraction (Claude) needs one
+- Internal engineering decisions must never be sent to external services during querying
+- Must work on CPU without GPU
+- Install footprint should stay small for CLI use
 
 ## Revisit signals
 
-_None_
+- If result quality becomes a bottleneck at scale, VoyageAI or a larger BGE variant
+  (e.g. `BAAI/bge-large-en-v1.5`, 1024 dimensions) could be evaluated
+- If the corpus grows beyond ~5k records, brute-force cosine similarity may need replacing
+  with an approximate nearest-neighbour index (e.g. FAISS)
 
 ---
 
diff --git a/memex/action.py b/memex/action.py
index 9b1d4dc..99802f9 100644
--- a/memex/action.py
+++ b/memex/action.py
@@ -9,6 +9,7 @@
 import subprocess
 from pathlib import Path
 from .extractor import extract, confidence_level
+from .structural import categorize_file
 from .writer import write_record
 from .schema import ConfidenceLevel
 from .nudge import should_nudge, has_nudge_comment, post_nudge_comment, is_bot_comment
@@ -18,7 +19,10 @@
 
 
 def get_review_comments(pr_number: str, repo: str) -> list[str]:
-    """Fetch review comments via GitHub CLI."""
+    """Fetch review top-level bodies and inline review comments via GitHub CLI."""
+    comments: list[str] = []
+
+    # Top-level review bodies (submitted via "Review changes")
     try:
         result = subprocess.run(
             ["gh", "pr", "view", pr_number, "--repo", repo,
@@ -26,10 +30,35 @@ def get_review_comments(pr_number: str, repo: str) -> list[str]:
             capture_output=True, text=True, timeout=15
         )
         if result.returncode == 0:
-            return json.loads(result.stdout or "[]")
+            comments += json.loads(result.stdout or "[]")
     except Exception:
         pass
-    return []
+
+    # Inline review comments (line-level comments on code)
+    try:
+        result = subprocess.run(
+            ["gh", "api", f"repos/{repo}/pulls/{pr_number}/comments",
+             "--jq", "[.[].body]"],
+            capture_output=True, text=True, timeout=15
+        )
+        if result.returncode == 0:
+            comments += json.loads(result.stdout or "[]")
+    except Exception:
+        pass
+
+    # General PR comments (issue-level comments on the PR thread)
+    try:
+        result = subprocess.run(
+            ["gh", "api", f"repos/{repo}/issues/{pr_number}/comments",
+             "--jq", "[.[].body]"],
+            capture_output=True, text=True, timeout=15
+        )
+        if result.returncode == 0:
+            comments += json.loads(result.stdout or "[]")
+    except Exception:
+        pass
+
+    return [c for c in comments if c]
 
 
 def get_changed_files(pr_number: str, repo: str) -> list[str]:
@@ -115,7 +144,7 @@ def handle_pr_merge() -> None:
                 print(f"ADR record written: {adr_path}")
 
     # Run extraction on the PR itself
-    result = extract(pr_title, pr_body, review_comments)
+    result = extract(pr_title, pr_body, review_comments, changed_files=changed_files)
 
     if result is None:
         print("Low-signal PR — skipped.")
@@ -140,6 +169,9 @@ def handle_pr_merge() -> None:
     all_text = pr_body + " " + " ".join(review_comments)
     related = find_related_adrs(all_text) or None
 
+    # Derive structural tags from changed files (e.g. ["migration", "schema"])
+    structural_tags = sorted({categorize_file(f) for f in changed_files if categorize_file(f)}) or None
+
     # Write the knowledge record
     path = write_record(
         record=result.record,
@@ -148,6 +180,7 @@ def handle_pr_merge() -> None:
         pr_number=int(pr_number),
         repo=repo,
         related=related,
+        tags=structural_tags,
     )
 
     print(f"Knowledge record written: {path} (confidence {result.record.confidence:.2f} — {level.value})")
diff --git a/memex/cli.py b/memex/cli.py
index 1c0deff..283bd9d 100644
--- a/memex/cli.py
+++ b/memex/cli.py
@@ -67,14 +67,70 @@ def extract_confidence(content: str) -> float:
     return 1.0
 
 
+def _extract_md_section(content: str, header: str) -> str:
+    """Extract the body of a named ## section from markdown content."""
+    import re
+    pattern = rf"##\s+{re.escape(header)}\s*\n(.*?)(?=\n##\s|\n---|\Z)"
+    m = re.search(pattern, content, re.DOTALL | re.IGNORECASE)
+    return m.group(1).strip() if m else ""
+
+
+def build_embed_text(content: str) -> str:
+    """Build a clean semantic string for embedding — strips YAML, warnings, and markdown noise."""
+    import re
+
+    def _bullets(text: str) -> list[str]:
+        items = []
+        for line in text.splitlines():
+            m = re.match(r"^\s*[-*]\s+(.+)", line)
+            if m:
+                item = m.group(1).strip()
+                if not item.startswith("_"):  # skip "_None recorded_" placeholders
+                    items.append(item)
+        return items
+
+    title = extract_title(content)
+    context = _extract_md_section(content, "Context")
+    decision = _extract_md_section(content, "Decision")
+    alts = _bullets(_extract_md_section(content, "Alternatives considered"))
+    constraints = _bullets(_extract_md_section(content, "Constraints"))
+
+    parts = [p for p in [title, context, decision] if p]
+    if alts:
+        parts.append("Alternatives: " + "; ".join(alts))
+    if constraints:
+        parts.append("Constraints: " + "; ".join(constraints))
+
+    return "\n".join(parts) if parts else content
+
+
 def extract_excerpt(content: str) -> str:
-    """Pull first substantive paragraph after the frontmatter."""
+    """Pull a human-readable preview from the Context and Decision sections."""
+    context = _extract_md_section(content, "Context")
+    decision = _extract_md_section(content, "Decision")
+
+    if context or decision:
+        if context:
+            # Truncate at a word boundary so we don't cut mid-word
+            if len(context) > 300:
+                context = context[:300].rsplit(None, 1)[0] + "…"
+        excerpt = context
+        if decision:
+            separator = " — " if excerpt else ""
+            excerpt += separator + decision  # decisions are one sentence — show in full
+        return excerpt
+
+    # Fallback: first non-blank, non-heading, non-blockquote line after frontmatter
     in_frontmatter = False
+    fm_count = 0
     for line in content.splitlines():
-        if line == "---":
-            in_frontmatter = not in_frontmatter
+        if line.strip() == "---":
+            fm_count += 1
+            in_frontmatter = fm_count < 2
+            continue
+        if in_frontmatter:
             continue
-        if not in_frontmatter and line.strip() and not line.startswith("#"):
+        if line.strip() and not line.startswith("#") and not line.startswith(">"):
             return line[:400]
     return ""
 
@@ -271,6 +327,8 @@ def index(force, include_adrs):
         click.echo("No knowledge records found. Is the GitHub Action installed?")
         return
 
+    import hashlib
+
     existing = load_index() if not force else {}
 
     disk_paths = {str(r) for r in records}
@@ -280,23 +338,32 @@ def index(force, include_adrs):
     if stale:
         click.echo(f"Removed {len(stale)} deleted record(s) from index.")
 
-    new_records = [r for r in records if str(r) not in existing]
-
-    if not new_records:
+    to_index = []
+    for r in records:
+        content = r.read_text()
+        embed_text = build_embed_text(content)
+        h = hashlib.sha256(embed_text.encode()).hexdigest()
+        entry = existing.get(str(r))
+        if entry is None or entry.get("content_hash") != h:
+            to_index.append((r, content, embed_text, h))
+
+    if not to_index:
+        if stale:
+            save_index(existing)
         click.echo(f"Index up to date — {len(existing)} records indexed.")
         return
 
-    click.echo(f"Indexing {len(new_records)} new records...")
-    contents = [r.read_text() for r in new_records]
-    embeddings = embed(contents)
+    click.echo(f"Indexing {len(to_index)} record(s)...")
+    embeddings = embed([embed_text for _, _, embed_text, _ in to_index])
 
-    for path, content, embedding in zip(new_records, contents, embeddings):
+    for (path, content, embed_text, h), embedding in zip(to_index, embeddings):
         existing[str(path)] = {
             "embedding": embedding,
             "title": extract_title(content),
             "excerpt": extract_excerpt(content),
             "confidence": extract_confidence(content),
             "path": str(path),
+            "content_hash": h,
         }
 
     save_index(existing)
@@ -306,7 +373,19 @@ def index(force, include_adrs):
 @cli.command()
 @click.argument("query", nargs=-1)
 @click.option("--top", default=3, help="Number of results")
-def query(query, top):
+@click.option(
+    "--min-score",
+    default=0.70,
+    show_default=True,
+    help="Minimum similarity score (0–1). Results below this are hidden.",
+)
+@click.option(
+    "--expand",
+    is_flag=True,
+    default=False,
+    help="Use Claude Haiku to rewrite query into richer search terms (adds ~1s latency).",
+)
+def query(query, top, min_score, expand):
     """Query your institutional knowledge.
 
     Example: memex query why did we move off MongoDB
@@ -321,6 +400,23 @@ def query(query, top):
         click.echo("Nothing indexed yet. Run `memex index` first.")
         return
 
+    if expand:
+        client = _anthropic_client()
+        resp = client.messages.create(
+            model="claude-haiku-4-5",
+            max_tokens=200,
+            messages=[{
+                "role": "user",
+                "content": (
+                    f"Rewrite this search query as 4-6 rich technical search phrases, "
+                    f"comma-separated, no explanation:\n\n{query_text}"
+                ),
+            }],
+        )
+        expanded = resp.content[0].text.strip()
+        query_text = expanded if expanded else query_text
+        click.echo(f"  Expanded: {query_text}", err=True)
+
     # Embed the query
     [query_embedding] = embed([query_text])
 
@@ -331,6 +427,9 @@ def query(query, top):
     ]
     scored.sort(key=lambda x: x[0], reverse=True)
 
+    passing = [(s, e) for s, e in scored if s >= min_score]
+    to_display = passing[:top]
+
     import shutil
     term_width = min(shutil.get_terminal_size((80, 24)).columns, 100)
     divider = "─" * term_width
@@ -338,7 +437,16 @@ def query(query, top):
     click.echo(f"\nResults for: {query_text}")
     click.echo(divider)
 
-    for i, (score, entry) in enumerate(scored[:top], 1):
+    if not to_display:
+        lower = round(max(0.0, min_score - 0.2), 1)
+        click.echo(
+            f"\n  No relevant results found (threshold: {min_score:.2f}).\n"
+            f'  Try `memex query --min-score {lower} "..."` to broaden the search.'
+        )
+        click.echo("")
+        return
+
+    for i, (score, entry) in enumerate(to_display, 1):
         title = entry["title"]
         excerpt = _strip_markdown(entry["excerpt"])
         path = entry["path"]
@@ -364,23 +472,23 @@ def query(query, top):
         rank = click.style(f"#{i}", bold=True)
         click.echo(f"\n  {rank}  {title}  {score_label}")
 
-        # Confidence label
+        # Confidence label — reflects quality of rationale in the source record,
+        # independent of the similarity score above.
         if confidence >= 0.80:
-            conf_line = click.style("✅ High confidence", fg="green")
+            conf_line = click.style("✅ Rationale: well-documented", fg="green")
             click.echo(f"      {conf_line}")
         elif confidence >= 0.65:
-            conf_line = click.style("💡 Medium confidence", fg="yellow")
+            conf_line = click.style("💡 Rationale: partial — verify if critical", fg="yellow")
             click.echo(f"      {conf_line}")
         else:
             conf_line = click.style(
-                "⚠️  Low confidence — limited rationale present in source. "
-                "Verify before relying on this record.",
+                "⚠️  Rationale: limited — limited reasoning in source, verify before relying on this record.",
                 fg="yellow",
             )
             click.echo(f"      {conf_line}")
 
-        # Excerpt — skip if it duplicates the low-confidence warning already shown
-        if excerpt and not excerpt.startswith("⚠️"):
+        # Excerpt
+        if excerpt:
             click.echo(_wrap(excerpt, term_width - 6, "      "))
 
         click.echo(f"      {click.style(path, dim=True)}")
diff --git a/memex/extractor.py b/memex/extractor.py
index e77ddb9..a05ee6e 100644
--- a/memex/extractor.py
+++ b/memex/extractor.py
@@ -4,6 +4,7 @@
 import instructor
 from .config import load_api_key
 from .schema import ExtractionResult, ConfidenceLevel
+from .structural import categorize_file, is_structural_change, build_changed_files_section
 
 
 def _client():
@@ -27,11 +28,15 @@ def _client():
 _BODY_TRIVIAL_THRESHOLD = 80  # chars — below this, treat body as absent
 
 
-def is_low_signal(title: str, body: str) -> bool:
+def is_low_signal(title: str, body: str, changed_files: list[str] | None = None) -> bool:
     """Quick heuristic check before spending an LLM call."""
     text = f"{title}\n{body}"
+    # Always-low patterns win regardless of structural files (dep bumps are always noise)
     if any(re.search(p, text, re.IGNORECASE) for p in _ALWAYS_LOW_SIGNAL):
         return True
+    # Structural files override the title-based heuristics
+    if changed_files and is_structural_change(changed_files):
+        return False
     body_is_trivial = len((body or "").strip()) < _BODY_TRIVIAL_THRESHOLD
     if body_is_trivial:
         if any(re.search(p, title, re.IGNORECASE) for p in _TITLE_LOW_SIGNAL):
@@ -39,18 +44,24 @@ def is_low_signal(title: str, body: str) -> bool:
     return False
 
 
-def build_prompt(pr_title: str, pr_body: str, review_comments: list[str]) -> str:
+def build_prompt(
+    pr_title: str,
+    pr_body: str,
+    review_comments: list[str],
+    changed_files: list[str] | None = None,
+) -> str:
     reviews_text = "\n---\n".join(review_comments) if review_comments else "No review comments."
+    changed_files_section = build_changed_files_section(changed_files)
     return f"""You are extracting institutional knowledge from a GitHub pull request.
 
-Your task: determine whether this PR contains a genuine architectural, technical, or 
+Your task: determine whether this PR contains a genuine architectural, technical, or
 product decision — and if so, extract the decision context.
 
-A genuine decision involves a non-obvious choice between alternatives, a trade-off, 
-or a constraint that shaped how something was built. Routine changes (dependency 
+A genuine decision involves a non-obvious choice between alternatives, a trade-off,
+or a constraint that shaped how something was built. Routine changes (dependency
 updates, typo fixes, style cleanup, test additions) do NOT qualify.
 
-Be strict. It is better to return contains_decision=false than to manufacture 
+Be strict. It is better to return contains_decision=false than to manufacture
 rationale that isn't actually present in the text.
 
 ## PR Title
@@ -62,7 +73,16 @@ def build_prompt(pr_title: str, pr_body: str, review_comments: list[str]) -> str
 ## Review Comments
 {reviews_text}
 
-Extract the decision context now. Set confidence based on how much explicit rationale 
+## Changed Files
+{changed_files_section}
+
+If changed files include migrations, infrastructure-as-code, or schema definitions, \
+these represent architectural decisions worth capturing even when the PR description is thin. \
+Extract the decision (what changed and why it matters structurally), but set confidence \
+based on how much explicit rationale is actually present in the text — do not infer \
+motivation that is not stated.
+
+Extract the decision context now. Set confidence based on how much explicit rationale
 is actually present in the text above — do not infer or assume."""
 
 
@@ -70,6 +90,7 @@ def extract(
     pr_title: str,
     pr_body: str,
     review_comments: list[str] | None = None,
+    changed_files: list[str] | None = None,
 ) -> ExtractionResult | None:
     """
     Run the extraction pipeline on a single PR.
@@ -79,18 +100,18 @@ def extract(
     """
     review_comments = review_comments or []
 
-    # Gate 1: cheap heuristic filter
-    if is_low_signal(pr_title, pr_body or ""):
+    # Gate 1: cheap heuristic filter (changed_files enables structural override)
+    if is_low_signal(pr_title, pr_body or "", changed_files):
         return None
 
     # Gate 2: LLM extraction with structured output
     result: ExtractionResult = _client().messages.create(
-        model="claude-sonnet-4-5",
+        model="claude-sonnet-4-6",
         max_tokens=1024,
         messages=[
             {
                 "role": "user",
-                "content": build_prompt(pr_title, pr_body, review_comments),
+                "content": build_prompt(pr_title, pr_body, review_comments, changed_files),
             }
         ],
         response_model=ExtractionResult,
diff --git a/memex/structural.py b/memex/structural.py
new file mode 100644
index 0000000..0824d2f
--- /dev/null
+++ b/memex/structural.py
@@ -0,0 +1,89 @@
+"""
+Structural file detection — pure classification, no LLM dependency.
+
+Identifies high-value file patterns (migrations, IaC, schema, deployment)
+that indicate architectural decisions worth extracting even when PR descriptions
+are thin. Shared by extractor.py and update.py.
+"""
+from __future__ import annotations
+
+import re
+from collections import defaultdict
+
+# File patterns that indicate high-value structural changes.
+# Deliberately conservative — no raw .yaml/.yml (too broad for most projects).
+_STRUCTURAL_PATTERNS: dict[str, list[str]] = {
+    "migration": [
+        r"migrations?/",
+        r"alembic/versions/",
+        r"db/migrate/",
+        r"\d{14}_\w+\.rb$",   # Rails migration naming convention
+    ],
+    "infra": [
+        r"\.tf$",
+        r"terraform/",
+        r"k8s/",
+        r"kubernetes/",
+        r"helm/",
+        r"charts/",
+        r"manifests/",
+    ],
+    "schema": [
+        r"openapi\.ya?ml$",
+        r"openapi\.json$",
+        r"swagger\.ya?ml$",
+        r"schema\.json$",
+        r"schema\.graphql$",
+        r"\.graphql$",
+        r"\.proto$",
+    ],
+    "deployment": [
+        r"docker-compose.*\.ya?ml$",
+        r"Dockerfile(\.\w+)?$",
+    ],
+}
+
+STRUCTURAL_LABEL: dict[str, str] = {
+    "migration": "Database migrations",
+    "infra": "Infrastructure / IaC",
+    "schema": "API / Schema definitions",
+    "deployment": "Deployment configuration",
+}
+
+
+def categorize_file(path: str) -> str | None:
+    """Return the structural category for a file path, or None if not structural."""
+    p = path.replace("\\", "/")
+    for category, patterns in _STRUCTURAL_PATTERNS.items():
+        for pat in patterns:
+            if re.search(pat, p, re.IGNORECASE):
+                return category
+    return None
+
+
+def is_structural_change(changed_files: list[str]) -> bool:
+    """True if any changed file matches a high-value structural pattern."""
+    return any(categorize_file(p) is not None for p in changed_files)
+
+
+def build_changed_files_section(changed_files: list[str] | None) -> str:
+    """Build the Changed Files section for the LLM prompt."""
+    if not changed_files:
+        return "No file list available."
+    buckets: dict[str, list[str]] = defaultdict(list)
+    for path in changed_files:
+        cat = categorize_file(path) or "other"
+        buckets[cat].append(path)
+    lines = []
+    for cat in ["migration", "infra", "schema", "deployment"]:
+        if cat not in buckets:
+            continue
+        lines.append(f"**{STRUCTURAL_LABEL[cat]}** ({len(buckets[cat])} files):")
+        for f in buckets[cat]:
+            lines.append(f"  - {f}")
+    other_count = len(buckets.get("other", []))
+    if other_count:
+        lines.append(f"Other changed files: {other_count}")
+    if not lines:
+        return f"{len(changed_files)} files changed (none match structural patterns)."
+    return "\n".join(lines)
diff --git a/memex/update.py b/memex/update.py
index c140657..4b2e81c 100644
--- a/memex/update.py
+++ b/memex/update.py
@@ -16,6 +16,9 @@
 from pathlib import Path
 from typing import Callable, Optional
 
+# Pure helpers — safe to import at module level (no LLM dependency)
+from .structural import categorize_file, is_structural_change
+
 STATE_FILE = Path(".memex/state.json")
 KNOWLEDGE_DIR = Path("knowledge/decisions")
 
@@ -119,6 +122,17 @@ def _extract_pr_number(message: str) -> Optional[int]:
     return int(m.group(1)) if m else None
 
 
+def git_changed_file_paths(sha: str) -> list[str]:
+    """Return list of file paths changed in a commit via git diff-tree."""
+    r = subprocess.run(
+        ["git", "diff-tree", "--no-commit-id", "-r", "--name-only", sha],
+        capture_output=True, text=True,
+    )
+    if r.returncode != 0 or not r.stdout.strip():
+        return []
+    return [line.strip() for line in r.stdout.strip().splitlines() if line.strip()]
+
+
 def git_files_changed(sha: str) -> int:
     """Return number of files changed in a commit — used by the stat-first filter."""
     r = subprocess.run(
@@ -168,16 +182,19 @@ def detect_repo() -> Optional[str]:
 
 
 def fetch_pr_data(pr_number: int, repo: str) -> Optional[dict]:
-    """Fetch PR title, body, author, url, and reviews via gh CLI."""
+    """Fetch PR title, body, author, url, reviews, and changed files via gh CLI."""
     r = subprocess.run(
         ["gh", "pr", "view", str(pr_number), "--repo", repo,
-         "--json", "title,body,author,url,reviews"],
+         "--json", "title,body,author,url,reviews,files"],
         capture_output=True, text=True, timeout=15,
     )
     if r.returncode != 0:
         return None
     try:
-        return json.loads(r.stdout)
+        data = json.loads(r.stdout)
+        # Normalise changed_files into a flat list of paths
+        data["changed_files"] = [f.get("path", "") for f in data.get("files", []) if f.get("path")]
+        return data
     except json.JSONDecodeError:
         return None
 
@@ -360,26 +377,29 @@ def _process_pr_commit(
     author = (pr.get("author") or {}).get("login", commit.author)
     pr_url = pr.get("url", f"https://github.com/{repo}/pull/{pr_num}")
     reviews = [r.get("body", "") for r in pr.get("reviews", []) if r.get("body")]
+    changed_files = pr.get("changed_files", [])
 
-    if is_low_signal(title, body):
+    if is_low_signal(title, body, changed_files):
         result.skipped_low_signal += 1
         if progress_cb:
             progress_cb(f"  ✗ #{pr_num} low signal — skip")
         return
 
-    extraction = extract(title, body, reviews)
+    extraction = extract(title, body, reviews, changed_files=changed_files)
     if extraction is None or not extraction.contains_decision or extraction.record is None:
         result.skipped_no_decision += 1
         if progress_cb:
             progress_cb(f"  ✗ #{pr_num} no decision found — skip")
         return
 
+    structural_tags = sorted({categorize_file(f) for f in changed_files if categorize_file(f)}) or None
     path = write_record(
         record=extraction.record,
         source_url=pr_url,
         author=author,
         pr_number=pr_num,
         repo=repo,
+        tags=structural_tags,
     )
     indexed_prs.add(pr_num)
     result.written += 1
@@ -406,13 +426,18 @@ def _process_direct_commit(
             progress_cb(f"  ~ {sha_short} already indexed — skip")
         return
 
-    # Stat-first filter — skip large commits before any LLM call
+    # Fetch changed file paths once — reused for stat bypass, low-signal check, and tags
+    diff_files = git_changed_file_paths(commit.sha)
+
+    # Stat-first filter — skip large commits before any LLM call,
+    # unless the commit touches structural files (migrations, IaC, schema, etc.)
     n_files = git_files_changed(commit.sha)
     if n_files > MAX_FILES_CHANGED:
-        result.skipped_stat_filter += 1
-        if progress_cb:
-            progress_cb(f"  ✗ {sha_short} {n_files} files changed — stat filter")
-        return
+        if not is_structural_change(diff_files):
+            result.skipped_stat_filter += 1
+            if progress_cb:
+                progress_cb(f"  ✗ {sha_short} {n_files} files changed — stat filter")
+            return
 
     diff = git_diff(commit.sha)
     if not diff:
@@ -421,25 +446,27 @@ def _process_direct_commit(
             progress_cb(f"  ✗ {sha_short} empty diff — skip")
         return
 
-    if is_low_signal(commit.subject, diff):
+    if is_low_signal(commit.subject, diff, diff_files):
         result.skipped_low_signal += 1
         if progress_cb:
             progress_cb(f"  ✗ {sha_short} low signal — skip")
         return
 
-    extraction = extract(commit.subject, diff, [])
+    extraction = extract(commit.subject, diff, [], changed_files=diff_files)
     if extraction is None or not extraction.contains_decision or extraction.record is None:
         result.skipped_no_decision += 1
         if progress_cb:
             progress_cb(f"  ✗ {sha_short} no decision found — skip")
         return
 
+    structural_tags = sorted({categorize_file(f) for f in diff_files if categorize_file(f)}) or None
     path = write_record(
         record=extraction.record,
         source_url=c_url,
         author=commit.author,
         pr_number=None,  # direct commit — no PR number
         repo=repo,
+        tags=structural_tags,
     )
     indexed_sources.add(c_url)
     result.written += 1
diff --git a/scripts/check_docs.py b/scripts/check_docs.py
new file mode 100644
index 0000000..abdf8ad
--- /dev/null
+++ b/scripts/check_docs.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+"""
+check_docs.py — validates CLAUDE.md stays in sync with the actual codebase.
+
+Checks:
+  1. Every memex/*.py module (except __init__.py) is listed in CLAUDE.md
+  2. Every @cli.command() in cli.py appears in the CLI behaviour section
+  3. Model strings in extractor.py and init.py match the tech stack table
+  4. Required env vars (os.environ["VAR"]) in action.py have rows in the env vars table
+
+Exit 0 = clean. Exit 1 = drift detected (printed to stdout so Claude Code surfaces it).
+"""
+
+import re
+import sys
+from pathlib import Path
+
+ROOT = Path(__file__).parent.parent
+CLAUDE_MD = ROOT / "CLAUDE.md"
+MEMEX_DIR = ROOT / "memex"
+
+errors = []
+
+
+def check(condition: bool, message: str) -> None:
+    if not condition:
+        errors.append(message)
+
+
+claude = CLAUDE_MD.read_text()
+
+# ── 1. Module files ────────────────────────────────────────────────────────────
+py_files = sorted(f.name for f in MEMEX_DIR.glob("*.py") if f.name != "__init__.py")
+for fname in py_files:
+    check(fname in claude, f"file structure: memex/{fname} not listed in CLAUDE.md")
+
+# ── 2. CLI commands ────────────────────────────────────────────────────────────
+cli_src = (MEMEX_DIR / "cli.py").read_text()
+# Commands may have @click.option decorators between @cli.command() and def,
+# so scan line-by-line: find @cli.command, then look forward for the def name.
+lines = cli_src.splitlines()
+command_names = []
+for i, line in enumerate(lines):
+    m = re.match(r'\s*@cli\.command\((?:"(\w+)")?\)', line)
+    if m:
+        explicit_name = m.group(1)
+        for j in range(i + 1, min(i + 15, len(lines))):
+            def_match = re.match(r'\s*def (\w+)', lines[j])
+            if def_match:
+                command_names.append(explicit_name or def_match.group(1))
+                break
+
+cli_section_match = re.search(
+    r"## CLI behaviour(.*?)^##", claude, re.DOTALL | re.MULTILINE
+)
+cli_text = cli_section_match.group(1) if cli_section_match else ""
+
+for cmd in command_names:
+    check(
+        f"memex {cmd}" in cli_text,
+        f"CLI behaviour section: 'memex {cmd}' not documented in CLAUDE.md",
+    )
+
+# ── 3. Model strings ───────────────────────────────────────────────────────────
+for fname in ["extractor.py", "init.py"]:
+    src = (MEMEX_DIR / fname).read_text()
+    models = re.findall(r'model=["\']([^"\']+)["\']', src)
+    for model in set(models):
+        check(
+            model in claude,
+            f"tech stack: model '{model}' used in memex/{fname} but not in CLAUDE.md",
+        )
+
+# ── 4. Required env vars ───────────────────────────────────────────────────────
+action_src = (MEMEX_DIR / "action.py").read_text()
+# Only required vars (os.environ["VAR"], not os.environ.get)
+required_vars = re.findall(r'os\.environ\["(\w+)"\]', action_src)
+# Exclude internal GitHub meta-vars that don't need user documentation
+skip_vars = {"GITHUB_EVENT_NAME"}
+required_vars = sorted(set(v for v in required_vars if v not in skip_vars))
+
+env_section_match = re.search(
+    r"## Environment variables.*?\n(.*?)^##", claude, re.DOTALL | re.MULTILINE
+)
+env_text = env_section_match.group(1) if env_section_match else ""
+
+for var in required_vars:
+    check(
+        f"`{var}`" in env_text,
+        f"env vars table: '{var}' used in action.py but not documented in CLAUDE.md",
+    )
+
+# ── Report ─────────────────────────────────────────────────────────────────────
+if errors:
+    print("CLAUDE.md drift detected:\n")
+    for e in errors:
+        print(f"  ✗ {e}")
+    print(
+        f"\n{len(errors)} issue(s) — update CLAUDE.md to match the code, "
+        "or run 'python scripts/check_docs.py' to recheck."
+    )
+    sys.exit(1)
+else:
+    print(
+        f"CLAUDE.md is in sync "
+        f"({len(py_files)} modules, {len(command_names)} CLI commands, "
+        f"{len(required_vars)} env vars checked)"
+    )
+    sys.exit(0)
diff --git a/scripts/test_action.sh b/scripts/test_action.sh
index 24f46c2..f2aa138 100755
--- a/scripts/test_action.sh
+++ b/scripts/test_action.sh
@@ -6,11 +6,15 @@
 
 set -euo pipefail
 
+# ── Debug mode ────────────────────────────────────────────────────────────────
+[[ "${DEBUG:-}" == "1" ]] && set -x
+
 # ── API key ───────────────────────────────────────────────────────────────────
 if [[ -z "${ANTHROPIC_API_KEY:-}" ]]; then
   echo "Error: ANTHROPIC_API_KEY is not set."
   exit 1
 fi
+echo "[debug] ANTHROPIC_API_KEY is set (${#ANTHROPIC_API_KEY} chars)"
 
 # ── PR data ───────────────────────────────────────────────────────────────────
 if [[ -n "${1:-}" ]]; then
@@ -20,11 +24,15 @@ if [[ -n "${1:-}" ]]; then
   REPO="$(echo "$PR_URL" | sed -E 's|https://github.com/([^/]+/[^/]+)/pull/.*|\1|')"
   PR_NUMBER="$(echo "$PR_URL" | sed -E 's|.*/pull/([0-9]+).*|\1|')"
 
-  echo "Fetching PR #$PR_NUMBER from $REPO ..."
+  echo "[debug] Fetching PR #$PR_NUMBER from $REPO ..."
   PR_JSON="$(gh pr view "$PR_NUMBER" --repo "$REPO" --json title,body,author,url)"
   PR_TITLE="$(echo "$PR_JSON" | python3 -c "import sys,json; print(json.load(sys.stdin)['title'])")"
   PR_BODY="$(echo "$PR_JSON"  | python3 -c "import sys,json; print(json.load(sys.stdin)['body'] or '')")"
   PR_AUTHOR="$(echo "$PR_JSON" | python3 -c "import sys,json; print(json.load(sys.stdin)['author']['login'])")"
+  echo "[debug] PR_TITLE:  $PR_TITLE"
+  echo "[debug] PR_AUTHOR: $PR_AUTHOR"
+  echo "[debug] PR_NUMBER: $PR_NUMBER"
+  echo "[debug] REPO:      $REPO"
 else
   # ── Built-in fixture (high-signal example) ──────────────────────────────────
   # Swap for the low-signal example to test discard behaviour:
@@ -47,21 +55,29 @@ export PR_TITLE PR_BODY PR_URL PR_NUMBER PR_AUTHOR REPO
 
 # ── Stub gh for nudge comments (avoids posting to a real PR) ─────────────────
 # Real PR data was already fetched above; only the comment call needs stubbing.
+REAL_GH="$(which gh)"
 GH_STUB="$(mktemp -d)/gh"
-cat > "$GH_STUB" << 'STUB'
+cat > "$GH_STUB" << STUB
 #!/usr/bin/env bash
 # Pass through 'gh pr view' so get_review_comments still works.
-if [[ "$*" == *"--json reviews"* ]]; then
-  exec /usr/bin/env gh "$@"
+if [[ "\$*" == *"--json reviews"* ]] || [[ "\$*" == *"--json files"* ]] || [[ "\$1" == "api" ]]; then
+  exec "$REAL_GH" "\$@"
 fi
-echo "[gh stub] $*" >&2
+echo "[gh stub] \$*" >&2
 echo "[]"
 STUB
 chmod +x "$GH_STUB"
 export PATH="$(dirname "$GH_STUB"):$PATH"
+echo "[debug] gh stub created at $GH_STUB"
 
 # ── Run ───────────────────────────────────────────────────────────────────────
 cd "$(dirname "$0")/.."
-echo "Running memex action with PR: \"$PR_TITLE\""
+echo ""
+echo "Running memex action with:"
+echo "  PR_TITLE:  $PR_TITLE"
+echo "  PR_AUTHOR: $PR_AUTHOR"
+echo "  PR_NUMBER: $PR_NUMBER"
+echo "  PR_URL:    $PR_URL"
+echo "  REPO:      $REPO"
 echo "────────────────────────────────────────"
 python3 -m memex.action
diff --git a/tests/test_action.py b/tests/test_action.py
index 7935bdd..785c4bd 100644
--- a/tests/test_action.py
+++ b/tests/test_action.py
@@ -1,7 +1,9 @@
 """Unit tests for memex/action.py nudge integration and issue_comment handling."""
+import json
 import os
+import subprocess
 import pytest
-from unittest.mock import patch, MagicMock
+from unittest.mock import patch, MagicMock, call
 from pathlib import Path
 
 from memex.schema import KnowledgeRecord, ExtractionResult
@@ -36,6 +38,87 @@ def _make_result(confidence: float) -> ExtractionResult:
 }
 
 
+# --- get_review_comments ---
+
+def _mock_run(stdout: str, returncode: int = 0) -> MagicMock:
+    m = MagicMock()
+    m.returncode = returncode
+    m.stdout = stdout
+    return m
+
+
+def _is_reviews_call(cmd):
+    return "--json" in cmd and "reviews" in cmd
+
+def _is_inline_comments_call(cmd):
+    return any("pulls" in arg for arg in cmd)
+
+def _is_general_comments_call(cmd):
+    return any("issues" in arg for arg in cmd)
+
+
+class TestGetReviewComments:
+    def test_combines_review_bodies_inline_and_general_comments(self):
+        from memex.action import get_review_comments
+
+        def fake_run(cmd, **kwargs):
+            if _is_reviews_call(cmd):
+                return _mock_run(json.dumps(["Top-level review body"]))
+            if _is_inline_comments_call(cmd):
+                return _mock_run(json.dumps(["Inline line comment"]))
+            if _is_general_comments_call(cmd):
+                return _mock_run(json.dumps(["General PR comment"]))
+            return _mock_run("[]")
+
+        with patch("memex.action.subprocess.run", side_effect=fake_run):
+            result = get_review_comments("42", "acme/repo")
+
+        assert result == ["Top-level review body", "Inline line comment", "General PR comment"]
+
+    def test_filters_empty_strings(self):
+        from memex.action import get_review_comments
+
+        def fake_run(cmd, **kwargs):
+            if _is_reviews_call(cmd):
+                return _mock_run(json.dumps([""]))        # empty review body
+            if _is_inline_comments_call(cmd):
+                return _mock_run(json.dumps(["Inline comment"]))
+            if _is_general_comments_call(cmd):
+                return _mock_run(json.dumps([""]))        # empty general comment
+            return _mock_run("[]")
+
+        with patch("memex.action.subprocess.run", side_effect=fake_run):
+            result = get_review_comments("42", "acme/repo")
+
+        assert result == ["Inline comment"]
+
+    def test_returns_empty_list_when_all_calls_fail(self):
+        from memex.action import get_review_comments
+
+        failing = _mock_run("", returncode=1)
+        with patch("memex.action.subprocess.run", return_value=failing):
+            result = get_review_comments("42", "acme/repo")
+
+        assert result == []
+
+    def test_partial_failure_returns_successful_sources(self):
+        from memex.action import get_review_comments
+
+        def fake_run(cmd, **kwargs):
+            if _is_reviews_call(cmd):
+                return _mock_run("", returncode=1)        # reviews call fails
+            if _is_inline_comments_call(cmd):
+                return _mock_run(json.dumps(["Inline comment"]))
+            if _is_general_comments_call(cmd):
+                return _mock_run(json.dumps(["General comment"]))
+            return _mock_run("[]")
+
+        with patch("memex.action.subprocess.run", side_effect=fake_run):
+            result = get_review_comments("42", "acme/repo")
+
+        assert result == ["Inline comment", "General comment"]
+
+
 # --- handle_pr_merge: nudge logic ---
 
 class TestHandlePrMergeNudge:
@@ -254,3 +337,60 @@ def test_defaults_to_pr_merge(self, monkeypatch):
             from memex.action import main
             main()
         mock_merge.assert_called_once()
+
+
+# --- changed_files wiring ---
+
+class TestChangedFilesWiring:
+    def test_changed_files_passed_to_extract(self, tmp_path, monkeypatch):
+        """changed_files from get_changed_files() should be forwarded to extract()."""
+        monkeypatch.chdir(tmp_path)
+        for k, v in BASE_ENV.items():
+            monkeypatch.setenv(k, v)
+
+        structural_files = ["migrations/001_add_sessions.py"]
+
+        with patch("memex.action.get_review_comments", return_value=[]), \
+             patch("memex.action.get_changed_files", return_value=structural_files), \
+             patch("memex.action.extract", return_value=_make_result(0.85)) as mock_extract, \
+             patch("memex.action.write_record", return_value=Path("knowledge/decisions/foo.md")):
+
+            from memex.action import handle_pr_merge
+            handle_pr_merge()
+
+        mock_extract.assert_called_once()
+        assert mock_extract.call_args.kwargs.get("changed_files") == structural_files
+
+    def test_structural_tags_written_for_migration_pr(self, tmp_path, monkeypatch):
+        """Structural tags derived from changed_files should reach write_record."""
+        monkeypatch.chdir(tmp_path)
+        for k, v in BASE_ENV.items():
+            monkeypatch.setenv(k, v)
+
+        with patch("memex.action.get_review_comments", return_value=[]), \
+             patch("memex.action.get_changed_files", return_value=["migrations/001_add_sessions.py"]), \
+             patch("memex.action.extract", return_value=_make_result(0.85)), \
+             patch("memex.action.write_record", return_value=Path("knowledge/decisions/foo.md")) as mock_write:
+
+            from memex.action import handle_pr_merge
+            handle_pr_merge()
+
+        _, kwargs = mock_write.call_args
+        assert "migration" in (kwargs.get("tags") or [])
+
+    def test_no_structural_tags_for_plain_pr(self, tmp_path, monkeypatch):
+        """Non-structural files should produce no structural tags."""
+        monkeypatch.chdir(tmp_path)
+        for k, v in BASE_ENV.items():
+            monkeypatch.setenv(k, v)
+
+        with patch("memex.action.get_review_comments", return_value=[]), \
+             patch("memex.action.get_changed_files", return_value=["src/auth.py", "tests/test_auth.py"]), \
+             patch("memex.action.extract", return_value=_make_result(0.85)), \
+             patch("memex.action.write_record", return_value=Path("knowledge/decisions/foo.md")) as mock_write:
+
+            from memex.action import handle_pr_merge
+            handle_pr_merge()
+
+        _, kwargs = mock_write.call_args
+        assert kwargs.get("tags") is None
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 0000000..382c179
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,516 @@
+"""Unit tests for memex/cli.py — embedding text, excerpt extraction, index, and query."""
+import hashlib
+import json
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+from click.testing import CliRunner
+
+from memex.cli import (
+    _extract_md_section,
+    build_embed_text,
+    extract_excerpt,
+    extract_title,
+    cli,
+)
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+FULL_MD = """\
+---
+title: "Use Redis for caching"
+date: 2026-04-01
+author: "alice"
+source: "https://github.com/acme/repo/pull/42"
+confidence: 0.85
+tags: []
+---
+
+# Use Redis for caching
+
+## Context
+
+The team needed a fast in-memory store for session data. PostgreSQL was being hit on
+every request, causing latency spikes during peak traffic.
+
+## Decision
+
+Use Redis as an in-memory cache layer between the application and PostgreSQL.
+
+## Alternatives considered
+
+- Memcached — ruled out because team has no experience with it
+- In-process LRU cache — doesn't work across multiple app instances
+
+## Constraints
+
+- Redis must be deployed as a separate service
+- Cache invalidation must be handled explicitly
+
+## Revisit signals
+
+_None_
+
+---
+
+_Extracted by Memex from [PR #42](https://github.com/acme/repo/pull/42) · 2026-04-01_
+"""
+
+LOW_CONF_MD = """\
+---
+title: "Low confidence record"
+date: 2026-04-01
+author: "memex-init"
+source: "pyproject.toml"
+confidence: 0.50
+tags: ["init"]
+---
+
+# Low confidence record
+
+> ⚠️ **Low confidence** — limited rationale present in source. Verify before relying on this record.
+
+## Context
+
+Some brief context here about the decision.
+
+## Decision
+
+A decision was made about the system.
+
+## Alternatives considered
+
+_None recorded_
+
+## Constraints
+
+_None recorded_
+
+## Revisit signals
+
+_None_
+
+---
+
+_Extracted by Memex from repo scan · 2026-04-01_
+"""
+
+NO_SECTIONS_MD = """\
+---
+title: "Hand-written note"
+date: 2026-04-01
+author: "bob"
+source: "https://github.com/acme/repo/pull/99"
+confidence: 0.70
+tags: []
+---
+
+This is a hand-written note with no standard sections.
+It has some content but no ## headers.
+"""
+
+
+# ---------------------------------------------------------------------------
+# _extract_md_section
+# ---------------------------------------------------------------------------
+
+class TestExtractMdSection:
+    def test_extracts_section_body(self):
+        result = _extract_md_section(FULL_MD, "Context")
+        assert "PostgreSQL was being hit" in result
+        assert "##" not in result
+
+    def test_missing_section_returns_empty(self):
+        result = _extract_md_section(FULL_MD, "Nonexistent")
+        assert result == ""
+
+    def test_case_insensitive(self):
+        md = "## CONTEXT\nSome context here.\n## Decision\nA decision.\n"
+        assert "Some context here." in _extract_md_section(md, "Context")
+
+    def test_does_not_bleed_into_next_section(self):
+        result = _extract_md_section(FULL_MD, "Context")
+        assert "Use Redis as an in-memory" not in result  # that's in Decision
+
+    def test_stops_at_horizontal_rule(self):
+        result = _extract_md_section(FULL_MD, "Revisit signals")
+        assert "_Extracted by Memex" not in result
+
+
+# ---------------------------------------------------------------------------
+# build_embed_text
+# ---------------------------------------------------------------------------
+
+class TestBuildEmbedText:
+    def test_includes_title(self):
+        result = build_embed_text(FULL_MD)
+        assert "Use Redis for caching" in result
+
+    def test_includes_context(self):
+        result = build_embed_text(FULL_MD)
+        assert "PostgreSQL was being hit" in result
+
+    def test_includes_decision(self):
+        result = build_embed_text(FULL_MD)
+        assert "in-memory cache layer" in result
+
+    def test_includes_alternatives(self):
+        result = build_embed_text(FULL_MD)
+        assert "Memcached" in result
+        assert "In-process LRU cache" in result
+
+    def test_includes_constraints(self):
+        result = build_embed_text(FULL_MD)
+        assert "Redis must be deployed" in result
+
+    def test_strips_yaml_frontmatter(self):
+        result = build_embed_text(FULL_MD)
+        assert "confidence:" not in result
+        assert "tags:" not in result
+        assert "author:" not in result
+
+    def test_strips_warning_block(self):
+        result = build_embed_text(LOW_CONF_MD)
+        assert "⚠️" not in result
+        assert "limited rationale present" not in result
+
+    def test_strips_footer(self):
+        result = build_embed_text(FULL_MD)
+        assert "_Extracted by Memex" not in result
+
+    def test_no_markdown_headers(self):
+        result = build_embed_text(FULL_MD)
+        assert "## " not in result
+
+    def test_skips_none_recorded_alternatives(self):
+        result = build_embed_text(LOW_CONF_MD)
+        assert "Alternatives:" not in result
+
+    def test_skips_none_recorded_constraints(self):
+        result = build_embed_text(LOW_CONF_MD)
+        assert "Constraints:" not in result
+
+    def test_fallback_on_no_sections(self):
+        result = build_embed_text(NO_SECTIONS_MD)
+        # Falls back to raw content — should at least contain the note text (case-preserved)
+        assert "hand-written note" in result.lower()
+
+    def test_alternatives_formatted_with_semicolons(self):
+        result = build_embed_text(FULL_MD)
+        assert "Alternatives: " in result
+        # Both alternatives joined
+        assert "Memcached" in result and "In-process" in result
+
+
+# ---------------------------------------------------------------------------
+# extract_excerpt
+# ---------------------------------------------------------------------------
+
+class TestExtractExcerpt:
+    def test_returns_context_and_decision(self):
+        result = extract_excerpt(FULL_MD)
+        assert "PostgreSQL was being hit" in result  # from Context
+        assert "in-memory cache layer" in result      # from Decision
+
+    def test_skips_warning_block(self):
+        result = extract_excerpt(LOW_CONF_MD)
+        assert "⚠️" not in result
+        assert "Low confidence" not in result
+
+    def test_shows_actual_context_for_low_confidence(self):
+        result = extract_excerpt(LOW_CONF_MD)
+        assert "Some brief context here" in result
+
+    def test_context_truncated_at_300_chars(self):
+        long_context = "word " * 100  # 500 chars
+        md = f"---\ntitle: t\n---\n## Context\n{long_context}\n## Decision\nFoo.\n"
+        result = extract_excerpt(md)
+        # Context portion is capped at 300 chars and ends with ellipsis
+        assert "…" in result
+        assert result.endswith("Foo.")  # decision shown in full
+        # No mid-word cuts — the part before " — " ends cleanly
+        context_part = result.split(" — ")[0]
+        assert not context_part.rstrip("…").endswith(" ")
+
+    def test_fallback_skips_blockquotes(self):
+        md = "---\ntitle: t\n---\n\n> This is a blockquote\n\nActual content here.\n"
+        result = extract_excerpt(md)
+        assert "Actual content here." in result
+        assert "blockquote" not in result
+
+    def test_fallback_skips_headings(self):
+        md = "---\ntitle: t\n---\n\n# Big heading\n\nReal content.\n"
+        result = extract_excerpt(md)
+        assert "Real content." in result
+        assert "Big heading" not in result
+
+    def test_context_only_when_no_decision_section(self):
+        md = "---\ntitle: t\n---\n## Context\nOnly context here.\n"
+        result = extract_excerpt(md)
+        assert "Only context here." in result
+        assert " — " not in result  # no decision separator
+
+
+# ---------------------------------------------------------------------------
+# index command — content_hash logic
+# ---------------------------------------------------------------------------
+
+class TestIndexCommand:
+    def _make_knowledge_file(self, tmp_path: Path, name: str, content: str) -> Path:
+        d = tmp_path / "knowledge" / "decisions"
+        d.mkdir(parents=True, exist_ok=True)
+        f = d / name
+        f.write_text(content)
+        return f
+
+    def _index_path(self, tmp_path: Path) -> Path:
+        return tmp_path / ".memex" / "index.json"
+
+    def _run_index(self, tmp_path, monkeypatch, mock_embed=None):
+        if mock_embed is None:
+            mock_embed = MagicMock(return_value=[[0.1] * 384])
+        monkeypatch.chdir(tmp_path)
+        with patch("memex.cli.embed", mock_embed):
+            runner = CliRunner()
+            result = runner.invoke(cli, ["index"])
+        return result, mock_embed
+
+    def test_new_file_gets_content_hash(self, tmp_path, monkeypatch):
+        self._make_knowledge_file(tmp_path, "record.md", FULL_MD)
+        self._run_index(tmp_path, monkeypatch)
+        index = json.loads(self._index_path(tmp_path).read_text())
+        entry = list(index.values())[0]
+        assert "content_hash" in entry
+        assert len(entry["content_hash"]) == 64  # SHA256 hex
+
+    def test_unchanged_file_not_reembedded(self, tmp_path, monkeypatch):
+        self._make_knowledge_file(tmp_path, "record.md", FULL_MD)
+        _, mock_embed = self._run_index(tmp_path, monkeypatch)
+        assert mock_embed.call_count == 1
+        # Second run — should skip
+        _, mock_embed2 = self._run_index(tmp_path, monkeypatch)
+        assert mock_embed2.call_count == 0
+
+    def test_changed_file_gets_reembedded(self, tmp_path, monkeypatch):
+        f = self._make_knowledge_file(tmp_path, "record.md", FULL_MD)
+        self._run_index(tmp_path, monkeypatch)
+        # Mutate the file
+        f.write_text(FULL_MD.replace("PostgreSQL was being hit", "MySQL was being hit"))
+        _, mock_embed2 = self._run_index(tmp_path, monkeypatch)
+        assert mock_embed2.call_count == 1
+
+    def test_legacy_entry_without_hash_gets_reembedded(self, tmp_path, monkeypatch):
+        f = self._make_knowledge_file(tmp_path, "record.md", FULL_MD)
+        # Pre-populate index without content_hash (legacy format)
+        index_path = self._index_path(tmp_path)
+        index_path.parent.mkdir(parents=True, exist_ok=True)
+        index_path.write_text(json.dumps({
+            str(f): {
+                "embedding": [0.1] * 384,
+                "title": "Use Redis for caching",
+                "excerpt": "old excerpt",
+                "confidence": 0.85,
+                "path": str(f),
+                # no content_hash
+            }
+        }))
+        monkeypatch.chdir(tmp_path)
+        mock_embed = MagicMock(return_value=[[0.2] * 384])
+        with patch("memex.cli.embed", mock_embed):
+            CliRunner().invoke(cli, ["index"])
+        assert mock_embed.call_count == 1
+
+    def test_deleted_record_removed_from_index(self, tmp_path, monkeypatch):
+        f1 = self._make_knowledge_file(tmp_path, "record1.md", FULL_MD)
+        f2 = self._make_knowledge_file(tmp_path, "record2.md", LOW_CONF_MD)
+        mock_embed = MagicMock(side_effect=lambda texts: [[0.1] * 384] * len(texts))
+        self._run_index(tmp_path, monkeypatch, mock_embed)
+        # Delete one file
+        f2.unlink()
+        self._run_index(tmp_path, monkeypatch)
+        index = json.loads(self._index_path(tmp_path).read_text())
+        # Index stores paths relative to cwd (tmp_path)
+        rel_f1 = str(f1.relative_to(tmp_path))
+        rel_f2 = str(f2.relative_to(tmp_path))
+        assert rel_f2 not in index
+        assert rel_f1 in index
+
+    def test_embed_receives_clean_text_not_yaml(self, tmp_path, monkeypatch):
+        self._make_knowledge_file(tmp_path, "record.md", FULL_MD)
+        captured = []
+
+        def mock_embed(texts):
+            captured.extend(texts)
+            return [[0.1] * 384] * len(texts)
+
+        monkeypatch.chdir(tmp_path)
+        with patch("memex.cli.embed", mock_embed):
+            CliRunner().invoke(cli, ["index"])
+
+        assert captured, "embed was not called"
+        embedded_text = captured[0]
+        assert "confidence:" not in embedded_text
+        assert "tags:" not in embedded_text
+        assert "⚠️" not in embedded_text
+
+
+# ---------------------------------------------------------------------------
+# query command — --min-score
+# ---------------------------------------------------------------------------
+
+def _make_index_entry(title: str, score_hint: float, path: str = "knowledge/decisions/test.md"):
+    """Make a fake index entry with a fixed embedding (we'll mock cosine_similarity)."""
+    return {
+        "embedding": [score_hint],  # mock will use actual cosine_similarity override
+        "title": title,
+        "excerpt": f"Excerpt for {title}",
+        "confidence": 0.85,
+        "path": path,
+    }
+
+
+class TestQueryMinScore:
+    def _run_query(self, tmp_path, monkeypatch, index_data: dict, args: list[str]):
+        index_path = tmp_path / ".memex" / "index.json"
+        index_path.parent.mkdir(parents=True, exist_ok=True)
+        index_path.write_text(json.dumps(index_data))
+        monkeypatch.chdir(tmp_path)
+
+        # Mock embed to return a fixed vector, mock cosine_similarity to return
+        # the embedding value directly as the score
+        def mock_embed(texts):
+            return [[0.5] * 1] * len(texts)
+
+        def mock_cosine(a, b):
+            return b[0]  # score encoded in the entry's embedding
+
+        with patch("memex.cli.embed", mock_embed), \
+             patch("memex.cli.cosine_similarity", mock_cosine):
+            runner = CliRunner()
+            result = runner.invoke(cli, ["query"] + args)
+        return result
+
+    def test_filters_result_below_min_score(self, tmp_path, monkeypatch):
+        index_data = {
+            "k/a.md": {**_make_index_entry("Irrelevant result", 0.0), "embedding": [0.60]},
+        }
+        result = self._run_query(tmp_path, monkeypatch, index_data,
+                                 ["some", "query", "--min-score", "0.70"])
+        assert "No relevant results found" in result.output
+
+    def test_shows_result_above_min_score(self, tmp_path, monkeypatch):
+        index_data = {
+            "k/a.md": {**_make_index_entry("Good result", 0.0), "embedding": [0.80]},
+        }
+        result = self._run_query(tmp_path, monkeypatch, index_data,
+                                 ["some", "query", "--min-score", "0.70"])
+        assert "Good result" in result.output
+
+    def test_no_results_message_contains_threshold(self, tmp_path, monkeypatch):
+        index_data = {
+            "k/a.md": {**_make_index_entry("Low match", 0.0), "embedding": [0.50]},
+        }
+        result = self._run_query(tmp_path, monkeypatch, index_data,
+                                 ["test", "--min-score", "0.80"])
+        assert "0.80" in result.output
+
+    def test_no_results_suggests_lower_score(self, tmp_path, monkeypatch):
+        index_data = {
+            "k/a.md": {**_make_index_entry("Low match", 0.0), "embedding": [0.50]},
+        }
+        result = self._run_query(tmp_path, monkeypatch, index_data,
+                                 ["test", "--min-score", "0.80"])
+        assert "--min-score 0.6" in result.output
+
+    def test_top_option_respected_after_filter(self, tmp_path, monkeypatch):
+        index_data = {f"k/{i}.md": {**_make_index_entry(f"Result {i}", 0.0), "embedding": [0.90]}
+                      for i in range(5)}
+        result = self._run_query(tmp_path, monkeypatch, index_data,
+                                 ["test", "--min-score", "0.0", "--top", "2"])
+        # Only 2 results should be shown
+        assert result.output.count("#1") == 1
+        assert result.output.count("#2") == 1
+        assert "#3" not in result.output
+
+    def test_default_min_score_is_070(self, tmp_path, monkeypatch):
+        # Result at 0.69 should be hidden by default
+        index_data = {
+            "k/a.md": {**_make_index_entry("Borderline", 0.0), "embedding": [0.69]},
+        }
+        result = self._run_query(tmp_path, monkeypatch, index_data, ["test"])
+        assert "No relevant results found" in result.output
+
+
+# ---------------------------------------------------------------------------
+# query command — --expand
+# ---------------------------------------------------------------------------
+
+class TestQueryExpand:
+    def _run_query_with_expand(self, tmp_path, monkeypatch, expanded_text: str, args: list[str]):
+        index_path = tmp_path / ".memex" / "index.json"
+        index_path.parent.mkdir(parents=True, exist_ok=True)
+        index_data = {
+            "k/a.md": {**_make_index_entry("Some result", 0.0), "embedding": [0.80]},
+        }
+        index_path.write_text(json.dumps(index_data))
+        monkeypatch.chdir(tmp_path)
+
+        mock_message = MagicMock()
+        mock_message.content = [MagicMock(text=expanded_text)]
+        mock_client = MagicMock()
+        mock_client.messages.create.return_value = mock_message
+
+        captured_embed_args = []
+
+        def mock_embed(texts):
+            captured_embed_args.extend(texts)
+            return [[0.5]] * len(texts)
+
+        def mock_cosine(a, b):
+            return b[0]
+
+        with patch("memex.cli.embed", mock_embed), \
+             patch("memex.cli.cosine_similarity", mock_cosine), \
+             patch("memex.cli._anthropic_client", return_value=mock_client):
+            runner = CliRunner()
+            result = runner.invoke(cli, ["query"] + args)
+
+        return result, mock_client, captured_embed_args
+
+    def test_expand_calls_haiku(self, tmp_path, monkeypatch):
+        _, mock_client, _ = self._run_query_with_expand(
+            tmp_path, monkeypatch, "expanded terms", ["test query", "--expand"]
+        )
+        call_kwargs = mock_client.messages.create.call_args
+        assert call_kwargs.kwargs["model"] == "claude-haiku-4-5"
+
+    def test_expand_passes_expanded_text_to_embed(self, tmp_path, monkeypatch):
+        _, _, captured = self._run_query_with_expand(
+            tmp_path, monkeypatch, "rich expanded search terms", ["original query", "--expand"]
+        )
+        assert captured[0] == "rich expanded search terms"
+
+    def test_expand_fallback_on_empty_response(self, tmp_path, monkeypatch):
+        _, _, captured = self._run_query_with_expand(
+            tmp_path, monkeypatch, "", ["original query", "--expand"]
+        )
+        assert captured[0] == "original query"
+
+    def test_no_expand_skips_anthropic_client(self, tmp_path, monkeypatch):
+        index_path = tmp_path / ".memex" / "index.json"
+        index_path.parent.mkdir(parents=True, exist_ok=True)
+        index_data = {
+            "k/a.md": {**_make_index_entry("Result", 0.0), "embedding": [0.80]},
+        }
+        index_path.write_text(json.dumps(index_data))
+        monkeypatch.chdir(tmp_path)
+
+        with patch("memex.cli.embed", return_value=[[0.5]]), \
+             patch("memex.cli.cosine_similarity", lambda a, b: b[0]), \
+             patch("memex.cli._anthropic_client") as mock_factory:
+            CliRunner().invoke(cli, ["query", "test"])
+
+        mock_factory.assert_not_called()
diff --git a/tests/test_extractor.py b/tests/test_extractor.py
new file mode 100644
index 0000000..df79556
--- /dev/null
+++ b/tests/test_extractor.py
@@ -0,0 +1,256 @@
+"""Unit tests for structural detection and extractor interfaces."""
+import pytest
+from memex.structural import (
+    categorize_file,
+    is_structural_change,
+    build_changed_files_section,
+)
+from memex.extractor import is_low_signal, build_prompt
+
+
+# ---------------------------------------------------------------------------
+# categorize_file
+# ---------------------------------------------------------------------------
+
+class TestCategorizeFile:
+    def test_alembic_migration(self):
+        assert categorize_file("alembic/versions/001_add_users.py") == "migration"
+
+    def test_generic_migrations_dir(self):
+        assert categorize_file("migrations/0001_initial.py") == "migration"
+
+    def test_nested_migrations_dir(self):
+        assert categorize_file("app/migrations/0002_add_column.py") == "migration"
+
+    def test_rails_migration_naming(self):
+        assert categorize_file("db/migrate/20240101120000_create_users.rb") == "migration"
+
+    def test_terraform_file(self):
+        assert categorize_file("infra/main.tf") == "infra"
+
+    def test_terraform_in_root(self):
+        assert categorize_file("main.tf") == "infra"
+
+    def test_k8s_yaml(self):
+        assert categorize_file("k8s/deployment.yaml") == "infra"
+
+    def test_kubernetes_dir(self):
+        assert categorize_file("kubernetes/service.yml") == "infra"
+
+    def test_helm_values(self):
+        assert categorize_file("helm/values.yaml") == "infra"
+
+    def test_charts_dir(self):
+        assert categorize_file("charts/myapp/templates/ingress.yaml") == "infra"
+
+    def test_openapi_yaml(self):
+        assert categorize_file("openapi.yaml") == "schema"
+
+    def test_openapi_json(self):
+        assert categorize_file("api/openapi.json") == "schema"
+
+    def test_swagger_yaml(self):
+        assert categorize_file("swagger.yaml") == "schema"
+
+    def test_schema_json(self):
+        assert categorize_file("schema.json") == "schema"
+
+    def test_graphql_schema(self):
+        assert categorize_file("schema.graphql") == "schema"
+
+    def test_graphql_file(self):
+        assert categorize_file("api/queries.graphql") == "schema"
+
+    def test_proto_file(self):
+        assert categorize_file("api/service.proto") == "schema"
+
+    def test_dockerfile(self):
+        assert categorize_file("Dockerfile") == "deployment"
+
+    def test_dockerfile_variant(self):
+        assert categorize_file("Dockerfile.prod") == "deployment"
+
+    def test_docker_compose(self):
+        assert categorize_file("docker-compose.yml") == "deployment"
+
+    def test_docker_compose_prod(self):
+        assert categorize_file("docker-compose.prod.yml") == "deployment"
+
+    def test_regular_python_file(self):
+        assert categorize_file("src/app/models.py") is None
+
+    def test_test_file(self):
+        assert categorize_file("tests/test_auth.py") is None
+
+    def test_readme(self):
+        assert categorize_file("README.md") is None
+
+    def test_plain_yaml_in_root_not_infra(self):
+        # Raw .yaml without an infra directory prefix must NOT match
+        assert categorize_file("config.yaml") is None
+
+    def test_plain_yml_in_root_not_infra(self):
+        assert categorize_file("renovate.yml") is None
+
+    def test_case_insensitive_migration(self):
+        assert categorize_file("Migrations/001_init.py") == "migration"
+
+    def test_case_insensitive_terraform(self):
+        assert categorize_file("Terraform/main.tf") == "infra"
+
+
+# ---------------------------------------------------------------------------
+# is_structural_change
+# ---------------------------------------------------------------------------
+
+class TestIsStructuralChange:
+    def test_empty_list_is_false(self):
+        assert not is_structural_change([])
+
+    def test_migration_file_is_structural(self):
+        assert is_structural_change(["migrations/001_add_sessions.py", "app/models.py"])
+
+    def test_terraform_is_structural(self):
+        assert is_structural_change(["terraform/main.tf", "README.md"])
+
+    def test_schema_only_is_structural(self):
+        assert is_structural_change(["openapi.yaml"])
+
+    def test_deployment_file_is_structural(self):
+        assert is_structural_change(["Dockerfile", "src/app.py"])
+
+    def test_regular_files_not_structural(self):
+        assert not is_structural_change(["src/auth.py", "tests/test_auth.py", "README.md"])
+
+    def test_plain_yaml_not_structural(self):
+        assert not is_structural_change(["config.yaml", "src/app.py"])
+
+    def test_mixed_structural_and_regular(self):
+        # One structural file is enough
+        assert is_structural_change(["src/app.py", "migrations/001.py", "tests/test_app.py"])
+
+
+# ---------------------------------------------------------------------------
+# is_low_signal — with changed_files
+# ---------------------------------------------------------------------------
+
+class TestIsLowSignalWithFiles:
+    def test_backward_compatible_no_changed_files(self):
+        # Existing callers pass no changed_files — must still work
+        assert is_low_signal("chore: update readme", "")
+
+    def test_structural_files_override_title_filter(self):
+        # "chore: add sessions table" + empty body → normally low-signal by title
+        # but the migration file should rescue it
+        assert not is_low_signal(
+            "chore: add sessions table",
+            "",
+            changed_files=["migrations/001_add_sessions.py"],
+        )
+
+    def test_dep_bump_always_low_signal_even_with_structural_files(self):
+        # Always-low pattern wins — alembic migration doesn't rescue a dep-bump title
+        assert is_low_signal(
+            "bump alembic from 1.0 to 2.0",
+            "",
+            changed_files=["alembic/versions/migration.py"],
+        )
+
+    def test_structural_files_with_good_body_already_not_low_signal(self):
+        # Long body already passes — structural override is redundant but harmless
+        assert not is_low_signal(
+            "chore: schema migration",
+            "A" * 100,
+            changed_files=None,
+        )
+
+    def test_no_structural_files_low_signal_title_applies(self):
+        # Without structural files, the existing heuristic filters still work
+        assert is_low_signal("chore: tidy up imports", "", changed_files=["src/utils.py"])
+
+    def test_infra_files_override_low_signal(self):
+        assert not is_low_signal(
+            "fix: update terraform config",
+            "",
+            changed_files=["terraform/main.tf"],
+        )
+
+    def test_dockerfile_overrides_low_signal(self):
+        assert not is_low_signal(
+            "ci: update dockerfile",
+            "",
+            changed_files=["Dockerfile"],
+        )
+
+
+# ---------------------------------------------------------------------------
+# build_changed_files_section
+# ---------------------------------------------------------------------------
+
+class TestBuildChangedFilesSection:
+    def test_none_returns_no_file_list(self):
+        section = build_changed_files_section(None)
+        assert "No file list available" in section
+
+    def test_empty_list_returns_no_file_list(self):
+        section = build_changed_files_section([])
+        assert "No file list available" in section
+
+    def test_migration_files_listed(self):
+        section = build_changed_files_section(["migrations/001_add_sessions.py"])
+        assert "migrations/001_add_sessions.py" in section
+        assert "Database migrations" in section
+
+    def test_infra_files_listed(self):
+        section = build_changed_files_section(["terraform/main.tf"])
+        assert "terraform/main.tf" in section
+        assert "Infrastructure / IaC" in section
+
+    def test_schema_files_listed(self):
+        section = build_changed_files_section(["openapi.yaml"])
+        assert "openapi.yaml" in section
+        assert "API / Schema definitions" in section
+
+    def test_non_structural_summarized_not_listed(self):
+        files = [f"src/module_{i}.py" for i in range(5)]
+        section = build_changed_files_section(files)
+        # Individual paths should NOT appear
+        for f in files:
+            assert f not in section
+        assert "5" in section  # count is shown
+
+    def test_mixed_structural_and_regular(self):
+        files = ["migrations/001.py", "src/app.py", "src/models.py"]
+        section = build_changed_files_section(files)
+        assert "migrations/001.py" in section
+        assert "Database migrations" in section
+        assert "2" in section  # 2 other files summarized
+
+
+# ---------------------------------------------------------------------------
+# build_prompt — changed_files integration
+# ---------------------------------------------------------------------------
+
+class TestBuildPromptWithFiles:
+    def test_no_changed_files_shows_unavailable(self):
+        prompt = build_prompt("Fix bug", "body", [])
+        assert "No file list available" in prompt
+
+    def test_migration_files_appear_in_prompt(self):
+        prompt = build_prompt(
+            "Add sessions table", "", [],
+            changed_files=["migrations/001_add_sessions.py"],
+        )
+        assert "migrations/001_add_sessions.py" in prompt
+        assert "Database migrations" in prompt
+
+    def test_structural_instruction_in_prompt(self):
+        prompt = build_prompt("Add sessions table", "", [],
+                              changed_files=["migrations/001_add_sessions.py"])
+        assert "migrations, infrastructure" in prompt
+
+    def test_backward_compatible_no_changed_files_param(self):
+        # Old call signature: build_prompt(title, body, comments)
+        prompt = build_prompt("Fix bug", "body", ["a review comment"])
+        assert "a review comment" in prompt
+        assert "No file list available" in prompt
diff --git a/tests/test_update.py b/tests/test_update.py
index 971222c..c0c23ba 100644
--- a/tests/test_update.py
+++ b/tests/test_update.py
@@ -16,6 +16,7 @@
     detect_repo,
     git_diff,
     git_files_changed,
+    git_changed_file_paths,
     git_log_since,
     load_state,
     save_state,
@@ -305,6 +306,7 @@ def _make_pr_commit(pr_number=42):
     "author": {"login": "alice"},
     "url": "https://github.com/acme/repo/pull/42",
     "reviews": [],
+    "changed_files": [],
 }
 
 
@@ -393,7 +395,8 @@ def _make_direct_commit():
 
 
 def _call_process_direct(commit, indexed_sources=None, n_files=1,
-                          diff="some diff", is_low=False, extraction=None):
+                          diff="some diff", is_low=False, extraction=None,
+                          changed_file_paths=None):
     result = UpdateResult()
     if indexed_sources is None:
         indexed_sources = set()
@@ -403,7 +406,8 @@ def _call_process_direct(commit, indexed_sources=None, n_files=1,
     mock_write = MagicMock(return_value=Path("knowledge/decisions/foo.md"))
 
     with patch("memex.update.git_files_changed", return_value=n_files), \
-         patch("memex.update.git_diff", return_value=diff):
+         patch("memex.update.git_diff", return_value=diff), \
+         patch("memex.update.git_changed_file_paths", return_value=changed_file_paths or []):
         _process_direct_commit(
             commit=commit,
             sha_short=commit.sha[:8],
@@ -429,14 +433,37 @@ def test_process_direct_commit_already_indexed():
 
 def test_process_direct_commit_stat_filter():
     commit = _make_direct_commit()
+    # Non-structural files — stat filter applies
+    non_structural = [f"src/module_{i}.py" for i in range(MAX_FILES_CHANGED + 1)]
     result, mock_extract, mock_write = _call_process_direct(
-        commit, n_files=MAX_FILES_CHANGED + 1
+        commit, n_files=MAX_FILES_CHANGED + 1, changed_file_paths=non_structural
     )
     assert result.skipped_stat_filter == 1
     mock_extract.assert_not_called()
     mock_write.assert_not_called()
 
 
+def test_process_direct_commit_stat_filter_bypassed_for_structural():
+    commit = _make_direct_commit()
+    # Structural migration file rescues commit from stat filter
+    structural = ["migrations/001_add_sessions.py"] + [f"src/m_{i}.py" for i in range(12)]
+    result, mock_extract, _ = _call_process_direct(
+        commit, n_files=MAX_FILES_CHANGED + 3, changed_file_paths=structural
+    )
+    assert result.skipped_stat_filter == 0
+    mock_extract.assert_called_once()
+
+
+def test_process_direct_commit_stat_filter_non_structural_large():
+    commit = _make_direct_commit()
+    non_structural = [f"src/module_{i}.py" for i in range(20)]
+    result, mock_extract, _ = _call_process_direct(
+        commit, n_files=MAX_FILES_CHANGED + 10, changed_file_paths=non_structural
+    )
+    assert result.skipped_stat_filter == 1
+    mock_extract.assert_not_called()
+
+
 def test_process_direct_commit_empty_diff():
     commit = _make_direct_commit()
     result, mock_extract, mock_write = _call_process_direct(commit, diff="")