diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 0000000..4a3391e --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,15 @@ +{ + "hooks": { + "PostToolUse": [ + { + "matcher": "Edit|Write", + "hooks": [ + { + "type": "command", + "command": "python3 scripts/check_docs.py" + } + ] + } + ] + } +} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..7c73e1e --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,17 @@ +name: Lint + +on: + pull_request: + push: + branches: [master, main] + +jobs: + check-docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Check CLAUDE.md is in sync + run: python3 scripts/check_docs.py diff --git a/CLAUDE.md b/CLAUDE.md index 180e120..a297f1d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -45,11 +45,14 @@ memex/ ├── memex/ │ ├── __init__.py │ ├── schema.py # Pydantic models — KnowledgeRecord, ExtractionResult -│ ├── extractor.py # LLM extraction pipeline (LiteLLM + Instructor) +│ ├── extractor.py # LLM extraction pipeline (anthropic + Instructor) +│ ├── structural.py # Structural file detection — categorize_file, is_structural_change (no LLM deps) │ ├── writer.py # Renders KnowledgeRecord to .md and commits it │ ├── action.py # GitHub Action entry point — reads env vars, orchestrates │ ├── adr.py # ADR parser — find_adr_files, parse_adr, index_adrs -│ ├── cli.py # Click CLI — `memex index` and `memex query` +│ ├── cli.py # Click CLI — `memex configure/init/update/index/query` +│ ├── config.py # API key resolution — load_api_key, save_api_key, CONFIG_FILE +│ ├── nudge.py # Low-confidence nudge comment — should_nudge, post_nudge_comment │ ├── init.py # `memex init` — bootstrap from repo scan │ └── update.py # `memex update` — incremental extraction from git history ├── tests/ @@ -79,7 +82,7 @@ The index cache lives at: |---|---|---| | Language | Python 3.12+ | Best LLM ecosystem | | LLM — extraction | `claude-sonnet-4-6` via `anthropic` SDK | Best structured output quality | -| LLM — embeddings | `voyage-3-lite` via `anthropic` SDK | Same SDK, same API key, no OpenAI account | +| LLM — embeddings | `fastembed` (`BAAI/bge-small-en-v1.5`) | Local, no API key required, no data leaves machine during queries | | Structured output | `instructor` + `pydantic` | Guaranteed schema compliance, auto-retry | | Vector search | `numpy` cosine similarity over `index.json` | No database needed at MVP scale (<5k records) | | CLI | `click` | Standard, simple | @@ -203,12 +206,27 @@ without updating the indexer and CLI accordingly. ## CLI behaviour ```bash -memex index # embed all .md files in knowledge/, write to .memex/index.json +memex configure # store ANTHROPIC_API_KEY to ~/.memex/config.json (prompts interactively) +memex init # bootstrap: scan repo for ADRs + extract from recent git history +memex update # incremental: extract from git history since last run +memex index # embed all .md files in knowledge/, write to .memex/index.json memex query "why did we move off MongoDB" # cosine similarity search, top 3 results +memex query --min-score 0.5 "..." # broaden search by lowering the relevance threshold +memex query --expand "vague question" # rewrite query via Claude Haiku before embedding ``` -`memex index` should be incremental — only embed files not already in `index.json`. -Do not re-embed records that haven't changed. +`memex index` should be incremental — only embed files whose content has changed since +the last run. Change detection uses a SHA256 hash of the cleaned embed text (title + +context + decision + alternatives + constraints, with YAML frontmatter and markdown +noise stripped). The hash is stored as `content_hash` in each index entry. Entries +without a `content_hash` (legacy entries) are always re-embedded. + +`memex query` options: +- `--top N` — show top N results (default 3) +- `--min-score F` — hide results below this similarity threshold (default 0.70); shows + a "no relevant results" message with a suggested lower threshold when nothing passes +- `--expand` — opt-in: calls Claude Haiku to rewrite the query into richer search + phrases before embedding; useful for short or vague queries `memex query` output format: ``` @@ -258,6 +276,25 @@ of the PR's knowledge record. --- +## Doc sync rules + +`scripts/check_docs.py` runs automatically after every file edit (via `.claude/settings.json` +hook) and on every PR (via `.github/workflows/lint.yml`). It will fail loudly if CLAUDE.md +drifts from the code. + +When you make any of the changes below, update CLAUDE.md **in the same commit**: + +| What changed | What to update in CLAUDE.md | +|---|---| +| New/removed/renamed `.py` in `memex/` | File structure section | +| New/removed `@cli.command()` in `cli.py` | CLI behaviour section | +| `model=` string in `extractor.py` or `init.py` | Tech stack table + decisions section | +| New dependency in `pyproject.toml` | Tech stack table | +| New `os.environ["VAR"]` in `action.py` | Environment variables table | +| New frontmatter field in `writer.py` | Markdown output format section | + +--- + ## Agent rules — cross-cutting concerns Before implementing any feature that touches knowledge record creation, extraction logic, @@ -303,7 +340,7 @@ is harder to debug than a slightly larger PR. | Variable | Source | Description | |---|---|---| -| `ANTHROPIC_API_KEY` | GitHub Secret | Required. Anthropic API key for Claude + Voyage | +| `ANTHROPIC_API_KEY` | GitHub Secret | Required. Anthropic API key for Claude extraction | | `PR_TITLE` | `github.event.pull_request.title` | PR title | | `PR_BODY` | `github.event.pull_request.body` | PR description | | `PR_URL` | `github.event.pull_request.html_url` | Full URL to PR | @@ -324,11 +361,15 @@ is harder to debug than a slightly larger PR. - Do not make real LLM calls in tests — mock the `instructor` client - Use `pytest` and `pytest-mock` +**Running tests — requires Python 3.12+** (the codebase uses `|` union syntax and other 3.10+ features): + ```bash -pip install -e ".[dev]" -pytest tests/ -v +python3 -m pytest tests/ -v ``` +No virtualenv or `pip install` needed — dependencies are already installed globally on this machine. +To run a single file: `python3 -m pytest tests/test_action.py -v` + --- ## Running tests — agent instructions @@ -338,12 +379,12 @@ declaring the task complete. Use `pytest` with `-v` for readable output. | Module changed | Command | |---|---| -| `memex/init.py` | `pytest tests/test_init.py -v` | -| `memex/update.py` | `pytest tests/test_update.py -v` | -| `memex/action.py` | `pytest tests/test_action.py tests/test_nudge.py -v` | -| `memex/nudge.py` | `pytest tests/test_nudge.py -v` | -| `memex/extractor.py` or `memex/writer.py` | `pytest tests/ -v` | -| Any other change | `pytest tests/ -v` | +| `memex/init.py` | `python3 -m pytest tests/test_init.py -v` | +| `memex/update.py` | `python3 -m pytest tests/test_update.py -v` | +| `memex/action.py` | `python3 -m pytest tests/test_action.py tests/test_nudge.py -v` | +| `memex/nudge.py` | `python3 -m pytest tests/test_nudge.py -v` | +| `memex/extractor.py` or `memex/writer.py` | `python3 -m pytest tests/ -v` | +| Any other change | `python3 -m pytest tests/ -v` | Always run at minimum the tests for the module you changed. Run `pytest tests/ -v` if your change touches multiple modules or has cross-cutting effects. @@ -401,18 +442,14 @@ explain why. Do not silently make a different choice. --- -## What to build next (in order) - -If you are picking up this project fresh, work in this sequence: +## Current state -1. `memex/schema.py` — define `KnowledgeRecord` and `ExtractionResult` -2. `memex/extractor.py` — `is_low_signal()` + `extract()` with Instructor -3. `memex/writer.py` — `render_markdown()` + `write_record()` -4. `memex/action.py` — wire everything together, handle env vars and nudge comment -5. `.github/workflows/memex.yml` — the Action definition -6. `memex/cli.py` — `memex index` and `memex query` -7. `tests/` — unit tests for each module with mocked LLM calls -8. `memex/adr.py` — ADR parser; wire into `init`, `index --include-adrs`, and `action.py` ✅ -9. `README.md` — installation instructions, one-minute quickstart +The MVP (Phase 1) is fully implemented and tested. All modules exist and all four core +features are working: GitHub Action, CLI, ADR parser, and low-confidence nudge. -Do not start step N+1 until step N has tests passing. \ No newline at end of file +**Phase 2 work** (not yet started — requires explicit instruction before any of this is built): +- Web UI / dashboard +- Cross-repo search +- Slack integration +- Cloud backend / hosted service +- Enterprise features (SSO, audit log, etc.) \ No newline at end of file diff --git a/README.md b/README.md index 92d7a05..4b6a1d4 100644 --- a/README.md +++ b/README.md @@ -10,12 +10,13 @@ $ memex query "why did we move off MongoDB" Results for: why did we move off MongoDB ────────────────────────────────────────────────────────────────────── - 1. Migrate billing store to PostgreSQL [0.91] ● - Unbounded schema flexibility was causing silent data corruption - in the billing pipeline. MongoDB's lack of enforced schema... - knowledge/decisions/2024-11-14-migrate-billing-store-to-postgresql.md + #1 Migrate billing store to PostgreSQL [0.91] + ✅ High confidence + Unbounded schema flexibility was causing silent data corruption + in the billing pipeline. — Migrated billing store to PostgreSQL. + knowledge/decisions/2024-11-14-migrate-billing-store-to-postgresql.md - 2. ... + #2 ... ``` --- @@ -80,8 +81,10 @@ git push ### 5. Index and query ```bash -memex index # embed all knowledge files (incremental — skips unchanged files) +memex index # embed all knowledge files (incremental — only re-embeds changed files) memex query "why did we switch from SQS to Redis" +memex query --min-score 0.5 "why did we switch from SQS to Redis" # broaden results +memex query --expand "why did we switch from SQS to Redis" # AI-expanded search ``` --- @@ -270,10 +273,64 @@ memex index Embed knowledge files and write vectors to .memex/ind --include-adrs Parse ADR files before embedding (safe to run repeatedly) memex query QUESTION Semantic search over indexed knowledge --top N Return top N results (default: 3) + --min-score F Hide results below this similarity score (default: 0.70) + --expand Rewrite query via Claude Haiku before searching ``` --- +## Querying your knowledge + +`memex query` runs a local semantic search — no data leaves your machine. + +```bash +memex query "why did we move off MongoDB" +``` + +Each result shows a similarity score `[0.00–1.00]`, a confidence badge, the decision excerpt, and the file path. + +### Result relevance + +By default, results below a **0.70 similarity score** are hidden. This threshold exists to avoid surfacing clearly unrelated records. If you get "no relevant results found", you have two options: + +**Lower the threshold** — cast a wider net: +```bash +memex query --min-score 0.5 "why did we move off MongoDB" +``` + +**Expand the query** — let Claude Haiku rephrase your question into richer search terms before embedding: +```bash +memex query --expand "how did we store v1 projects" +``` + +`--expand` is useful when your query is short or vague and you're not getting results you expect. It adds ~1 second of latency (one Haiku API call) and requires your `ANTHROPIC_API_KEY` to be set. + +Example of what expansion does: +``` +Input: "how did we store v1 projects" + +Expanded: how did we store v1 projects, v1 project storage architecture, + legacy v1 project storage system, v1 project data persistence + implementation, historical v1 project storage format, + v1 project repository structure +``` + +The expanded string is embedded as a single query — the broader vocabulary increases the chance of matching records that describe the same concept using different words. + +### Rationale badges in results + +Each result shows a **rationale badge** — a measure of how well-documented the original decision was in its source PR or ADR. This is independent of the similarity score (how well it matched your query). + +| Badge | Meaning | +|---|---| +| `✅ Rationale: well-documented` | Clear reasoning captured — safe to rely on | +| `💡 Rationale: partial` | Some reasoning present — verify if critical | +| `⚠️ Rationale: limited` | Little reasoning in source — treat as a hint, not a fact | + +A record can have a high similarity score (very relevant to your query) but limited rationale (the original PR didn't explain why). These two dimensions are independent. + +--- + ## How extraction works Memex uses [Claude](https://www.anthropic.com/claude) (`claude-sonnet-4-6`) with [Instructor](https://github.com/jxnl/instructor) for structured extraction, guaranteeing schema compliance with automatic retries. diff --git a/knowledge/decisions/2026-04-04-1058-adr0001-use-of-fastembed.md b/knowledge/decisions/2026-04-04-1058-adr0001-use-of-fastembed.md index aa0ff2c..57165b3 100644 --- a/knowledge/decisions/2026-04-04-1058-adr0001-use-of-fastembed.md +++ b/knowledge/decisions/2026-04-04-1058-adr0001-use-of-fastembed.md @@ -1,5 +1,5 @@ --- -title: "ADR-0001: Use of fastembed" +title: "ADR-0001: Use of fastembed with BAAI/bge-small-en-v1.5 for local embeddings" date: 2026-04-04 author: "adr" source: "docs/adr/0001-use-fastembed.md" @@ -8,28 +8,49 @@ confidence: 0.85 tags: ["adr"] --- -# ADR-0001: Use of fastembed +# ADR-0001: Use of fastembed with BAAI/bge-small-en-v1.5 for local embeddings ## Context -We needed to create embedings to query the knowledge +Memex needs to embed knowledge records so users can run semantic search (`memex query`) +locally. The embedding solution must not require a second API key, must not send document +content to an external service, and must work in a CLI context where startup time and +install size matter. Knowledge records contain potentially sensitive internal engineering +decisions, so data privacy is a hard constraint. ## Decision -We will use fastembed. because of pricing and +Use fastembed with the `BAAI/bge-small-en-v1.5` model for all embedding operations. +fastembed runs the model locally in ONNX format (CPU only, no GPU required), downloads +the model once (~130 MB), and returns 384-dimensional vectors. No data leaves the machine +during indexing or querying. ## Alternatives considered -voyage.ai, but is from the mongoDB ecosystem, and I privilege relational databases in general -chroma db +- **VoyageAI** (`voyage-3` and similar) — ruled out because embeddings are computed on + VoyageAI's servers, meaning every `memex index` and `memex query` call sends document + content to an external API. This is a privacy concern for internal engineering knowledge. + Also charges per token. Quality is generally higher (larger dimensional space, better + benchmarks) but the trade-off was not acceptable given the privacy and cost constraints. +- **sentence-transformers** — runs locally but requires PyTorch as a dependency, which + significantly increases install size and startup time. fastembed uses ONNX format instead, + which is faster on CPU and has minimal dependencies. +- **ChromaDB / hosted vector databases** — introduce an external persistence layer and + server dependency, contradicting the local-first, no-database design principle. ## Constraints -- I must learn more about embedings +- No API key should be required for semantic search — only extraction (Claude) needs one +- Internal engineering decisions must never be sent to external services during querying +- Must work on CPU without GPU +- Install footprint should stay small for CLI use ## Revisit signals -_None_ +- If result quality becomes a bottleneck at scale, VoyageAI or a larger BGE variant + (e.g. `BAAI/bge-large-en-v1.5`, 1024 dimensions) could be evaluated +- If the corpus grows beyond ~5k records, brute-force cosine similarity may need replacing + with an approximate nearest-neighbour index (e.g. FAISS) --- diff --git a/memex/action.py b/memex/action.py index 9b1d4dc..99802f9 100644 --- a/memex/action.py +++ b/memex/action.py @@ -9,6 +9,7 @@ import subprocess from pathlib import Path from .extractor import extract, confidence_level +from .structural import categorize_file from .writer import write_record from .schema import ConfidenceLevel from .nudge import should_nudge, has_nudge_comment, post_nudge_comment, is_bot_comment @@ -18,7 +19,10 @@ def get_review_comments(pr_number: str, repo: str) -> list[str]: - """Fetch review comments via GitHub CLI.""" + """Fetch review top-level bodies and inline review comments via GitHub CLI.""" + comments: list[str] = [] + + # Top-level review bodies (submitted via "Review changes") try: result = subprocess.run( ["gh", "pr", "view", pr_number, "--repo", repo, @@ -26,10 +30,35 @@ def get_review_comments(pr_number: str, repo: str) -> list[str]: capture_output=True, text=True, timeout=15 ) if result.returncode == 0: - return json.loads(result.stdout or "[]") + comments += json.loads(result.stdout or "[]") except Exception: pass - return [] + + # Inline review comments (line-level comments on code) + try: + result = subprocess.run( + ["gh", "api", f"repos/{repo}/pulls/{pr_number}/comments", + "--jq", "[.[].body]"], + capture_output=True, text=True, timeout=15 + ) + if result.returncode == 0: + comments += json.loads(result.stdout or "[]") + except Exception: + pass + + # General PR comments (issue-level comments on the PR thread) + try: + result = subprocess.run( + ["gh", "api", f"repos/{repo}/issues/{pr_number}/comments", + "--jq", "[.[].body]"], + capture_output=True, text=True, timeout=15 + ) + if result.returncode == 0: + comments += json.loads(result.stdout or "[]") + except Exception: + pass + + return [c for c in comments if c] def get_changed_files(pr_number: str, repo: str) -> list[str]: @@ -115,7 +144,7 @@ def handle_pr_merge() -> None: print(f"ADR record written: {adr_path}") # Run extraction on the PR itself - result = extract(pr_title, pr_body, review_comments) + result = extract(pr_title, pr_body, review_comments, changed_files=changed_files) if result is None: print("Low-signal PR — skipped.") @@ -140,6 +169,9 @@ def handle_pr_merge() -> None: all_text = pr_body + " " + " ".join(review_comments) related = find_related_adrs(all_text) or None + # Derive structural tags from changed files (e.g. ["migration", "schema"]) + structural_tags = sorted({categorize_file(f) for f in changed_files if categorize_file(f)}) or None + # Write the knowledge record path = write_record( record=result.record, @@ -148,6 +180,7 @@ def handle_pr_merge() -> None: pr_number=int(pr_number), repo=repo, related=related, + tags=structural_tags, ) print(f"Knowledge record written: {path} (confidence {result.record.confidence:.2f} — {level.value})") diff --git a/memex/cli.py b/memex/cli.py index 1c0deff..283bd9d 100644 --- a/memex/cli.py +++ b/memex/cli.py @@ -67,14 +67,70 @@ def extract_confidence(content: str) -> float: return 1.0 +def _extract_md_section(content: str, header: str) -> str: + """Extract the body of a named ## section from markdown content.""" + import re + pattern = rf"##\s+{re.escape(header)}\s*\n(.*?)(?=\n##\s|\n---|\Z)" + m = re.search(pattern, content, re.DOTALL | re.IGNORECASE) + return m.group(1).strip() if m else "" + + +def build_embed_text(content: str) -> str: + """Build a clean semantic string for embedding — strips YAML, warnings, and markdown noise.""" + import re + + def _bullets(text: str) -> list[str]: + items = [] + for line in text.splitlines(): + m = re.match(r"^\s*[-*]\s+(.+)", line) + if m: + item = m.group(1).strip() + if not item.startswith("_"): # skip "_None recorded_" placeholders + items.append(item) + return items + + title = extract_title(content) + context = _extract_md_section(content, "Context") + decision = _extract_md_section(content, "Decision") + alts = _bullets(_extract_md_section(content, "Alternatives considered")) + constraints = _bullets(_extract_md_section(content, "Constraints")) + + parts = [p for p in [title, context, decision] if p] + if alts: + parts.append("Alternatives: " + "; ".join(alts)) + if constraints: + parts.append("Constraints: " + "; ".join(constraints)) + + return "\n".join(parts) if parts else content + + def extract_excerpt(content: str) -> str: - """Pull first substantive paragraph after the frontmatter.""" + """Pull a human-readable preview from the Context and Decision sections.""" + context = _extract_md_section(content, "Context") + decision = _extract_md_section(content, "Decision") + + if context or decision: + if context: + # Truncate at a word boundary so we don't cut mid-word + if len(context) > 300: + context = context[:300].rsplit(None, 1)[0] + "…" + excerpt = context + if decision: + separator = " — " if excerpt else "" + excerpt += separator + decision # decisions are one sentence — show in full + return excerpt + + # Fallback: first non-blank, non-heading, non-blockquote line after frontmatter in_frontmatter = False + fm_count = 0 for line in content.splitlines(): - if line == "---": - in_frontmatter = not in_frontmatter + if line.strip() == "---": + fm_count += 1 + in_frontmatter = fm_count < 2 + continue + if in_frontmatter: continue - if not in_frontmatter and line.strip() and not line.startswith("#"): + if line.strip() and not line.startswith("#") and not line.startswith(">"): return line[:400] return "" @@ -271,6 +327,8 @@ def index(force, include_adrs): click.echo("No knowledge records found. Is the GitHub Action installed?") return + import hashlib + existing = load_index() if not force else {} disk_paths = {str(r) for r in records} @@ -280,23 +338,32 @@ def index(force, include_adrs): if stale: click.echo(f"Removed {len(stale)} deleted record(s) from index.") - new_records = [r for r in records if str(r) not in existing] - - if not new_records: + to_index = [] + for r in records: + content = r.read_text() + embed_text = build_embed_text(content) + h = hashlib.sha256(embed_text.encode()).hexdigest() + entry = existing.get(str(r)) + if entry is None or entry.get("content_hash") != h: + to_index.append((r, content, embed_text, h)) + + if not to_index: + if stale: + save_index(existing) click.echo(f"Index up to date — {len(existing)} records indexed.") return - click.echo(f"Indexing {len(new_records)} new records...") - contents = [r.read_text() for r in new_records] - embeddings = embed(contents) + click.echo(f"Indexing {len(to_index)} record(s)...") + embeddings = embed([embed_text for _, _, embed_text, _ in to_index]) - for path, content, embedding in zip(new_records, contents, embeddings): + for (path, content, embed_text, h), embedding in zip(to_index, embeddings): existing[str(path)] = { "embedding": embedding, "title": extract_title(content), "excerpt": extract_excerpt(content), "confidence": extract_confidence(content), "path": str(path), + "content_hash": h, } save_index(existing) @@ -306,7 +373,19 @@ def index(force, include_adrs): @cli.command() @click.argument("query", nargs=-1) @click.option("--top", default=3, help="Number of results") -def query(query, top): +@click.option( + "--min-score", + default=0.70, + show_default=True, + help="Minimum similarity score (0–1). Results below this are hidden.", +) +@click.option( + "--expand", + is_flag=True, + default=False, + help="Use Claude Haiku to rewrite query into richer search terms (adds ~1s latency).", +) +def query(query, top, min_score, expand): """Query your institutional knowledge. Example: memex query why did we move off MongoDB @@ -321,6 +400,23 @@ def query(query, top): click.echo("Nothing indexed yet. Run `memex index` first.") return + if expand: + client = _anthropic_client() + resp = client.messages.create( + model="claude-haiku-4-5", + max_tokens=200, + messages=[{ + "role": "user", + "content": ( + f"Rewrite this search query as 4-6 rich technical search phrases, " + f"comma-separated, no explanation:\n\n{query_text}" + ), + }], + ) + expanded = resp.content[0].text.strip() + query_text = expanded if expanded else query_text + click.echo(f" Expanded: {query_text}", err=True) + # Embed the query [query_embedding] = embed([query_text]) @@ -331,6 +427,9 @@ def query(query, top): ] scored.sort(key=lambda x: x[0], reverse=True) + passing = [(s, e) for s, e in scored if s >= min_score] + to_display = passing[:top] + import shutil term_width = min(shutil.get_terminal_size((80, 24)).columns, 100) divider = "─" * term_width @@ -338,7 +437,16 @@ def query(query, top): click.echo(f"\nResults for: {query_text}") click.echo(divider) - for i, (score, entry) in enumerate(scored[:top], 1): + if not to_display: + lower = round(max(0.0, min_score - 0.2), 1) + click.echo( + f"\n No relevant results found (threshold: {min_score:.2f}).\n" + f' Try `memex query --min-score {lower} "..."` to broaden the search.' + ) + click.echo("") + return + + for i, (score, entry) in enumerate(to_display, 1): title = entry["title"] excerpt = _strip_markdown(entry["excerpt"]) path = entry["path"] @@ -364,23 +472,23 @@ def query(query, top): rank = click.style(f"#{i}", bold=True) click.echo(f"\n {rank} {title} {score_label}") - # Confidence label + # Confidence label — reflects quality of rationale in the source record, + # independent of the similarity score above. if confidence >= 0.80: - conf_line = click.style("✅ High confidence", fg="green") + conf_line = click.style("✅ Rationale: well-documented", fg="green") click.echo(f" {conf_line}") elif confidence >= 0.65: - conf_line = click.style("💡 Medium confidence", fg="yellow") + conf_line = click.style("💡 Rationale: partial — verify if critical", fg="yellow") click.echo(f" {conf_line}") else: conf_line = click.style( - "⚠️ Low confidence — limited rationale present in source. " - "Verify before relying on this record.", + "⚠️ Rationale: limited — limited reasoning in source, verify before relying on this record.", fg="yellow", ) click.echo(f" {conf_line}") - # Excerpt — skip if it duplicates the low-confidence warning already shown - if excerpt and not excerpt.startswith("⚠️"): + # Excerpt + if excerpt: click.echo(_wrap(excerpt, term_width - 6, " ")) click.echo(f" {click.style(path, dim=True)}") diff --git a/memex/extractor.py b/memex/extractor.py index e77ddb9..a05ee6e 100644 --- a/memex/extractor.py +++ b/memex/extractor.py @@ -4,6 +4,7 @@ import instructor from .config import load_api_key from .schema import ExtractionResult, ConfidenceLevel +from .structural import categorize_file, is_structural_change, build_changed_files_section def _client(): @@ -27,11 +28,15 @@ def _client(): _BODY_TRIVIAL_THRESHOLD = 80 # chars — below this, treat body as absent -def is_low_signal(title: str, body: str) -> bool: +def is_low_signal(title: str, body: str, changed_files: list[str] | None = None) -> bool: """Quick heuristic check before spending an LLM call.""" text = f"{title}\n{body}" + # Always-low patterns win regardless of structural files (dep bumps are always noise) if any(re.search(p, text, re.IGNORECASE) for p in _ALWAYS_LOW_SIGNAL): return True + # Structural files override the title-based heuristics + if changed_files and is_structural_change(changed_files): + return False body_is_trivial = len((body or "").strip()) < _BODY_TRIVIAL_THRESHOLD if body_is_trivial: if any(re.search(p, title, re.IGNORECASE) for p in _TITLE_LOW_SIGNAL): @@ -39,18 +44,24 @@ def is_low_signal(title: str, body: str) -> bool: return False -def build_prompt(pr_title: str, pr_body: str, review_comments: list[str]) -> str: +def build_prompt( + pr_title: str, + pr_body: str, + review_comments: list[str], + changed_files: list[str] | None = None, +) -> str: reviews_text = "\n---\n".join(review_comments) if review_comments else "No review comments." + changed_files_section = build_changed_files_section(changed_files) return f"""You are extracting institutional knowledge from a GitHub pull request. -Your task: determine whether this PR contains a genuine architectural, technical, or +Your task: determine whether this PR contains a genuine architectural, technical, or product decision — and if so, extract the decision context. -A genuine decision involves a non-obvious choice between alternatives, a trade-off, -or a constraint that shaped how something was built. Routine changes (dependency +A genuine decision involves a non-obvious choice between alternatives, a trade-off, +or a constraint that shaped how something was built. Routine changes (dependency updates, typo fixes, style cleanup, test additions) do NOT qualify. -Be strict. It is better to return contains_decision=false than to manufacture +Be strict. It is better to return contains_decision=false than to manufacture rationale that isn't actually present in the text. ## PR Title @@ -62,7 +73,16 @@ def build_prompt(pr_title: str, pr_body: str, review_comments: list[str]) -> str ## Review Comments {reviews_text} -Extract the decision context now. Set confidence based on how much explicit rationale +## Changed Files +{changed_files_section} + +If changed files include migrations, infrastructure-as-code, or schema definitions, \ +these represent architectural decisions worth capturing even when the PR description is thin. \ +Extract the decision (what changed and why it matters structurally), but set confidence \ +based on how much explicit rationale is actually present in the text — do not infer \ +motivation that is not stated. + +Extract the decision context now. Set confidence based on how much explicit rationale is actually present in the text above — do not infer or assume.""" @@ -70,6 +90,7 @@ def extract( pr_title: str, pr_body: str, review_comments: list[str] | None = None, + changed_files: list[str] | None = None, ) -> ExtractionResult | None: """ Run the extraction pipeline on a single PR. @@ -79,18 +100,18 @@ def extract( """ review_comments = review_comments or [] - # Gate 1: cheap heuristic filter - if is_low_signal(pr_title, pr_body or ""): + # Gate 1: cheap heuristic filter (changed_files enables structural override) + if is_low_signal(pr_title, pr_body or "", changed_files): return None # Gate 2: LLM extraction with structured output result: ExtractionResult = _client().messages.create( - model="claude-sonnet-4-5", + model="claude-sonnet-4-6", max_tokens=1024, messages=[ { "role": "user", - "content": build_prompt(pr_title, pr_body, review_comments), + "content": build_prompt(pr_title, pr_body, review_comments, changed_files), } ], response_model=ExtractionResult, diff --git a/memex/structural.py b/memex/structural.py new file mode 100644 index 0000000..0824d2f --- /dev/null +++ b/memex/structural.py @@ -0,0 +1,89 @@ +""" +Structural file detection — pure classification, no LLM dependency. + +Identifies high-value file patterns (migrations, IaC, schema, deployment) +that indicate architectural decisions worth extracting even when PR descriptions +are thin. Shared by extractor.py and update.py. +""" +from __future__ import annotations + +import re +from collections import defaultdict + +# File patterns that indicate high-value structural changes. +# Deliberately conservative — no raw .yaml/.yml (too broad for most projects). +_STRUCTURAL_PATTERNS: dict[str, list[str]] = { + "migration": [ + r"migrations?/", + r"alembic/versions/", + r"db/migrate/", + r"\d{14}_\w+\.rb$", # Rails migration naming convention + ], + "infra": [ + r"\.tf$", + r"terraform/", + r"k8s/", + r"kubernetes/", + r"helm/", + r"charts/", + r"manifests/", + ], + "schema": [ + r"openapi\.ya?ml$", + r"openapi\.json$", + r"swagger\.ya?ml$", + r"schema\.json$", + r"schema\.graphql$", + r"\.graphql$", + r"\.proto$", + ], + "deployment": [ + r"docker-compose.*\.ya?ml$", + r"Dockerfile(\.\w+)?$", + ], +} + +STRUCTURAL_LABEL: dict[str, str] = { + "migration": "Database migrations", + "infra": "Infrastructure / IaC", + "schema": "API / Schema definitions", + "deployment": "Deployment configuration", +} + + +def categorize_file(path: str) -> str | None: + """Return the structural category for a file path, or None if not structural.""" + p = path.replace("\\", "/") + for category, patterns in _STRUCTURAL_PATTERNS.items(): + for pat in patterns: + if re.search(pat, p, re.IGNORECASE): + return category + return None + + +def is_structural_change(changed_files: list[str]) -> bool: + """True if any changed file matches a high-value structural pattern.""" + return any(categorize_file(p) is not None for p in changed_files) + + +def build_changed_files_section(changed_files: list[str] | None) -> str: + """Build the Changed Files section for the LLM prompt.""" + if not changed_files: + return "No file list available." + buckets: dict[str, list[str]] = defaultdict(list) + for path in changed_files: + cat = categorize_file(path) or "other" + buckets[cat].append(path) + lines = [] + for cat in ["migration", "infra", "schema", "deployment"]: + if cat not in buckets: + continue + lines.append(f"**{STRUCTURAL_LABEL[cat]}** ({len(buckets[cat])} files):") + for f in buckets[cat]: + lines.append(f" - {f}") + other_count = len(buckets.get("other", [])) + if other_count: + lines.append(f"Other changed files: {other_count}") + if not lines: + return f"{len(changed_files)} files changed (none match structural patterns)." + return "\n".join(lines) diff --git a/memex/update.py b/memex/update.py index c140657..4b2e81c 100644 --- a/memex/update.py +++ b/memex/update.py @@ -16,6 +16,9 @@ from pathlib import Path from typing import Callable, Optional +# Pure helpers — safe to import at module level (no LLM dependency) +from .structural import categorize_file, is_structural_change + STATE_FILE = Path(".memex/state.json") KNOWLEDGE_DIR = Path("knowledge/decisions") @@ -119,6 +122,17 @@ def _extract_pr_number(message: str) -> Optional[int]: return int(m.group(1)) if m else None +def git_changed_file_paths(sha: str) -> list[str]: + """Return list of file paths changed in a commit via git diff-tree.""" + r = subprocess.run( + ["git", "diff-tree", "--no-commit-id", "-r", "--name-only", sha], + capture_output=True, text=True, + ) + if r.returncode != 0 or not r.stdout.strip(): + return [] + return [line.strip() for line in r.stdout.strip().splitlines() if line.strip()] + + def git_files_changed(sha: str) -> int: """Return number of files changed in a commit — used by the stat-first filter.""" r = subprocess.run( @@ -168,16 +182,19 @@ def detect_repo() -> Optional[str]: def fetch_pr_data(pr_number: int, repo: str) -> Optional[dict]: - """Fetch PR title, body, author, url, and reviews via gh CLI.""" + """Fetch PR title, body, author, url, reviews, and changed files via gh CLI.""" r = subprocess.run( ["gh", "pr", "view", str(pr_number), "--repo", repo, - "--json", "title,body,author,url,reviews"], + "--json", "title,body,author,url,reviews,files"], capture_output=True, text=True, timeout=15, ) if r.returncode != 0: return None try: - return json.loads(r.stdout) + data = json.loads(r.stdout) + # Normalise changed_files into a flat list of paths + data["changed_files"] = [f.get("path", "") for f in data.get("files", []) if f.get("path")] + return data except json.JSONDecodeError: return None @@ -360,26 +377,29 @@ def _process_pr_commit( author = (pr.get("author") or {}).get("login", commit.author) pr_url = pr.get("url", f"https://github.com/{repo}/pull/{pr_num}") reviews = [r.get("body", "") for r in pr.get("reviews", []) if r.get("body")] + changed_files = pr.get("changed_files", []) - if is_low_signal(title, body): + if is_low_signal(title, body, changed_files): result.skipped_low_signal += 1 if progress_cb: progress_cb(f" ✗ #{pr_num} low signal — skip") return - extraction = extract(title, body, reviews) + extraction = extract(title, body, reviews, changed_files=changed_files) if extraction is None or not extraction.contains_decision or extraction.record is None: result.skipped_no_decision += 1 if progress_cb: progress_cb(f" ✗ #{pr_num} no decision found — skip") return + structural_tags = sorted({categorize_file(f) for f in changed_files if categorize_file(f)}) or None path = write_record( record=extraction.record, source_url=pr_url, author=author, pr_number=pr_num, repo=repo, + tags=structural_tags, ) indexed_prs.add(pr_num) result.written += 1 @@ -406,13 +426,18 @@ def _process_direct_commit( progress_cb(f" ~ {sha_short} already indexed — skip") return - # Stat-first filter — skip large commits before any LLM call + # Fetch changed file paths once — reused for stat bypass, low-signal check, and tags + diff_files = git_changed_file_paths(commit.sha) + + # Stat-first filter — skip large commits before any LLM call, + # unless the commit touches structural files (migrations, IaC, schema, etc.) n_files = git_files_changed(commit.sha) if n_files > MAX_FILES_CHANGED: - result.skipped_stat_filter += 1 - if progress_cb: - progress_cb(f" ✗ {sha_short} {n_files} files changed — stat filter") - return + if not is_structural_change(diff_files): + result.skipped_stat_filter += 1 + if progress_cb: + progress_cb(f" ✗ {sha_short} {n_files} files changed — stat filter") + return diff = git_diff(commit.sha) if not diff: @@ -421,25 +446,27 @@ def _process_direct_commit( progress_cb(f" ✗ {sha_short} empty diff — skip") return - if is_low_signal(commit.subject, diff): + if is_low_signal(commit.subject, diff, diff_files): result.skipped_low_signal += 1 if progress_cb: progress_cb(f" ✗ {sha_short} low signal — skip") return - extraction = extract(commit.subject, diff, []) + extraction = extract(commit.subject, diff, [], changed_files=diff_files) if extraction is None or not extraction.contains_decision or extraction.record is None: result.skipped_no_decision += 1 if progress_cb: progress_cb(f" ✗ {sha_short} no decision found — skip") return + structural_tags = sorted({categorize_file(f) for f in diff_files if categorize_file(f)}) or None path = write_record( record=extraction.record, source_url=c_url, author=commit.author, pr_number=None, # direct commit — no PR number repo=repo, + tags=structural_tags, ) indexed_sources.add(c_url) result.written += 1 diff --git a/scripts/check_docs.py b/scripts/check_docs.py new file mode 100644 index 0000000..abdf8ad --- /dev/null +++ b/scripts/check_docs.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +""" +check_docs.py — validates CLAUDE.md stays in sync with the actual codebase. + +Checks: + 1. Every memex/*.py module (except __init__.py) is listed in CLAUDE.md + 2. Every @cli.command() in cli.py appears in the CLI behaviour section + 3. Model strings in extractor.py and init.py match the tech stack table + 4. Required env vars (os.environ["VAR"]) in action.py have rows in the env vars table + +Exit 0 = clean. Exit 1 = drift detected (printed to stdout so Claude Code surfaces it). +""" + +import re +import sys +from pathlib import Path + +ROOT = Path(__file__).parent.parent +CLAUDE_MD = ROOT / "CLAUDE.md" +MEMEX_DIR = ROOT / "memex" + +errors = [] + + +def check(condition: bool, message: str) -> None: + if not condition: + errors.append(message) + + +claude = CLAUDE_MD.read_text() + +# ── 1. Module files ──────────────────────────────────────────────────────────── +py_files = sorted(f.name for f in MEMEX_DIR.glob("*.py") if f.name != "__init__.py") +for fname in py_files: + check(fname in claude, f"file structure: memex/{fname} not listed in CLAUDE.md") + +# ── 2. CLI commands ──────────────────────────────────────────────────────────── +cli_src = (MEMEX_DIR / "cli.py").read_text() +# Commands may have @click.option decorators between @cli.command() and def, +# so scan line-by-line: find @cli.command, then look forward for the def name. +lines = cli_src.splitlines() +command_names = [] +for i, line in enumerate(lines): + m = re.match(r'\s*@cli\.command\((?:"(\w+)")?\)', line) + if m: + explicit_name = m.group(1) + for j in range(i + 1, min(i + 15, len(lines))): + def_match = re.match(r'\s*def (\w+)', lines[j]) + if def_match: + command_names.append(explicit_name or def_match.group(1)) + break + +cli_section_match = re.search( + r"## CLI behaviour(.*?)^##", claude, re.DOTALL | re.MULTILINE +) +cli_text = cli_section_match.group(1) if cli_section_match else "" + +for cmd in command_names: + check( + f"memex {cmd}" in cli_text, + f"CLI behaviour section: 'memex {cmd}' not documented in CLAUDE.md", + ) + +# ── 3. Model strings ─────────────────────────────────────────────────────────── +for fname in ["extractor.py", "init.py"]: + src = (MEMEX_DIR / fname).read_text() + models = re.findall(r'model=["\']([^"\']+)["\']', src) + for model in set(models): + check( + model in claude, + f"tech stack: model '{model}' used in memex/{fname} but not in CLAUDE.md", + ) + +# ── 4. Required env vars ─────────────────────────────────────────────────────── +action_src = (MEMEX_DIR / "action.py").read_text() +# Only required vars (os.environ["VAR"], not os.environ.get) +required_vars = re.findall(r'os\.environ\["(\w+)"\]', action_src) +# Exclude internal GitHub meta-vars that don't need user documentation +skip_vars = {"GITHUB_EVENT_NAME"} +required_vars = sorted(set(v for v in required_vars if v not in skip_vars)) + +env_section_match = re.search( + r"## Environment variables.*?\n(.*?)^##", claude, re.DOTALL | re.MULTILINE +) +env_text = env_section_match.group(1) if env_section_match else "" + +for var in required_vars: + check( + f"`{var}`" in env_text, + f"env vars table: '{var}' used in action.py but not documented in CLAUDE.md", + ) + +# ── Report ───────────────────────────────────────────────────────────────────── +if errors: + print("CLAUDE.md drift detected:\n") + for e in errors: + print(f" ✗ {e}") + print( + f"\n{len(errors)} issue(s) — update CLAUDE.md to match the code, " + "or run 'python scripts/check_docs.py' to recheck." + ) + sys.exit(1) +else: + print( + f"CLAUDE.md is in sync " + f"({len(py_files)} modules, {len(command_names)} CLI commands, " + f"{len(required_vars)} env vars checked)" + ) + sys.exit(0) diff --git a/scripts/test_action.sh b/scripts/test_action.sh index 24f46c2..f2aa138 100755 --- a/scripts/test_action.sh +++ b/scripts/test_action.sh @@ -6,11 +6,15 @@ set -euo pipefail +# ── Debug mode ──────────────────────────────────────────────────────────────── +[[ "${DEBUG:-}" == "1" ]] && set -x + # ── API key ─────────────────────────────────────────────────────────────────── if [[ -z "${ANTHROPIC_API_KEY:-}" ]]; then echo "Error: ANTHROPIC_API_KEY is not set." exit 1 fi +echo "[debug] ANTHROPIC_API_KEY is set (${#ANTHROPIC_API_KEY} chars)" # ── PR data ─────────────────────────────────────────────────────────────────── if [[ -n "${1:-}" ]]; then @@ -20,11 +24,15 @@ if [[ -n "${1:-}" ]]; then REPO="$(echo "$PR_URL" | sed -E 's|https://github.com/([^/]+/[^/]+)/pull/.*|\1|')" PR_NUMBER="$(echo "$PR_URL" | sed -E 's|.*/pull/([0-9]+).*|\1|')" - echo "Fetching PR #$PR_NUMBER from $REPO ..." + echo "[debug] Fetching PR #$PR_NUMBER from $REPO ..." PR_JSON="$(gh pr view "$PR_NUMBER" --repo "$REPO" --json title,body,author,url)" PR_TITLE="$(echo "$PR_JSON" | python3 -c "import sys,json; print(json.load(sys.stdin)['title'])")" PR_BODY="$(echo "$PR_JSON" | python3 -c "import sys,json; print(json.load(sys.stdin)['body'] or '')")" PR_AUTHOR="$(echo "$PR_JSON" | python3 -c "import sys,json; print(json.load(sys.stdin)['author']['login'])")" + echo "[debug] PR_TITLE: $PR_TITLE" + echo "[debug] PR_AUTHOR: $PR_AUTHOR" + echo "[debug] PR_NUMBER: $PR_NUMBER" + echo "[debug] REPO: $REPO" else # ── Built-in fixture (high-signal example) ────────────────────────────────── # Swap for the low-signal example to test discard behaviour: @@ -47,21 +55,29 @@ export PR_TITLE PR_BODY PR_URL PR_NUMBER PR_AUTHOR REPO # ── Stub gh for nudge comments (avoids posting to a real PR) ───────────────── # Real PR data was already fetched above; only the comment call needs stubbing. +REAL_GH="$(which gh)" GH_STUB="$(mktemp -d)/gh" -cat > "$GH_STUB" << 'STUB' +cat > "$GH_STUB" << STUB #!/usr/bin/env bash # Pass through 'gh pr view' so get_review_comments still works. -if [[ "$*" == *"--json reviews"* ]]; then - exec /usr/bin/env gh "$@" +if [[ "\$*" == *"--json reviews"* ]] || [[ "\$*" == *"--json files"* ]] || [[ "\$1" == "api" ]]; then + exec "$REAL_GH" "\$@" fi -echo "[gh stub] $*" >&2 +echo "[gh stub] \$*" >&2 echo "[]" STUB chmod +x "$GH_STUB" export PATH="$(dirname "$GH_STUB"):$PATH" +echo "[debug] gh stub created at $GH_STUB" # ── Run ─────────────────────────────────────────────────────────────────────── cd "$(dirname "$0")/.." -echo "Running memex action with PR: \"$PR_TITLE\"" +echo "" +echo "Running memex action with:" +echo " PR_TITLE: $PR_TITLE" +echo " PR_AUTHOR: $PR_AUTHOR" +echo " PR_NUMBER: $PR_NUMBER" +echo " PR_URL: $PR_URL" +echo " REPO: $REPO" echo "────────────────────────────────────────" python3 -m memex.action diff --git a/tests/test_action.py b/tests/test_action.py index 7935bdd..785c4bd 100644 --- a/tests/test_action.py +++ b/tests/test_action.py @@ -1,7 +1,9 @@ """Unit tests for memex/action.py nudge integration and issue_comment handling.""" +import json import os +import subprocess import pytest -from unittest.mock import patch, MagicMock +from unittest.mock import patch, MagicMock, call from pathlib import Path from memex.schema import KnowledgeRecord, ExtractionResult @@ -36,6 +38,87 @@ def _make_result(confidence: float) -> ExtractionResult: } +# --- get_review_comments --- + +def _mock_run(stdout: str, returncode: int = 0) -> MagicMock: + m = MagicMock() + m.returncode = returncode + m.stdout = stdout + return m + + +def _is_reviews_call(cmd): + return "--json" in cmd and "reviews" in cmd + +def _is_inline_comments_call(cmd): + return any("pulls" in arg for arg in cmd) + +def _is_general_comments_call(cmd): + return any("issues" in arg for arg in cmd) + + +class TestGetReviewComments: + def test_combines_review_bodies_inline_and_general_comments(self): + from memex.action import get_review_comments + + def fake_run(cmd, **kwargs): + if _is_reviews_call(cmd): + return _mock_run(json.dumps(["Top-level review body"])) + if _is_inline_comments_call(cmd): + return _mock_run(json.dumps(["Inline line comment"])) + if _is_general_comments_call(cmd): + return _mock_run(json.dumps(["General PR comment"])) + return _mock_run("[]") + + with patch("memex.action.subprocess.run", side_effect=fake_run): + result = get_review_comments("42", "acme/repo") + + assert result == ["Top-level review body", "Inline line comment", "General PR comment"] + + def test_filters_empty_strings(self): + from memex.action import get_review_comments + + def fake_run(cmd, **kwargs): + if _is_reviews_call(cmd): + return _mock_run(json.dumps([""])) # empty review body + if _is_inline_comments_call(cmd): + return _mock_run(json.dumps(["Inline comment"])) + if _is_general_comments_call(cmd): + return _mock_run(json.dumps([""])) # empty general comment + return _mock_run("[]") + + with patch("memex.action.subprocess.run", side_effect=fake_run): + result = get_review_comments("42", "acme/repo") + + assert result == ["Inline comment"] + + def test_returns_empty_list_when_all_calls_fail(self): + from memex.action import get_review_comments + + failing = _mock_run("", returncode=1) + with patch("memex.action.subprocess.run", return_value=failing): + result = get_review_comments("42", "acme/repo") + + assert result == [] + + def test_partial_failure_returns_successful_sources(self): + from memex.action import get_review_comments + + def fake_run(cmd, **kwargs): + if _is_reviews_call(cmd): + return _mock_run("", returncode=1) # reviews call fails + if _is_inline_comments_call(cmd): + return _mock_run(json.dumps(["Inline comment"])) + if _is_general_comments_call(cmd): + return _mock_run(json.dumps(["General comment"])) + return _mock_run("[]") + + with patch("memex.action.subprocess.run", side_effect=fake_run): + result = get_review_comments("42", "acme/repo") + + assert result == ["Inline comment", "General comment"] + + # --- handle_pr_merge: nudge logic --- class TestHandlePrMergeNudge: @@ -254,3 +337,60 @@ def test_defaults_to_pr_merge(self, monkeypatch): from memex.action import main main() mock_merge.assert_called_once() + + +# --- changed_files wiring --- + +class TestChangedFilesWiring: + def test_changed_files_passed_to_extract(self, tmp_path, monkeypatch): + """changed_files from get_changed_files() should be forwarded to extract().""" + monkeypatch.chdir(tmp_path) + for k, v in BASE_ENV.items(): + monkeypatch.setenv(k, v) + + structural_files = ["migrations/001_add_sessions.py"] + + with patch("memex.action.get_review_comments", return_value=[]), \ + patch("memex.action.get_changed_files", return_value=structural_files), \ + patch("memex.action.extract", return_value=_make_result(0.85)) as mock_extract, \ + patch("memex.action.write_record", return_value=Path("knowledge/decisions/foo.md")): + + from memex.action import handle_pr_merge + handle_pr_merge() + + mock_extract.assert_called_once() + assert mock_extract.call_args.kwargs.get("changed_files") == structural_files + + def test_structural_tags_written_for_migration_pr(self, tmp_path, monkeypatch): + """Structural tags derived from changed_files should reach write_record.""" + monkeypatch.chdir(tmp_path) + for k, v in BASE_ENV.items(): + monkeypatch.setenv(k, v) + + with patch("memex.action.get_review_comments", return_value=[]), \ + patch("memex.action.get_changed_files", return_value=["migrations/001_add_sessions.py"]), \ + patch("memex.action.extract", return_value=_make_result(0.85)), \ + patch("memex.action.write_record", return_value=Path("knowledge/decisions/foo.md")) as mock_write: + + from memex.action import handle_pr_merge + handle_pr_merge() + + _, kwargs = mock_write.call_args + assert "migration" in (kwargs.get("tags") or []) + + def test_no_structural_tags_for_plain_pr(self, tmp_path, monkeypatch): + """Non-structural files should produce no structural tags.""" + monkeypatch.chdir(tmp_path) + for k, v in BASE_ENV.items(): + monkeypatch.setenv(k, v) + + with patch("memex.action.get_review_comments", return_value=[]), \ + patch("memex.action.get_changed_files", return_value=["src/auth.py", "tests/test_auth.py"]), \ + patch("memex.action.extract", return_value=_make_result(0.85)), \ + patch("memex.action.write_record", return_value=Path("knowledge/decisions/foo.md")) as mock_write: + + from memex.action import handle_pr_merge + handle_pr_merge() + + _, kwargs = mock_write.call_args + assert kwargs.get("tags") is None diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..382c179 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,516 @@ +"""Unit tests for memex/cli.py — embedding text, excerpt extraction, index, and query.""" +import hashlib +import json +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from click.testing import CliRunner + +from memex.cli import ( + _extract_md_section, + build_embed_text, + extract_excerpt, + extract_title, + cli, +) + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +FULL_MD = """\ +--- +title: "Use Redis for caching" +date: 2026-04-01 +author: "alice" +source: "https://github.com/acme/repo/pull/42" +confidence: 0.85 +tags: [] +--- + +# Use Redis for caching + +## Context + +The team needed a fast in-memory store for session data. PostgreSQL was being hit on +every request, causing latency spikes during peak traffic. + +## Decision + +Use Redis as an in-memory cache layer between the application and PostgreSQL. + +## Alternatives considered + +- Memcached — ruled out because team has no experience with it +- In-process LRU cache — doesn't work across multiple app instances + +## Constraints + +- Redis must be deployed as a separate service +- Cache invalidation must be handled explicitly + +## Revisit signals + +_None_ + +--- + +_Extracted by Memex from [PR #42](https://github.com/acme/repo/pull/42) · 2026-04-01_ +""" + +LOW_CONF_MD = """\ +--- +title: "Low confidence record" +date: 2026-04-01 +author: "memex-init" +source: "pyproject.toml" +confidence: 0.50 +tags: ["init"] +--- + +# Low confidence record + +> ⚠️ **Low confidence** — limited rationale present in source. Verify before relying on this record. + +## Context + +Some brief context here about the decision. + +## Decision + +A decision was made about the system. + +## Alternatives considered + +_None recorded_ + +## Constraints + +_None recorded_ + +## Revisit signals + +_None_ + +--- + +_Extracted by Memex from repo scan · 2026-04-01_ +""" + +NO_SECTIONS_MD = """\ +--- +title: "Hand-written note" +date: 2026-04-01 +author: "bob" +source: "https://github.com/acme/repo/pull/99" +confidence: 0.70 +tags: [] +--- + +This is a hand-written note with no standard sections. +It has some content but no ## headers. +""" + + +# --------------------------------------------------------------------------- +# _extract_md_section +# --------------------------------------------------------------------------- + +class TestExtractMdSection: + def test_extracts_section_body(self): + result = _extract_md_section(FULL_MD, "Context") + assert "PostgreSQL was being hit" in result + assert "##" not in result + + def test_missing_section_returns_empty(self): + result = _extract_md_section(FULL_MD, "Nonexistent") + assert result == "" + + def test_case_insensitive(self): + md = "## CONTEXT\nSome context here.\n## Decision\nA decision.\n" + assert "Some context here." in _extract_md_section(md, "Context") + + def test_does_not_bleed_into_next_section(self): + result = _extract_md_section(FULL_MD, "Context") + assert "Use Redis as an in-memory" not in result # that's in Decision + + def test_stops_at_horizontal_rule(self): + result = _extract_md_section(FULL_MD, "Revisit signals") + assert "_Extracted by Memex" not in result + + +# --------------------------------------------------------------------------- +# build_embed_text +# --------------------------------------------------------------------------- + +class TestBuildEmbedText: + def test_includes_title(self): + result = build_embed_text(FULL_MD) + assert "Use Redis for caching" in result + + def test_includes_context(self): + result = build_embed_text(FULL_MD) + assert "PostgreSQL was being hit" in result + + def test_includes_decision(self): + result = build_embed_text(FULL_MD) + assert "in-memory cache layer" in result + + def test_includes_alternatives(self): + result = build_embed_text(FULL_MD) + assert "Memcached" in result + assert "In-process LRU cache" in result + + def test_includes_constraints(self): + result = build_embed_text(FULL_MD) + assert "Redis must be deployed" in result + + def test_strips_yaml_frontmatter(self): + result = build_embed_text(FULL_MD) + assert "confidence:" not in result + assert "tags:" not in result + assert "author:" not in result + + def test_strips_warning_block(self): + result = build_embed_text(LOW_CONF_MD) + assert "⚠️" not in result + assert "limited rationale present" not in result + + def test_strips_footer(self): + result = build_embed_text(FULL_MD) + assert "_Extracted by Memex" not in result + + def test_no_markdown_headers(self): + result = build_embed_text(FULL_MD) + assert "## " not in result + + def test_skips_none_recorded_alternatives(self): + result = build_embed_text(LOW_CONF_MD) + assert "Alternatives:" not in result + + def test_skips_none_recorded_constraints(self): + result = build_embed_text(LOW_CONF_MD) + assert "Constraints:" not in result + + def test_fallback_on_no_sections(self): + result = build_embed_text(NO_SECTIONS_MD) + # Falls back to raw content — should at least contain the note text (case-preserved) + assert "hand-written note" in result.lower() + + def test_alternatives_formatted_with_semicolons(self): + result = build_embed_text(FULL_MD) + assert "Alternatives: " in result + # Both alternatives joined + assert "Memcached" in result and "In-process" in result + + +# --------------------------------------------------------------------------- +# extract_excerpt +# --------------------------------------------------------------------------- + +class TestExtractExcerpt: + def test_returns_context_and_decision(self): + result = extract_excerpt(FULL_MD) + assert "PostgreSQL was being hit" in result # from Context + assert "in-memory cache layer" in result # from Decision + + def test_skips_warning_block(self): + result = extract_excerpt(LOW_CONF_MD) + assert "⚠️" not in result + assert "Low confidence" not in result + + def test_shows_actual_context_for_low_confidence(self): + result = extract_excerpt(LOW_CONF_MD) + assert "Some brief context here" in result + + def test_context_truncated_at_300_chars(self): + long_context = "word " * 100 # 500 chars + md = f"---\ntitle: t\n---\n## Context\n{long_context}\n## Decision\nFoo.\n" + result = extract_excerpt(md) + # Context portion is capped at 300 chars and ends with ellipsis + assert "…" in result + assert result.endswith("Foo.") # decision shown in full + # No mid-word cuts — the part before " — " ends cleanly + context_part = result.split(" — ")[0] + assert not context_part.rstrip("…").endswith(" ") + + def test_fallback_skips_blockquotes(self): + md = "---\ntitle: t\n---\n\n> This is a blockquote\n\nActual content here.\n" + result = extract_excerpt(md) + assert "Actual content here." in result + assert "blockquote" not in result + + def test_fallback_skips_headings(self): + md = "---\ntitle: t\n---\n\n# Big heading\n\nReal content.\n" + result = extract_excerpt(md) + assert "Real content." in result + assert "Big heading" not in result + + def test_context_only_when_no_decision_section(self): + md = "---\ntitle: t\n---\n## Context\nOnly context here.\n" + result = extract_excerpt(md) + assert "Only context here." in result + assert " — " not in result # no decision separator + + +# --------------------------------------------------------------------------- +# index command — content_hash logic +# --------------------------------------------------------------------------- + +class TestIndexCommand: + def _make_knowledge_file(self, tmp_path: Path, name: str, content: str) -> Path: + d = tmp_path / "knowledge" / "decisions" + d.mkdir(parents=True, exist_ok=True) + f = d / name + f.write_text(content) + return f + + def _index_path(self, tmp_path: Path) -> Path: + return tmp_path / ".memex" / "index.json" + + def _run_index(self, tmp_path, monkeypatch, mock_embed=None): + if mock_embed is None: + mock_embed = MagicMock(return_value=[[0.1] * 384]) + monkeypatch.chdir(tmp_path) + with patch("memex.cli.embed", mock_embed): + runner = CliRunner() + result = runner.invoke(cli, ["index"]) + return result, mock_embed + + def test_new_file_gets_content_hash(self, tmp_path, monkeypatch): + self._make_knowledge_file(tmp_path, "record.md", FULL_MD) + self._run_index(tmp_path, monkeypatch) + index = json.loads(self._index_path(tmp_path).read_text()) + entry = list(index.values())[0] + assert "content_hash" in entry + assert len(entry["content_hash"]) == 64 # SHA256 hex + + def test_unchanged_file_not_reembedded(self, tmp_path, monkeypatch): + self._make_knowledge_file(tmp_path, "record.md", FULL_MD) + _, mock_embed = self._run_index(tmp_path, monkeypatch) + assert mock_embed.call_count == 1 + # Second run — should skip + _, mock_embed2 = self._run_index(tmp_path, monkeypatch) + assert mock_embed2.call_count == 0 + + def test_changed_file_gets_reembedded(self, tmp_path, monkeypatch): + f = self._make_knowledge_file(tmp_path, "record.md", FULL_MD) + self._run_index(tmp_path, monkeypatch) + # Mutate the file + f.write_text(FULL_MD.replace("PostgreSQL was being hit", "MySQL was being hit")) + _, mock_embed2 = self._run_index(tmp_path, monkeypatch) + assert mock_embed2.call_count == 1 + + def test_legacy_entry_without_hash_gets_reembedded(self, tmp_path, monkeypatch): + f = self._make_knowledge_file(tmp_path, "record.md", FULL_MD) + # Pre-populate index without content_hash (legacy format) + index_path = self._index_path(tmp_path) + index_path.parent.mkdir(parents=True, exist_ok=True) + index_path.write_text(json.dumps({ + str(f): { + "embedding": [0.1] * 384, + "title": "Use Redis for caching", + "excerpt": "old excerpt", + "confidence": 0.85, + "path": str(f), + # no content_hash + } + })) + monkeypatch.chdir(tmp_path) + mock_embed = MagicMock(return_value=[[0.2] * 384]) + with patch("memex.cli.embed", mock_embed): + CliRunner().invoke(cli, ["index"]) + assert mock_embed.call_count == 1 + + def test_deleted_record_removed_from_index(self, tmp_path, monkeypatch): + f1 = self._make_knowledge_file(tmp_path, "record1.md", FULL_MD) + f2 = self._make_knowledge_file(tmp_path, "record2.md", LOW_CONF_MD) + mock_embed = MagicMock(side_effect=lambda texts: [[0.1] * 384] * len(texts)) + self._run_index(tmp_path, monkeypatch, mock_embed) + # Delete one file + f2.unlink() + self._run_index(tmp_path, monkeypatch) + index = json.loads(self._index_path(tmp_path).read_text()) + # Index stores paths relative to cwd (tmp_path) + rel_f1 = str(f1.relative_to(tmp_path)) + rel_f2 = str(f2.relative_to(tmp_path)) + assert rel_f2 not in index + assert rel_f1 in index + + def test_embed_receives_clean_text_not_yaml(self, tmp_path, monkeypatch): + self._make_knowledge_file(tmp_path, "record.md", FULL_MD) + captured = [] + + def mock_embed(texts): + captured.extend(texts) + return [[0.1] * 384] * len(texts) + + monkeypatch.chdir(tmp_path) + with patch("memex.cli.embed", mock_embed): + CliRunner().invoke(cli, ["index"]) + + assert captured, "embed was not called" + embedded_text = captured[0] + assert "confidence:" not in embedded_text + assert "tags:" not in embedded_text + assert "⚠️" not in embedded_text + + +# --------------------------------------------------------------------------- +# query command — --min-score +# --------------------------------------------------------------------------- + +def _make_index_entry(title: str, score_hint: float, path: str = "knowledge/decisions/test.md"): + """Make a fake index entry with a fixed embedding (we'll mock cosine_similarity).""" + return { + "embedding": [score_hint], # mock will use actual cosine_similarity override + "title": title, + "excerpt": f"Excerpt for {title}", + "confidence": 0.85, + "path": path, + } + + +class TestQueryMinScore: + def _run_query(self, tmp_path, monkeypatch, index_data: dict, args: list[str]): + index_path = tmp_path / ".memex" / "index.json" + index_path.parent.mkdir(parents=True, exist_ok=True) + index_path.write_text(json.dumps(index_data)) + monkeypatch.chdir(tmp_path) + + # Mock embed to return a fixed vector, mock cosine_similarity to return + # the embedding value directly as the score + def mock_embed(texts): + return [[0.5] * 1] * len(texts) + + def mock_cosine(a, b): + return b[0] # score encoded in the entry's embedding + + with patch("memex.cli.embed", mock_embed), \ + patch("memex.cli.cosine_similarity", mock_cosine): + runner = CliRunner() + result = runner.invoke(cli, ["query"] + args) + return result + + def test_filters_result_below_min_score(self, tmp_path, monkeypatch): + index_data = { + "k/a.md": {**_make_index_entry("Irrelevant result", 0.0), "embedding": [0.60]}, + } + result = self._run_query(tmp_path, monkeypatch, index_data, + ["some", "query", "--min-score", "0.70"]) + assert "No relevant results found" in result.output + + def test_shows_result_above_min_score(self, tmp_path, monkeypatch): + index_data = { + "k/a.md": {**_make_index_entry("Good result", 0.0), "embedding": [0.80]}, + } + result = self._run_query(tmp_path, monkeypatch, index_data, + ["some", "query", "--min-score", "0.70"]) + assert "Good result" in result.output + + def test_no_results_message_contains_threshold(self, tmp_path, monkeypatch): + index_data = { + "k/a.md": {**_make_index_entry("Low match", 0.0), "embedding": [0.50]}, + } + result = self._run_query(tmp_path, monkeypatch, index_data, + ["test", "--min-score", "0.80"]) + assert "0.80" in result.output + + def test_no_results_suggests_lower_score(self, tmp_path, monkeypatch): + index_data = { + "k/a.md": {**_make_index_entry("Low match", 0.0), "embedding": [0.50]}, + } + result = self._run_query(tmp_path, monkeypatch, index_data, + ["test", "--min-score", "0.80"]) + assert "--min-score 0.6" in result.output + + def test_top_option_respected_after_filter(self, tmp_path, monkeypatch): + index_data = {f"k/{i}.md": {**_make_index_entry(f"Result {i}", 0.0), "embedding": [0.90]} + for i in range(5)} + result = self._run_query(tmp_path, monkeypatch, index_data, + ["test", "--min-score", "0.0", "--top", "2"]) + # Only 2 results should be shown + assert result.output.count("#1") == 1 + assert result.output.count("#2") == 1 + assert "#3" not in result.output + + def test_default_min_score_is_070(self, tmp_path, monkeypatch): + # Result at 0.69 should be hidden by default + index_data = { + "k/a.md": {**_make_index_entry("Borderline", 0.0), "embedding": [0.69]}, + } + result = self._run_query(tmp_path, monkeypatch, index_data, ["test"]) + assert "No relevant results found" in result.output + + +# --------------------------------------------------------------------------- +# query command — --expand +# --------------------------------------------------------------------------- + +class TestQueryExpand: + def _run_query_with_expand(self, tmp_path, monkeypatch, expanded_text: str, args: list[str]): + index_path = tmp_path / ".memex" / "index.json" + index_path.parent.mkdir(parents=True, exist_ok=True) + index_data = { + "k/a.md": {**_make_index_entry("Some result", 0.0), "embedding": [0.80]}, + } + index_path.write_text(json.dumps(index_data)) + monkeypatch.chdir(tmp_path) + + mock_message = MagicMock() + mock_message.content = [MagicMock(text=expanded_text)] + mock_client = MagicMock() + mock_client.messages.create.return_value = mock_message + + captured_embed_args = [] + + def mock_embed(texts): + captured_embed_args.extend(texts) + return [[0.5]] * len(texts) + + def mock_cosine(a, b): + return b[0] + + with patch("memex.cli.embed", mock_embed), \ + patch("memex.cli.cosine_similarity", mock_cosine), \ + patch("memex.cli._anthropic_client", return_value=mock_client): + runner = CliRunner() + result = runner.invoke(cli, ["query"] + args) + + return result, mock_client, captured_embed_args + + def test_expand_calls_haiku(self, tmp_path, monkeypatch): + _, mock_client, _ = self._run_query_with_expand( + tmp_path, monkeypatch, "expanded terms", ["test query", "--expand"] + ) + call_kwargs = mock_client.messages.create.call_args + assert call_kwargs.kwargs["model"] == "claude-haiku-4-5" + + def test_expand_passes_expanded_text_to_embed(self, tmp_path, monkeypatch): + _, _, captured = self._run_query_with_expand( + tmp_path, monkeypatch, "rich expanded search terms", ["original query", "--expand"] + ) + assert captured[0] == "rich expanded search terms" + + def test_expand_fallback_on_empty_response(self, tmp_path, monkeypatch): + _, _, captured = self._run_query_with_expand( + tmp_path, monkeypatch, "", ["original query", "--expand"] + ) + assert captured[0] == "original query" + + def test_no_expand_skips_anthropic_client(self, tmp_path, monkeypatch): + index_path = tmp_path / ".memex" / "index.json" + index_path.parent.mkdir(parents=True, exist_ok=True) + index_data = { + "k/a.md": {**_make_index_entry("Result", 0.0), "embedding": [0.80]}, + } + index_path.write_text(json.dumps(index_data)) + monkeypatch.chdir(tmp_path) + + with patch("memex.cli.embed", return_value=[[0.5]]), \ + patch("memex.cli.cosine_similarity", lambda a, b: b[0]), \ + patch("memex.cli._anthropic_client") as mock_factory: + CliRunner().invoke(cli, ["query", "test"]) + + mock_factory.assert_not_called() diff --git a/tests/test_extractor.py b/tests/test_extractor.py new file mode 100644 index 0000000..df79556 --- /dev/null +++ b/tests/test_extractor.py @@ -0,0 +1,256 @@ +"""Unit tests for structural detection and extractor interfaces.""" +import pytest +from memex.structural import ( + categorize_file, + is_structural_change, + build_changed_files_section, +) +from memex.extractor import is_low_signal, build_prompt + + +# --------------------------------------------------------------------------- +# categorize_file +# --------------------------------------------------------------------------- + +class TestCategorizeFile: + def test_alembic_migration(self): + assert categorize_file("alembic/versions/001_add_users.py") == "migration" + + def test_generic_migrations_dir(self): + assert categorize_file("migrations/0001_initial.py") == "migration" + + def test_nested_migrations_dir(self): + assert categorize_file("app/migrations/0002_add_column.py") == "migration" + + def test_rails_migration_naming(self): + assert categorize_file("db/migrate/20240101120000_create_users.rb") == "migration" + + def test_terraform_file(self): + assert categorize_file("infra/main.tf") == "infra" + + def test_terraform_in_root(self): + assert categorize_file("main.tf") == "infra" + + def test_k8s_yaml(self): + assert categorize_file("k8s/deployment.yaml") == "infra" + + def test_kubernetes_dir(self): + assert categorize_file("kubernetes/service.yml") == "infra" + + def test_helm_values(self): + assert categorize_file("helm/values.yaml") == "infra" + + def test_charts_dir(self): + assert categorize_file("charts/myapp/templates/ingress.yaml") == "infra" + + def test_openapi_yaml(self): + assert categorize_file("openapi.yaml") == "schema" + + def test_openapi_json(self): + assert categorize_file("api/openapi.json") == "schema" + + def test_swagger_yaml(self): + assert categorize_file("swagger.yaml") == "schema" + + def test_schema_json(self): + assert categorize_file("schema.json") == "schema" + + def test_graphql_schema(self): + assert categorize_file("schema.graphql") == "schema" + + def test_graphql_file(self): + assert categorize_file("api/queries.graphql") == "schema" + + def test_proto_file(self): + assert categorize_file("api/service.proto") == "schema" + + def test_dockerfile(self): + assert categorize_file("Dockerfile") == "deployment" + + def test_dockerfile_variant(self): + assert categorize_file("Dockerfile.prod") == "deployment" + + def test_docker_compose(self): + assert categorize_file("docker-compose.yml") == "deployment" + + def test_docker_compose_prod(self): + assert categorize_file("docker-compose.prod.yml") == "deployment" + + def test_regular_python_file(self): + assert categorize_file("src/app/models.py") is None + + def test_test_file(self): + assert categorize_file("tests/test_auth.py") is None + + def test_readme(self): + assert categorize_file("README.md") is None + + def test_plain_yaml_in_root_not_infra(self): + # Raw .yaml without an infra directory prefix must NOT match + assert categorize_file("config.yaml") is None + + def test_plain_yml_in_root_not_infra(self): + assert categorize_file("renovate.yml") is None + + def test_case_insensitive_migration(self): + assert categorize_file("Migrations/001_init.py") == "migration" + + def test_case_insensitive_terraform(self): + assert categorize_file("Terraform/main.tf") == "infra" + + +# --------------------------------------------------------------------------- +# is_structural_change +# --------------------------------------------------------------------------- + +class TestIsStructuralChange: + def test_empty_list_is_false(self): + assert not is_structural_change([]) + + def test_migration_file_is_structural(self): + assert is_structural_change(["migrations/001_add_sessions.py", "app/models.py"]) + + def test_terraform_is_structural(self): + assert is_structural_change(["terraform/main.tf", "README.md"]) + + def test_schema_only_is_structural(self): + assert is_structural_change(["openapi.yaml"]) + + def test_deployment_file_is_structural(self): + assert is_structural_change(["Dockerfile", "src/app.py"]) + + def test_regular_files_not_structural(self): + assert not is_structural_change(["src/auth.py", "tests/test_auth.py", "README.md"]) + + def test_plain_yaml_not_structural(self): + assert not is_structural_change(["config.yaml", "src/app.py"]) + + def test_mixed_structural_and_regular(self): + # One structural file is enough + assert is_structural_change(["src/app.py", "migrations/001.py", "tests/test_app.py"]) + + +# --------------------------------------------------------------------------- +# is_low_signal — with changed_files +# --------------------------------------------------------------------------- + +class TestIsLowSignalWithFiles: + def test_backward_compatible_no_changed_files(self): + # Existing callers pass no changed_files — must still work + assert is_low_signal("chore: update readme", "") + + def test_structural_files_override_title_filter(self): + # "chore: add sessions table" + empty body → normally low-signal by title + # but the migration file should rescue it + assert not is_low_signal( + "chore: add sessions table", + "", + changed_files=["migrations/001_add_sessions.py"], + ) + + def test_dep_bump_always_low_signal_even_with_structural_files(self): + # Always-low pattern wins — alembic migration doesn't rescue a dep-bump title + assert is_low_signal( + "bump alembic from 1.0 to 2.0", + "", + changed_files=["alembic/versions/migration.py"], + ) + + def test_structural_files_with_good_body_already_not_low_signal(self): + # Long body already passes — structural override is redundant but harmless + assert not is_low_signal( + "chore: schema migration", + "A" * 100, + changed_files=None, + ) + + def test_no_structural_files_low_signal_title_applies(self): + # Without structural files, the existing heuristic filters still work + assert is_low_signal("chore: tidy up imports", "", changed_files=["src/utils.py"]) + + def test_infra_files_override_low_signal(self): + assert not is_low_signal( + "fix: update terraform config", + "", + changed_files=["terraform/main.tf"], + ) + + def test_dockerfile_overrides_low_signal(self): + assert not is_low_signal( + "ci: update dockerfile", + "", + changed_files=["Dockerfile"], + ) + + +# --------------------------------------------------------------------------- +# build_changed_files_section +# --------------------------------------------------------------------------- + +class TestBuildChangedFilesSection: + def test_none_returns_no_file_list(self): + section = build_changed_files_section(None) + assert "No file list available" in section + + def test_empty_list_returns_no_file_list(self): + section = build_changed_files_section([]) + assert "No file list available" in section + + def test_migration_files_listed(self): + section = build_changed_files_section(["migrations/001_add_sessions.py"]) + assert "migrations/001_add_sessions.py" in section + assert "Database migrations" in section + + def test_infra_files_listed(self): + section = build_changed_files_section(["terraform/main.tf"]) + assert "terraform/main.tf" in section + assert "Infrastructure / IaC" in section + + def test_schema_files_listed(self): + section = build_changed_files_section(["openapi.yaml"]) + assert "openapi.yaml" in section + assert "API / Schema definitions" in section + + def test_non_structural_summarized_not_listed(self): + files = [f"src/module_{i}.py" for i in range(5)] + section = build_changed_files_section(files) + # Individual paths should NOT appear + for f in files: + assert f not in section + assert "5" in section # count is shown + + def test_mixed_structural_and_regular(self): + files = ["migrations/001.py", "src/app.py", "src/models.py"] + section = build_changed_files_section(files) + assert "migrations/001.py" in section + assert "Database migrations" in section + assert "2" in section # 2 other files summarized + + +# --------------------------------------------------------------------------- +# build_prompt — changed_files integration +# --------------------------------------------------------------------------- + +class TestBuildPromptWithFiles: + def test_no_changed_files_shows_unavailable(self): + prompt = build_prompt("Fix bug", "body", []) + assert "No file list available" in prompt + + def test_migration_files_appear_in_prompt(self): + prompt = build_prompt( + "Add sessions table", "", [], + changed_files=["migrations/001_add_sessions.py"], + ) + assert "migrations/001_add_sessions.py" in prompt + assert "Database migrations" in prompt + + def test_structural_instruction_in_prompt(self): + prompt = build_prompt("Add sessions table", "", [], + changed_files=["migrations/001_add_sessions.py"]) + assert "migrations, infrastructure" in prompt + + def test_backward_compatible_no_changed_files_param(self): + # Old call signature: build_prompt(title, body, comments) + prompt = build_prompt("Fix bug", "body", ["a review comment"]) + assert "a review comment" in prompt + assert "No file list available" in prompt diff --git a/tests/test_update.py b/tests/test_update.py index 971222c..c0c23ba 100644 --- a/tests/test_update.py +++ b/tests/test_update.py @@ -16,6 +16,7 @@ detect_repo, git_diff, git_files_changed, + git_changed_file_paths, git_log_since, load_state, save_state, @@ -305,6 +306,7 @@ def _make_pr_commit(pr_number=42): "author": {"login": "alice"}, "url": "https://github.com/acme/repo/pull/42", "reviews": [], + "changed_files": [], } @@ -393,7 +395,8 @@ def _make_direct_commit(): def _call_process_direct(commit, indexed_sources=None, n_files=1, - diff="some diff", is_low=False, extraction=None): + diff="some diff", is_low=False, extraction=None, + changed_file_paths=None): result = UpdateResult() if indexed_sources is None: indexed_sources = set() @@ -403,7 +406,8 @@ def _call_process_direct(commit, indexed_sources=None, n_files=1, mock_write = MagicMock(return_value=Path("knowledge/decisions/foo.md")) with patch("memex.update.git_files_changed", return_value=n_files), \ - patch("memex.update.git_diff", return_value=diff): + patch("memex.update.git_diff", return_value=diff), \ + patch("memex.update.git_changed_file_paths", return_value=changed_file_paths or []): _process_direct_commit( commit=commit, sha_short=commit.sha[:8], @@ -429,14 +433,37 @@ def test_process_direct_commit_already_indexed(): def test_process_direct_commit_stat_filter(): commit = _make_direct_commit() + # Non-structural files — stat filter applies + non_structural = [f"src/module_{i}.py" for i in range(MAX_FILES_CHANGED + 1)] result, mock_extract, mock_write = _call_process_direct( - commit, n_files=MAX_FILES_CHANGED + 1 + commit, n_files=MAX_FILES_CHANGED + 1, changed_file_paths=non_structural ) assert result.skipped_stat_filter == 1 mock_extract.assert_not_called() mock_write.assert_not_called() +def test_process_direct_commit_stat_filter_bypassed_for_structural(): + commit = _make_direct_commit() + # Structural migration file rescues commit from stat filter + structural = ["migrations/001_add_sessions.py"] + [f"src/m_{i}.py" for i in range(12)] + result, mock_extract, _ = _call_process_direct( + commit, n_files=MAX_FILES_CHANGED + 3, changed_file_paths=structural + ) + assert result.skipped_stat_filter == 0 + mock_extract.assert_called_once() + + +def test_process_direct_commit_stat_filter_non_structural_large(): + commit = _make_direct_commit() + non_structural = [f"src/module_{i}.py" for i in range(20)] + result, mock_extract, _ = _call_process_direct( + commit, n_files=MAX_FILES_CHANGED + 10, changed_file_paths=non_structural + ) + assert result.skipped_stat_filter == 1 + mock_extract.assert_not_called() + + def test_process_direct_commit_empty_diff(): commit = _make_direct_commit() result, mock_extract, mock_write = _call_process_direct(commit, diff="")