Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions src/products/gardener/engine/classifiers/anthropic.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import type {
ClassifyOutput,
} from "../comment.js";
import { collectTreeDigest, formatDigest } from "./tree-digest.js";
import { filterDiffNoise } from "./diff-filter.js";
import {
parseVerdictJson,
validateAndGroundNodes,
Expand All @@ -37,7 +38,7 @@ const DEFAULT_MODEL = "claude-haiku-4-5";
const ANTHROPIC_URL = "https://api.anthropic.com/v1/messages";
const ANTHROPIC_VERSION = "2023-06-01";
const MAX_TOKENS = 1024;
const DIFF_CAP = 20_000;
const DIFF_CAP = 200_000;
const FETCH_TIMEOUT_MS = 60_000;

export interface AnthropicClassifierOptions {
Expand Down Expand Up @@ -160,14 +161,17 @@ function buildUserPrompt(input: ClassifyInput, digest: string): string {
parts.push(input.prView.body);
}
if (input.diff) {
parts.push("");
parts.push("## Diff");
parts.push("```diff");
parts.push(input.diff.slice(0, DIFF_CAP));
if (input.diff.length > DIFF_CAP) {
parts.push(`... (truncated, ${input.diff.length - DIFF_CAP} bytes omitted)`);
const filtered = filterDiffNoise(input.diff);
if (filtered.length > 0) {
parts.push("");
parts.push("## Diff");
parts.push("```diff");
parts.push(filtered.slice(0, DIFF_CAP));
if (filtered.length > DIFF_CAP) {
parts.push(`... (truncated, ${filtered.length - DIFF_CAP} bytes omitted)`);
}
parts.push("```");
}
parts.push("```");
}
} else if (input.type === "issue" && input.issueView) {
parts.push(`## Issue #${input.issueView.number ?? "?"}: ${input.issueView.title ?? ""}`);
Expand Down
20 changes: 12 additions & 8 deletions src/products/gardener/engine/classifiers/claude-cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,14 @@ import type {
ClassifyOutput,
} from "../comment.js";
import { collectTreeDigest, formatDigest } from "./tree-digest.js";
import { filterDiffNoise } from "./diff-filter.js";
import {
parseVerdictJson,
validateAndGroundNodes,
} from "./verdict-parse.js";

const DEFAULT_MODEL = "claude-haiku-4-5";
const DIFF_CAP = 20_000;
const DIFF_CAP = 200_000;
const SPAWN_TIMEOUT_MS = 90_000;

export type ClaudeCliFailureKind =
Expand Down Expand Up @@ -277,14 +278,17 @@ function buildPrompt(input: ClassifyInput, digest: string): string {
parts.push(input.prView.body);
}
if (input.diff) {
parts.push("");
parts.push("## Diff");
parts.push("```diff");
parts.push(input.diff.slice(0, DIFF_CAP));
if (input.diff.length > DIFF_CAP) {
parts.push(`... (truncated, ${input.diff.length - DIFF_CAP} bytes omitted)`);
const filtered = filterDiffNoise(input.diff);
if (filtered.length > 0) {
parts.push("");
parts.push("## Diff");
parts.push("```diff");
parts.push(filtered.slice(0, DIFF_CAP));
if (filtered.length > DIFF_CAP) {
parts.push(`... (truncated, ${filtered.length - DIFF_CAP} bytes omitted)`);
}
parts.push("```");
}
parts.push("```");
}
} else if (input.type === "issue" && input.issueView) {
parts.push(`## Issue #${input.issueView.number ?? "?"}: ${input.issueView.title ?? ""}`);
Expand Down
58 changes: 58 additions & 0 deletions src/products/gardener/engine/classifiers/diff-filter.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/**
* Shared noise filter for PR diffs fed to classifiers.
*
* A PR that regenerates `pnpm-lock.yaml` or ships minified build output
* can blow past the classifier's byte cap before any real-code hunk
* reaches the model. Filter those files out of the diff BEFORE applying
* the cap, so the cap bounds real code instead of noise.
*
* Regex list originated in `src/products/tree/engine/sync.ts` (the
* `DIFF_NOISE_PATTERNS` constant used by `formatPrDiffForPrompt`) —
* extracted here so sync and both classifiers share one source of truth.
*/

/**
 * Path patterns whose diffs are treated as noise. Each regex is tested
 * against the path taken from a `diff --git` header:
 *
 *  1. well-known lockfile names, at the root or under any directory;
 *  2. any path that passes through a build-output / dependency directory
 *     (`dist/`, `build/`, `out/`, `coverage/`, `node_modules/`, `__pycache__/`);
 *  3. generated-artifact suffixes (`.lock`, `.min.js`, `.min.css`, `.map`, `.snap`).
 */
export const DIFF_NOISE_PATTERNS: readonly RegExp[] = [
  /(^|\/)(?:package-lock\.json|pnpm-lock\.yaml|yarn\.lock|Cargo\.lock|poetry\.lock|Gemfile\.lock)$/,
  /(^|\/)(?:dist|build|out|coverage|node_modules|__pycache__)\//,
  /\.(?:lock|min\.js|min\.css|map|snap)$/,
];

/**
 * True when `filename` matches any noise pattern.
 *
 * @param filename - Path to classify, using `/` separators.
 * @returns `true` if at least one entry of `DIFF_NOISE_PATTERNS` matches.
 */
export function isDiffNoise(filename: string): boolean {
  return DIFF_NOISE_PATTERNS.some((re) => re.test(filename));
}

// Zero-width split point in front of every line that begins a per-file section.
const DIFF_SECTION_START = /(?=^diff --git )/m;

// `diff --git a/<path> b/<path>` — captures the b-side path. The first `.+`
// is greedy, so the capture starts after the LAST ` b/` in the header.
const DIFF_HEADER_B_PATH = /^diff --git a\/.+ b\/(.+)$/;

/**
 * Strip noise-file sections from a unified-diff text. The input is the raw
 * output of `gh pr diff` (or equivalent) — a concatenation of per-file
 * sections each starting with `diff --git a/<path> b/<path>`.
 *
 * The parser is deliberately simple: it splits in front of every
 * `diff --git` line, extracts the `b/<path>` target filename from each
 * section header, and drops the whole section if that path is noise.
 * Anything before the first `diff --git` marker is preserved as-is.
 *
 * Sections whose header does not yield a parseable filename (for example a
 * header without `a/` / `b/` prefixes) are kept: we would rather show too
 * much than silently drop a real change.
 *
 * Headers are parsed correctly when the section consists of the header
 * line alone with no trailing newline, and when the diff uses CRLF line
 * endings.
 *
 * @param diff - Unified-diff text; may be empty.
 * @returns The diff with every noise section removed. Kept sections are
 *   returned byte-for-byte, in their original order.
 */
export function filterDiffNoise(diff: string): string {
  if (diff === "") return diff;

  // Each segment keeps its own `diff --git` line at the front. The first
  // segment is whatever precedes the first marker (usually empty).
  const parts = diff.split(DIFF_SECTION_START);
  const kept: string[] = [];
  for (const part of parts) {
    if (!part.startsWith("diff --git ")) {
      if (part.length > 0) kept.push(part);
      continue;
    }
    // A header-only section at the very end of the input has no newline;
    // the whole segment is the header then. (Slicing with the -1 that
    // `indexOf` returns would chop the last character off the path.)
    const newlineAt = part.indexOf("\n");
    const headerLine = newlineAt === -1 ? part : part.slice(0, newlineAt);
    // Drop the `\r` of a CRLF ending: `.` never matches `\r`, so leaving
    // it in place would make the header regex fail for every section.
    const header = headerLine.replace(/\r$/, "");
    const filename = header.match(DIFF_HEADER_B_PATH)?.[1]?.trim();
    if (filename && isDiffNoise(filename)) continue;
    kept.push(part);
  }
  return kept.join("");
}
2 changes: 1 addition & 1 deletion src/products/gardener/engine/classifiers/tree-digest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ export interface TreeNodeEntry {
summary: string;
}

const DIGEST_BUDGET_BYTES = 30_000;
const DIGEST_BUDGET_BYTES = 100_000;
const PER_NODE_SUMMARY_CAP = 400;
const SKIP_DIRS = new Set([
".git",
Expand Down
138 changes: 138 additions & 0 deletions tests/gardener/gardener-diff-filter.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import { describe, expect, it } from "vitest";
import {
DIFF_NOISE_PATTERNS,
filterDiffNoise,
isDiffNoise,
} from "#products/gardener/engine/classifiers/diff-filter.js";

describe("isDiffNoise", () => {
  /** Asserts that every path in `paths` is classified as `expected`. */
  const expectAll = (paths: readonly string[], expected: boolean): void => {
    for (const path of paths) {
      expect(isDiffNoise(path)).toBe(expected);
    }
  };

  it("flags lockfiles at root and in subdirs", () => {
    expectAll(
      [
        "pnpm-lock.yaml",
        "apps/web/package-lock.json",
        "rust/Cargo.lock",
        "py/poetry.lock",
      ],
      true,
    );
  });

  it("flags build output dirs", () => {
    expectAll(
      [
        "dist/index.js",
        "build/output.js",
        "coverage/lcov.info",
        "node_modules/foo/index.js",
        "__pycache__/mod.cpython-311.pyc",
      ],
      true,
    );
  });

  it("flags minified and map artifacts", () => {
    expectAll(
      [
        "vendor/jquery.min.js",
        "styles.min.css",
        "bundle.js.map",
        "snapshots/foo.snap",
      ],
      true,
    );
  });

  it("does not flag real source files", () => {
    expectAll(
      [
        "src/index.ts",
        "README.md",
        "apps/web/src/App.tsx",
        // "lock.ts" merely contains the word "lock"; the `.lock` pattern
        // only matches it as a file extension, so this must stay clean.
        "lock.ts",
      ],
      false,
    );
  });

  it("exports a non-empty pattern list", () => {
    expect(DIFF_NOISE_PATTERNS.length).toBeGreaterThan(0);
  });
});

describe("filterDiffNoise", () => {
  /** Joins `lines` with "\n" and terminates the text with a final newline. */
  const toDiff = (...lines: string[]): string => [...lines, ""].join("\n");

  /** Minimal one-line-change section for `path`: header, hunk marker, -/+ lines. */
  const section = (path: string, removed: string, added: string): string[] => [
    `diff --git a/${path} b/${path}`,
    "@@ -1 +1 @@",
    `-${removed}`,
    `+${added}`,
  ];

  it("returns empty diff untouched", () => {
    expect(filterDiffNoise("")).toBe("");
  });

  it("drops a lockfile hunk while keeping real-code hunks", () => {
    // Full-form sections here, including the index and ---/+++ lines.
    const input = toDiff(
      "diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml",
      "index 111..222 100644",
      "--- a/pnpm-lock.yaml",
      "+++ b/pnpm-lock.yaml",
      "@@ -1,1 +1,1 @@",
      "-old",
      "+new",
      "diff --git a/src/index.ts b/src/index.ts",
      "index 333..444 100644",
      "--- a/src/index.ts",
      "+++ b/src/index.ts",
      "@@ -1,1 +1,1 @@",
      "-export const x = 1;",
      "+export const x = 2;",
    );
    const result = filterDiffNoise(input);
    expect(result).not.toContain("pnpm-lock.yaml");
    expect(result).toContain("src/index.ts");
    expect(result).toContain("export const x = 2;");
  });

  it("drops dist/ hunks", () => {
    const input = toDiff(
      ...section("dist/bundle.js", "a", "b"),
      ...section("src/app.ts", "c", "d"),
    );
    const result = filterDiffNoise(input);
    expect(result).not.toContain("dist/bundle.js");
    expect(result).toContain("src/app.ts");
  });

  it("keeps entire diff when nothing is noise", () => {
    const input = toDiff(
      ...section("src/a.ts", "x", "y"),
      ...section("src/b.ts", "m", "n"),
    );
    expect(filterDiffNoise(input)).toBe(input);
  });

  it("returns empty when every hunk is noise", () => {
    const input = toDiff(
      ...section("pnpm-lock.yaml", "a", "b"),
      ...section("dist/out.js", "c", "d"),
    );
    expect(filterDiffNoise(input).trim()).toBe("");
  });

  it("preserves hunks with unparseable headers (fail-open)", () => {
    const input = toDiff(
      "diff --git malformed-header-no-paths",
      "some body",
      ...section("src/ok.ts", "x", "y"),
    );
    const result = filterDiffNoise(input);
    // The malformed section must survive rather than vanish silently.
    expect(result).toContain("malformed-header-no-paths");
    expect(result).toContain("src/ok.ts");
  });
});
Loading