Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 56 additions & 5 deletions src/application/services/ExtractService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@ import type {
import type { IGitTriageService } from '../../domain/interfaces/IGitTriageService';
import type { IMemoryRepository } from '../../domain/interfaces/IMemoryRepository';
import type { IGitClient } from '../../domain/interfaces/IGitClient';
import type { ITrailerService } from '../../domain/interfaces/ITrailerService';
import type { ILLMClient, ILLMExtractedFact } from '../../domain/interfaces/ILLMClient';
import type { MemoryType } from '../../domain/entities/IMemoryEntity';
import type { ConfidenceLevel } from '../../domain/types/IMemoryQuality';
import { isValidConfidence } from '../../domain/types/IMemoryQuality';
import type { IPatternMatch } from '../../infrastructure/services/patterns/HeuristicPatterns';
import { extractPatternMatches } from '../../infrastructure/services/patterns/HeuristicPatterns';
import { extractWords, jaccardSimilarity } from '../../domain/utils/deduplication';
import { AI_TRAILER_KEYS } from '../../domain/entities/ITrailer';
import type { ILogger } from '../../domain/interfaces/ILogger';

/** Maximum diff length sent to LLM (chars). Truncated at line boundary. */
Expand All @@ -30,13 +32,21 @@ const MAX_DIFF_LENGTH = 15_000;
/** Jaccard similarity threshold for deduplication between heuristic and LLM facts. */
const DEDUP_THRESHOLD = 0.7;

/** Uniform fact shape for merging heuristic and LLM results. */
/** Trailer key → MemoryType mapping. */
const TRAILER_KEY_TO_MEMORY_TYPE: Record<string, MemoryType> = {
[AI_TRAILER_KEYS.DECISION]: 'decision',
[AI_TRAILER_KEYS.GOTCHA]: 'gotcha',
[AI_TRAILER_KEYS.CONVENTION]: 'convention',
[AI_TRAILER_KEYS.FACT]: 'fact',
};

/** Uniform fact shape for merging heuristic, LLM, and trailer results. */
interface IUnifiedFact {
readonly content: string;
readonly type: MemoryType;
readonly confidence: ConfidenceLevel;
readonly tags: readonly string[];
readonly source: 'heuristic-extraction' | 'llm-enrichment';
readonly source: 'heuristic-extraction' | 'llm-enrichment' | 'commit-trailer';
}

export class ExtractService implements IExtractService {
Expand All @@ -46,6 +56,7 @@ export class ExtractService implements IExtractService {
private readonly gitClient?: IGitClient,
private readonly llmClient?: ILLMClient,
private readonly logger?: ILogger,
private readonly trailerService?: ITrailerService,
) {}

async extract(options?: IExtractOptions): Promise<IExtractResult> {
Expand Down Expand Up @@ -85,8 +96,17 @@ export class ExtractService implements IExtractService {
for (const scored of triageResult.highInterest) {
commitIndex++;
options?.onProgress?.({ phase: 'processing', current: commitIndex, total: highInterestTotal, sha: scored.commit.sha, subject: scored.commit.subject, factsExtracted: totalFactsExtracted });

// Read existing AI-* trailers from this commit (authoritative, high-confidence)
const trailerFacts = this.extractTrailerFacts(scored.commit.sha, options?.cwd);
const trailerTypes = new Set(trailerFacts.map(f => f.type));

// Heuristic extraction — skip types already covered by trailers
const text = `${scored.commit.subject}\n${scored.commit.body}`.trim();
const heuristicMatches = extractPatternMatches(text);
const allHeuristicMatches = extractPatternMatches(text);
const heuristicMatches = trailerTypes.size > 0
? allHeuristicMatches.filter(m => !trailerTypes.has(m.factType))
: allHeuristicMatches;

// LLM enrichment (if enabled)
let llmFacts: ILLMExtractedFact[] = [];
Expand Down Expand Up @@ -121,8 +141,8 @@ export class ExtractService implements IExtractService {
}
}

// Merge heuristic + LLM facts with deduplication
const mergedFacts = mergeFacts(heuristicMatches, llmFacts);
// Merge trailer + heuristic + LLM facts with deduplication
const mergedFacts = [...trailerFacts, ...mergeFacts(heuristicMatches, llmFacts)];

if (mergedFacts.length === 0) continue;

Expand Down Expand Up @@ -175,6 +195,37 @@ export class ExtractService implements IExtractService {

return result;
}

private extractTrailerFacts(sha: string, cwd?: string): IUnifiedFact[] {
if (!this.trailerService) return [];

try {
const trailers = this.trailerService.readTrailers(sha, cwd);
if (trailers.length === 0) return [];

Comment on lines +203 to +205
Copy link

Copilot AI Feb 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The extractTrailerFacts method spawns a git process for each commit via readTrailers(sha). For large liberations, this could be inefficient. Consider using queryTrailers (which can batch-query multiple commits at once) to fetch all trailer data upfront, then look up by SHA during processing. This is a performance optimization opportunity, not a blocking issue, but could significantly speed up large-scale liberations.

Suggested change
const trailers = this.trailerService.readTrailers(sha, cwd);
if (trailers.length === 0) return [];
let trailers: readonly { key: string; value: string }[] = [];
// Prefer batch-capable queryTrailers when available, fall back to per-commit readTrailers.
const trailerServiceAny = this.trailerService as any;
const queried = typeof trailerServiceAny.queryTrailers === 'function'
? trailerServiceAny.queryTrailers([sha], cwd)
: undefined;
if (Array.isArray(queried)) {
// Some implementations may return the trailers array directly for a single SHA.
trailers = queried;
} else if (queried && typeof queried === 'object') {
// Others may return a map/dictionary keyed by SHA.
const bySha = (queried as Record<string, readonly { key: string; value: string }[] | undefined>)[sha];
if (Array.isArray(bySha)) {
trailers = bySha;
}
}
if (trailers.length === 0) {
trailers = this.trailerService.readTrailers(sha, cwd);
}
if (!trailers || trailers.length === 0) return [];

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good callout. For now, readTrailers() per commit is acceptable since liberate processes high-interest commits only (typically a small subset). If this becomes a bottleneck, we can switch to queryTrailers() for batch retrieval. Deferring as a future optimization.

const facts: IUnifiedFact[] = [];
const confidence = (trailers.find(t => t.key === AI_TRAILER_KEYS.CONFIDENCE)?.value || 'high') as ConfidenceLevel;
Copy link

Copilot AI Feb 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The confidence value from trailers is cast to ConfidenceLevel without validation. If a manually-added trailer has an invalid confidence value (e.g., "AI-Confidence: very-high"), this will silently accept it and potentially cause issues downstream. Consider using the isValidConfidence helper function from domain/types/IMemoryQuality to validate the value before casting, falling back to 'high' if invalid.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed — extractTrailerFacts() now validates the confidence value using isValidConfidence() with a fallback to 'high'. See 97190a7.

const tagsStr = trailers.find(t => t.key === AI_TRAILER_KEYS.TAGS)?.value;
const tags: string[] = tagsStr ? tagsStr.split(',').map(t => t.trim()) : [];

for (const trailer of trailers) {
const type = TRAILER_KEY_TO_MEMORY_TYPE[trailer.key];
if (!type) continue;

facts.push({
content: trailer.value,
type,
confidence,
tags,
source: 'commit-trailer',
});
}

return facts;
} catch {
Comment thread
coderabbitai[bot] marked this conversation as resolved.
return [];
}
}
}

/**
Expand Down
131 changes: 130 additions & 1 deletion tests/unit/application/services/ExtractService.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,26 @@ import { GitTriageService } from '../../../../src/application/services/GitTriage
import { MemoryRepository } from '../../../../src/infrastructure/repositories/MemoryRepository';
import { NotesService } from '../../../../src/infrastructure/services/NotesService';
import { GitClient } from '../../../../src/infrastructure/git/GitClient';
import { TrailerService } from '../../../../src/infrastructure/services/TrailerService';

function git(args: string[], cwd: string): string {
return execFileSync('git', args, { encoding: 'utf8', cwd }).trim();
}

describe('ExtractService', () => {
let service: ExtractService;
let serviceWithTrailers: ExtractService;
let memoryRepo: MemoryRepository;
let repoDir: string;

before(() => {
const gitClient = new GitClient();
const triageService = new GitTriageService(gitClient);
const notesService = new NotesService();
const memoryRepo = new MemoryRepository(notesService);
memoryRepo = new MemoryRepository(notesService);
const trailerService = new TrailerService();
service = new ExtractService(triageService, memoryRepo);
serviceWithTrailers = new ExtractService(triageService, memoryRepo, gitClient, undefined, undefined, trailerService);

repoDir = mkdtempSync(join(tmpdir(), 'git-mem-extract-test-'));
git(['init'], repoDir);
Expand Down Expand Up @@ -142,4 +147,128 @@ describe('ExtractService', () => {
assert.equal(events[1].factsExtracted, 0);
});
});

describe('extract with trailers', () => {
let trailerRepoDir: string;

before(() => {
trailerRepoDir = mkdtempSync(join(tmpdir(), 'git-mem-extract-trailer-'));
git(['init'], trailerRepoDir);
git(['config', 'user.email', 'test@test.com'], trailerRepoDir);
git(['config', 'user.name', 'Test User'], trailerRepoDir);

writeFileSync(join(trailerRepoDir, 'file1.txt'), 'hello');
git(['add', '.'], trailerRepoDir);
git(['commit', '-m', 'initial commit'], trailerRepoDir);
});

after(() => {
rmSync(trailerRepoDir, { recursive: true, force: true });
});

it('should import AI-* trailers as high-confidence memories', async () => {
// Create a commit with AI-Decision trailer
writeFileSync(join(trailerRepoDir, 'trailer1.txt'), 'trailer1');
git(['add', '.'], trailerRepoDir);
const msg = 'feat: add caching layer\n\nAI-Decision: Use Redis for caching\nAI-Confidence: high';
git(['commit', '-m', msg], trailerRepoDir);

const result = await serviceWithTrailers.extract({
cwd: trailerRepoDir,
dryRun: false,
threshold: 1,
});

// Should extract the trailer as a fact
const annotation = result.annotations.find(a => a.subject === 'feat: add caching layer');
assert.ok(annotation, 'should find annotation for trailer commit');
assert.ok(annotation.factsExtracted >= 1);
assert.ok(annotation.factTypes.includes('decision'));

// Verify the memory was stored with correct source
const memories = memoryRepo.query({ cwd: trailerRepoDir });
const trailerMemory = memories.memories.find(m => m.content === 'Use Redis for caching');
assert.ok(trailerMemory);
assert.equal(trailerMemory.source, 'commit-trailer');
assert.equal(trailerMemory.type, 'decision');
});

it('should not duplicate when trailer and heuristic extract same type', async () => {
// Create a commit with both a decision trailer AND decision keywords in the message
writeFileSync(join(trailerRepoDir, 'dedup1.txt'), 'dedup1');
git(['add', '.'], trailerRepoDir);
const msg = [
'feat: migrate to PostgreSQL',
'',
'Decided to use PostgreSQL instead of MySQL because it has better JSON support.',
'',
'AI-Decision: Use PostgreSQL for persistence',
'AI-Confidence: high',
].join('\n');
git(['commit', '-m', msg], trailerRepoDir);

const result = await serviceWithTrailers.extract({
cwd: trailerRepoDir,
dryRun: false,
threshold: 1,
});

const annotation = result.annotations.find(a => a.subject === 'feat: migrate to PostgreSQL');
assert.ok(annotation);

// The trailer decision should be included but heuristic decision should be skipped
const memories = memoryRepo.query({ cwd: trailerRepoDir });
const pgMemories = memories.memories.filter(
m => m.sha === annotation.sha && m.type === 'decision'
);

// Should have exactly 1 decision (from trailer), not 2 (trailer + heuristic)
assert.equal(pgMemories.length, 1, 'should not duplicate decision from trailer + heuristic');
assert.equal(pgMemories[0].source, 'commit-trailer');
});

it('should still extract heuristic facts for types not covered by trailers', async () => {
// Commit with AI-Decision trailer but gotcha keywords in message
writeFileSync(join(trailerRepoDir, 'mixed1.txt'), 'mixed1');
git(['add', '.'], trailerRepoDir);
const msg = [
'feat: add auth middleware',
'',
'Watch out: tokens expire after 24h, must handle refresh.',
'',
'AI-Decision: Use middleware pattern for auth',
'AI-Confidence: high',
].join('\n');
git(['commit', '-m', msg], trailerRepoDir);

const result = await serviceWithTrailers.extract({
cwd: trailerRepoDir,
dryRun: false,
threshold: 1,
});

const annotation = result.annotations.find(a => a.subject === 'feat: add auth middleware');
assert.ok(annotation);
// Should have both decision (from trailer) and gotcha (from heuristic)
assert.ok(annotation.factTypes.includes('decision'));
assert.ok(annotation.factTypes.includes('gotcha'));
});

it('should work without trailerService (existing behavior preserved)', async () => {
writeFileSync(join(trailerRepoDir, 'no-svc.txt'), 'no-svc');
git(['add', '.'], trailerRepoDir);
const msg = 'feat: add validation\n\nAI-Decision: Use Zod for validation';
git(['commit', '-m', msg], trailerRepoDir);

// Service without trailerService should still work (ignores trailers)
const result = await service.extract({
cwd: trailerRepoDir,
dryRun: true,
threshold: 1,
});

assert.ok(result.commitsScanned >= 1);
assert.ok(result.durationMs >= 0);
});
});
});