From bb65d91d67d11372043e7da1c6badbea3ac9b4e4 Mon Sep 17 00:00:00 2001 From: Srikanth Rao M Date: Sat, 28 Mar 2026 17:10:37 +0530 Subject: [PATCH 1/2] feat(cli): add analysis/ module with prompt builders, parsers, and normalizers Move 9 pure-function modules from server/src/llm/ to cli/src/analysis/ as the foundation for native analysis (--native mode, Issue #238, Phase 12 v4.8.0). New modules in cli/src/analysis/: - prompt-types.ts: AnalysisResponse, PromptQualityResponse, SQLiteMessageRow, ContentBlock - prompt-constants.ts: Canonical categories and classification guidance strings - prompts.ts: buildSessionAnalysisInstructions, buildPromptQualityInstructions, buildFacetOnlyInstructions - message-format.ts: formatMessagesForAnalysis, classifyStoredUserMessage, formatSessionMetaLine - response-parsers.ts: parseAnalysisResponse, parsePromptQualityResponse, extractJsonPayload - normalize-utils.ts: levenshtein, normalizeCategory, kebabToTitleCase - friction-normalize.ts: normalizeFrictionCategory - pattern-normalize.ts: normalizePatternCategory, getPatternCategoryLabel - prompt-quality-normalize.ts: normalizePromptQualityCategory, getPQCategoryLabel, getPQCategoryType Add 9 ./analysis/* exports to cli/package.json. Move test files to cli/src/analysis/__tests__/ (5 test files, 131 tests). 
Co-Authored-By: Claude Sonnet 4.6 --- cli/package.json | 11 +- .../__tests__/friction-normalize.test.ts | 153 ++++ .../__tests__/normalize-utils.test.ts | 69 ++ .../__tests__/pattern-normalize.test.ts | 189 +++++ .../prompt-quality-normalize.test.ts | 99 +++ cli/src/analysis/__tests__/prompts.test.ts | 702 ++++++++++++++++++ cli/src/analysis/friction-normalize.ts | 56 ++ cli/src/analysis/message-format.ts | 142 ++++ cli/src/analysis/normalize-utils.ts | 87 +++ cli/src/analysis/pattern-normalize.ts | 101 +++ cli/src/analysis/prompt-constants.ts | 189 +++++ cli/src/analysis/prompt-quality-normalize.ts | 131 ++++ cli/src/analysis/prompt-types.ts | 143 ++++ cli/src/analysis/prompts.ts | 423 +++++++++++ cli/src/analysis/response-parsers.ts | 200 +++++ 15 files changed, 2694 insertions(+), 1 deletion(-) create mode 100644 cli/src/analysis/__tests__/friction-normalize.test.ts create mode 100644 cli/src/analysis/__tests__/normalize-utils.test.ts create mode 100644 cli/src/analysis/__tests__/pattern-normalize.test.ts create mode 100644 cli/src/analysis/__tests__/prompt-quality-normalize.test.ts create mode 100644 cli/src/analysis/__tests__/prompts.test.ts create mode 100644 cli/src/analysis/friction-normalize.ts create mode 100644 cli/src/analysis/message-format.ts create mode 100644 cli/src/analysis/normalize-utils.ts create mode 100644 cli/src/analysis/pattern-normalize.ts create mode 100644 cli/src/analysis/prompt-constants.ts create mode 100644 cli/src/analysis/prompt-quality-normalize.ts create mode 100644 cli/src/analysis/prompt-types.ts create mode 100644 cli/src/analysis/prompts.ts create mode 100644 cli/src/analysis/response-parsers.ts diff --git a/cli/package.json b/cli/package.json index d6b01f5..142c594 100644 --- a/cli/package.json +++ b/cli/package.json @@ -15,7 +15,16 @@ "./utils/browser": "./dist/utils/browser.js", "./constants/llm-providers": "./dist/constants/llm-providers.js", "./utils/pricing": "./dist/utils/pricing.js", - "./utils/telemetry": 
"./dist/utils/telemetry.js" + "./utils/telemetry": "./dist/utils/telemetry.js", + "./analysis/prompts": "./dist/analysis/prompts.js", + "./analysis/prompt-types": "./dist/analysis/prompt-types.js", + "./analysis/prompt-constants": "./dist/analysis/prompt-constants.js", + "./analysis/message-format": "./dist/analysis/message-format.js", + "./analysis/response-parsers": "./dist/analysis/response-parsers.js", + "./analysis/normalize-utils": "./dist/analysis/normalize-utils.js", + "./analysis/friction-normalize": "./dist/analysis/friction-normalize.js", + "./analysis/pattern-normalize": "./dist/analysis/pattern-normalize.js", + "./analysis/prompt-quality-normalize": "./dist/analysis/prompt-quality-normalize.js" }, "bin": { "code-insights": "./dist/index.js" diff --git a/cli/src/analysis/__tests__/friction-normalize.test.ts b/cli/src/analysis/__tests__/friction-normalize.test.ts new file mode 100644 index 0000000..25c7392 --- /dev/null +++ b/cli/src/analysis/__tests__/friction-normalize.test.ts @@ -0,0 +1,153 @@ +import { describe, it, expect } from 'vitest'; +import { normalizeFrictionCategory } from '../friction-normalize.js'; + +// ────────────────────────────────────────────────────── +// normalizeFrictionCategory +// ────────────────────────────────────────────────────── + +describe('normalizeFrictionCategory', () => { + // ──────────────────────────────────────────────────── + // Rule 1: Exact match (case-insensitive) + // ──────────────────────────────────────────────────── + + it('returns canonical for exact match', () => { + expect(normalizeFrictionCategory('knowledge-gap')).toBe('knowledge-gap'); + expect(normalizeFrictionCategory('wrong-approach')).toBe('wrong-approach'); + expect(normalizeFrictionCategory('stale-assumptions')).toBe('stale-assumptions'); + expect(normalizeFrictionCategory('context-loss')).toBe('context-loss'); + expect(normalizeFrictionCategory('scope-creep')).toBe('scope-creep'); + 
expect(normalizeFrictionCategory('repeated-mistakes')).toBe('repeated-mistakes'); + }); + + it('matches case-insensitively', () => { + expect(normalizeFrictionCategory('Knowledge-Gap')).toBe('knowledge-gap'); + expect(normalizeFrictionCategory('WRONG-APPROACH')).toBe('wrong-approach'); + expect(normalizeFrictionCategory('Stale-Assumptions')).toBe('stale-assumptions'); + }); + + // ──────────────────────────────────────────────────── + // Rule 2: Levenshtein distance <= 2 + // ──────────────────────────────────────────────────── + + it('normalizes typos within Levenshtein distance 2', () => { + expect(normalizeFrictionCategory('knowlede-gap')).toBe('knowledge-gap'); // distance 1 + expect(normalizeFrictionCategory('wrong-aproach')).toBe('wrong-approach'); // distance 1 + expect(normalizeFrictionCategory('scope-crepp')).toBe('scope-creep'); // distance 1 + }); + + it('does not match when Levenshtein distance > 2', () => { + // "typo-error" is distance 3 from "type-error" — too far + const result = normalizeFrictionCategory('completely-different-thing'); + expect(result).toBe('completely-different-thing'); + }); + + // ──────────────────────────────────────────────────── + // Rule 3: Substring match (significant portion) + // ──────────────────────────────────────────────────── + + it('matches when canonical is a significant substring', () => { + // "scope-creep-issue" contains "scope-creep" (11 chars, 11/17 = 0.65 > 0.5) + expect(normalizeFrictionCategory('scope-creep-issue')).toBe('scope-creep'); + }); + + it('does not match short substrings (< 5 chars)', () => { + // Very short overlaps should not trigger substring match + const result = normalizeFrictionCategory('abc'); + expect(result).toBe('abc'); + }); + + // ──────────────────────────────────────────────────── + // Rule 1.5: Explicit alias match + // ──────────────────────────────────────────────────── + + it('remaps legacy canonical categories to new taxonomy', () => { + // These were canonical in the old 
15-category taxonomy; they now map to new categories + expect(normalizeFrictionCategory('missing-dependency')).toBe('stale-assumptions'); + expect(normalizeFrictionCategory('config-drift')).toBe('stale-assumptions'); + expect(normalizeFrictionCategory('stale-cache')).toBe('stale-assumptions'); + expect(normalizeFrictionCategory('version-mismatch')).toBe('stale-assumptions'); + expect(normalizeFrictionCategory('permission-issue')).toBe('stale-assumptions'); + expect(normalizeFrictionCategory('environment-mismatch')).toBe('stale-assumptions'); + expect(normalizeFrictionCategory('race-condition')).toBe('wrong-approach'); + expect(normalizeFrictionCategory('circular-dependency')).toBe('wrong-approach'); + expect(normalizeFrictionCategory('test-failure')).toBe('wrong-approach'); + expect(normalizeFrictionCategory('type-error')).toBe('knowledge-gap'); + expect(normalizeFrictionCategory('api-misunderstanding')).toBe('knowledge-gap'); + }); + + it('remaps legacy aliases case-insensitively', () => { + expect(normalizeFrictionCategory('Missing-Dependency')).toBe('stale-assumptions'); + expect(normalizeFrictionCategory('TYPE-ERROR')).toBe('knowledge-gap'); + }); + + it('resolves all agent-orchestration alias variants to the cluster target', () => { + expect(normalizeFrictionCategory('agent-lifecycle-issue')).toBe('agent-orchestration-failure'); + expect(normalizeFrictionCategory('agent-communication-failure')).toBe('agent-orchestration-failure'); + expect(normalizeFrictionCategory('agent-communication-breakdown')).toBe('agent-orchestration-failure'); + expect(normalizeFrictionCategory('agent-lifecycle-management')).toBe('agent-orchestration-failure'); + expect(normalizeFrictionCategory('agent-shutdown-failure')).toBe('agent-orchestration-failure'); + }); + + it('resolves all rate-limit alias variants to the cluster target', () => { + expect(normalizeFrictionCategory('api-rate-limit')).toBe('rate-limit-hit'); + 
expect(normalizeFrictionCategory('rate-limiting')).toBe('rate-limit-hit'); + expect(normalizeFrictionCategory('rate-limited')).toBe('rate-limit-hit'); + }); + + it('resolves aliases case-insensitively', () => { + expect(normalizeFrictionCategory('Agent-Lifecycle-Issue')).toBe('agent-orchestration-failure'); + expect(normalizeFrictionCategory('API-RATE-LIMIT')).toBe('rate-limit-hit'); + }); + + it('does not further normalize non-canonical alias targets via Levenshtein', () => { + // "agent-orchestration-failure" is NOT in CANONICAL_FRICTION_CATEGORIES, + // but when returned as an alias target it should be returned as-is (not mangled by Levenshtein). + // Here we test the target itself — it should pass through as a novel category since it + // doesn't match any canonical via Levenshtein and isn't in the alias map as a key. + const result = normalizeFrictionCategory('agent-orchestration-failure'); + // Not canonical, not an alias key → returned as novel category (original casing) + expect(result).toBe('agent-orchestration-failure'); + }); + + it('does not further normalize "rate-limit-hit" target when passed directly', () => { + // Same as above — "rate-limit-hit" is not canonical, so if someone passes it directly + // it comes back as-is (novel category). 
+ const result = normalizeFrictionCategory('rate-limit-hit'); + expect(result).toBe('rate-limit-hit'); + }); + + // ──────────────────────────────────────────────────── + // Rule 4: Novel category (no match) + // ──────────────────────────────────────────────────── + + it('returns original for novel categories', () => { + expect(normalizeFrictionCategory('database-deadlock')).toBe('database-deadlock'); + expect(normalizeFrictionCategory('memory-leak')).toBe('memory-leak'); + expect(normalizeFrictionCategory('flaky-ci')).toBe('flaky-ci'); + }); + + it('preserves original casing for novel categories', () => { + expect(normalizeFrictionCategory('Custom-Category')).toBe('Custom-Category'); + }); + + // ──────────────────────────────────────────────────── + // All canonical categories are recognized + // ──────────────────────────────────────────────────── + + it('recognizes all 9 canonical categories', () => { + const canonicals = [ + 'wrong-approach', + 'knowledge-gap', + 'stale-assumptions', + 'incomplete-requirements', + 'context-loss', + 'scope-creep', + 'repeated-mistakes', + 'documentation-gap', + 'tooling-limitation', + ]; + for (const cat of canonicals) { + expect(normalizeFrictionCategory(cat)).toBe(cat); + } + }); +}); diff --git a/cli/src/analysis/__tests__/normalize-utils.test.ts b/cli/src/analysis/__tests__/normalize-utils.test.ts new file mode 100644 index 0000000..f242925 --- /dev/null +++ b/cli/src/analysis/__tests__/normalize-utils.test.ts @@ -0,0 +1,69 @@ +import { describe, it, expect } from 'vitest'; +import { levenshtein, normalizeCategory, kebabToTitleCase } from '../normalize-utils.js'; + +describe('levenshtein', () => { + it('returns 0 for identical strings', () => { + expect(levenshtein('abc', 'abc')).toBe(0); + }); + + it('returns correct distance for single edit', () => { + expect(levenshtein('kitten', 'sitten')).toBe(1); + }); + + it('returns correct distance for multiple edits', () => { + expect(levenshtein('kitten', 'sitting')).toBe(3); + 
}); + + it('handles empty strings', () => { + expect(levenshtein('', 'abc')).toBe(3); + expect(levenshtein('abc', '')).toBe(3); + expect(levenshtein('', '')).toBe(0); + }); +}); + +describe('normalizeCategory', () => { + const config: Parameters[1] = { + canonicalCategories: ['wrong-approach', 'knowledge-gap', 'stale-assumptions'], + aliases: { 'type-error': 'knowledge-gap', 'agent-issue': 'agent-failure' }, + }; + + it('returns canonical for exact match (case-insensitive)', () => { + expect(normalizeCategory('knowledge-gap', config)).toBe('knowledge-gap'); + expect(normalizeCategory('Knowledge-Gap', config)).toBe('knowledge-gap'); + }); + + it('resolves aliases to their target', () => { + expect(normalizeCategory('type-error', config)).toBe('knowledge-gap'); + }); + + it('resolves aliases to non-canonical cluster targets', () => { + expect(normalizeCategory('agent-issue', config)).toBe('agent-failure'); + }); + + it('normalizes via Levenshtein distance <= 2', () => { + expect(normalizeCategory('knowlede-gap', config)).toBe('knowledge-gap'); // dist 1 + }); + + it('normalizes via substring match', () => { + expect(normalizeCategory('stale-assumptions-here', config)).toBe('stale-assumptions'); + }); + + it('returns original for no match', () => { + expect(normalizeCategory('completely-unrelated', config)).toBe('completely-unrelated'); + }); +}); + +describe('kebabToTitleCase', () => { + it('converts kebab-case to Title Case', () => { + expect(kebabToTitleCase('structured-planning')).toBe('Structured Planning'); + expect(kebabToTitleCase('self-correction')).toBe('Self Correction'); + }); + + it('handles single word', () => { + expect(kebabToTitleCase('planning')).toBe('Planning'); + }); + + it('handles empty string', () => { + expect(kebabToTitleCase('')).toBe(''); + }); +}); diff --git a/cli/src/analysis/__tests__/pattern-normalize.test.ts b/cli/src/analysis/__tests__/pattern-normalize.test.ts new file mode 100644 index 0000000..9692124 --- /dev/null +++ 
b/cli/src/analysis/__tests__/pattern-normalize.test.ts @@ -0,0 +1,189 @@ +import { describe, it, expect } from 'vitest'; +import { normalizePatternCategory, getPatternCategoryLabel } from '../pattern-normalize.js'; + +// ────────────────────────────────────────────────────── +// normalizePatternCategory +// ────────────────────────────────────────────────────── + +describe('normalizePatternCategory', () => { + // ──────────────────────────────────────────────────── + // Rule 1: Exact match (case-insensitive) + // ──────────────────────────────────────────────────── + + it('returns canonical for exact match — all 8 categories', () => { + expect(normalizePatternCategory('structured-planning')).toBe('structured-planning'); + expect(normalizePatternCategory('incremental-implementation')).toBe('incremental-implementation'); + expect(normalizePatternCategory('verification-workflow')).toBe('verification-workflow'); + expect(normalizePatternCategory('systematic-debugging')).toBe('systematic-debugging'); + expect(normalizePatternCategory('self-correction')).toBe('self-correction'); + expect(normalizePatternCategory('context-gathering')).toBe('context-gathering'); + expect(normalizePatternCategory('domain-expertise')).toBe('domain-expertise'); + expect(normalizePatternCategory('effective-tooling')).toBe('effective-tooling'); + }); + + it('matches case-insensitively', () => { + expect(normalizePatternCategory('Structured-Planning')).toBe('structured-planning'); + expect(normalizePatternCategory('INCREMENTAL-IMPLEMENTATION')).toBe('incremental-implementation'); + expect(normalizePatternCategory('Self-Correction')).toBe('self-correction'); + expect(normalizePatternCategory('Domain-Expertise')).toBe('domain-expertise'); + }); + + // ──────────────────────────────────────────────────── + // Rule 1.5: Explicit alias match + // ──────────────────────────────────────────────────── + + it('resolves all structured-planning aliases', () => { + 
expect(normalizePatternCategory('task-decomposition')).toBe('structured-planning'); + expect(normalizePatternCategory('plan-first')).toBe('structured-planning'); + expect(normalizePatternCategory('upfront-planning')).toBe('structured-planning'); + expect(normalizePatternCategory('phased-approach')).toBe('structured-planning'); + expect(normalizePatternCategory('task-breakdown')).toBe('structured-planning'); + expect(normalizePatternCategory('planning-before-implementation')).toBe('structured-planning'); + }); + + it('resolves all effective-tooling aliases', () => { + expect(normalizePatternCategory('agent-delegation')).toBe('effective-tooling'); + expect(normalizePatternCategory('agent-orchestration')).toBe('effective-tooling'); + expect(normalizePatternCategory('specialized-agents')).toBe('effective-tooling'); + expect(normalizePatternCategory('multi-agent')).toBe('effective-tooling'); + expect(normalizePatternCategory('tool-leverage')).toBe('effective-tooling'); + }); + + it('resolves all verification-workflow aliases', () => { + expect(normalizePatternCategory('build-test-verify')).toBe('verification-workflow'); + expect(normalizePatternCategory('test-driven-development')).toBe('verification-workflow'); + expect(normalizePatternCategory('tdd')).toBe('verification-workflow'); + expect(normalizePatternCategory('test-first')).toBe('verification-workflow'); + expect(normalizePatternCategory('pre-commit-checks')).toBe('verification-workflow'); + }); + + it('resolves all systematic-debugging aliases', () => { + expect(normalizePatternCategory('binary-search-debugging')).toBe('systematic-debugging'); + expect(normalizePatternCategory('methodical-debugging')).toBe('systematic-debugging'); + expect(normalizePatternCategory('log-based-debugging')).toBe('systematic-debugging'); + expect(normalizePatternCategory('debugging-methodology')).toBe('systematic-debugging'); + }); + + it('resolves all self-correction aliases', () => { + 
expect(normalizePatternCategory('course-correction')).toBe('self-correction'); + expect(normalizePatternCategory('pivot-on-failure')).toBe('self-correction'); + expect(normalizePatternCategory('backtracking')).toBe('self-correction'); + }); + + it('resolves all context-gathering aliases', () => { + expect(normalizePatternCategory('code-reading-first')).toBe('context-gathering'); + expect(normalizePatternCategory('codebase-exploration')).toBe('context-gathering'); + expect(normalizePatternCategory('understanding-before-changing')).toBe('context-gathering'); + }); + + it('resolves all domain-expertise aliases', () => { + expect(normalizePatternCategory('framework-knowledge')).toBe('domain-expertise'); + expect(normalizePatternCategory('types-first')).toBe('domain-expertise'); + expect(normalizePatternCategory('type-driven-development')).toBe('domain-expertise'); + expect(normalizePatternCategory('schema-first')).toBe('domain-expertise'); + }); + + it('resolves all incremental-implementation aliases', () => { + expect(normalizePatternCategory('small-steps')).toBe('incremental-implementation'); + expect(normalizePatternCategory('iterative-building')).toBe('incremental-implementation'); + expect(normalizePatternCategory('iterative-development')).toBe('incremental-implementation'); + }); + + it('resolves aliases case-insensitively', () => { + expect(normalizePatternCategory('Task-Decomposition')).toBe('structured-planning'); + expect(normalizePatternCategory('AGENT-DELEGATION')).toBe('effective-tooling'); + expect(normalizePatternCategory('TDD')).toBe('verification-workflow'); + expect(normalizePatternCategory('Course-Correction')).toBe('self-correction'); + }); + + // ──────────────────────────────────────────────────── + // Rule 2: Levenshtein distance <= 2 + // ──────────────────────────────────────────────────── + + it('normalizes typos within Levenshtein distance 2', () => { + expect(normalizePatternCategory('self-corection')).toBe('self-correction'); // distance 1 
+ expect(normalizePatternCategory('domain-expertse')).toBe('domain-expertise'); // distance 1 + expect(normalizePatternCategory('context-gthering')).toBe('context-gathering'); // distance 1 + }); + + it('does not match when Levenshtein distance > 2', () => { + const result = normalizePatternCategory('completely-unrelated'); + expect(result).toBe('completely-unrelated'); + }); + + // ──────────────────────────────────────────────────── + // Rule 3: Substring match (significant portion) + // ──────────────────────────────────────────────────── + + it('matches when category is a significant extension of a canonical', () => { + // "self-correction-behavior" contains "self-correction" (15 chars, 15/24 = 0.625 > 0.5) + expect(normalizePatternCategory('self-correction-behavior')).toBe('self-correction'); + }); + + it('does not match short substrings (< 5 chars)', () => { + const result = normalizePatternCategory('abc'); + expect(result).toBe('abc'); + }); + + // ──────────────────────────────────────────────────── + // Rule 4: Novel category (no match) + // ──────────────────────────────────────────────────── + + it('returns original for novel categories', () => { + expect(normalizePatternCategory('pair-programming')).toBe('pair-programming'); + expect(normalizePatternCategory('mob-programming')).toBe('mob-programming'); + expect(normalizePatternCategory('rubber-duck-debugging')).toBe('rubber-duck-debugging'); + }); + + it('preserves original casing for novel categories', () => { + expect(normalizePatternCategory('Custom-Pattern')).toBe('Custom-Pattern'); + expect(normalizePatternCategory('My-Novel-Category')).toBe('My-Novel-Category'); + }); + + // ──────────────────────────────────────────────────── + // All canonical categories are recognized + // ──────────────────────────────────────────────────── + + it('recognizes all 8 canonical categories', () => { + const canonicals = [ + 'structured-planning', + 'incremental-implementation', + 'verification-workflow', + 
'systematic-debugging', + 'self-correction', + 'context-gathering', + 'domain-expertise', + 'effective-tooling', + ]; + for (const cat of canonicals) { + expect(normalizePatternCategory(cat)).toBe(cat); + } + }); +}); + +// ────────────────────────────────────────────────────── +// getPatternCategoryLabel +// ────────────────────────────────────────────────────── + +describe('getPatternCategoryLabel', () => { + it('returns human-readable labels for all canonical categories', () => { + expect(getPatternCategoryLabel('structured-planning')).toBe('Structured Planning'); + expect(getPatternCategoryLabel('incremental-implementation')).toBe('Incremental Implementation'); + expect(getPatternCategoryLabel('verification-workflow')).toBe('Verification Workflow'); + expect(getPatternCategoryLabel('systematic-debugging')).toBe('Systematic Debugging'); + expect(getPatternCategoryLabel('self-correction')).toBe('Self-Correction'); + expect(getPatternCategoryLabel('context-gathering')).toBe('Context Gathering'); + expect(getPatternCategoryLabel('domain-expertise')).toBe('Domain Expertise'); + expect(getPatternCategoryLabel('effective-tooling')).toBe('Effective Tooling'); + }); + + it('converts novel kebab-case categories to Title Case', () => { + expect(getPatternCategoryLabel('pair-programming')).toBe('Pair Programming'); + expect(getPatternCategoryLabel('mob-programming')).toBe('Mob Programming'); + expect(getPatternCategoryLabel('rubber-duck-debugging')).toBe('Rubber Duck Debugging'); + }); + + it('handles single-word novel categories', () => { + expect(getPatternCategoryLabel('refactoring')).toBe('Refactoring'); + }); +}); diff --git a/cli/src/analysis/__tests__/prompt-quality-normalize.test.ts b/cli/src/analysis/__tests__/prompt-quality-normalize.test.ts new file mode 100644 index 0000000..e4662d3 --- /dev/null +++ b/cli/src/analysis/__tests__/prompt-quality-normalize.test.ts @@ -0,0 +1,99 @@ +import { describe, it, expect } from 'vitest'; +import { 
normalizePromptQualityCategory, getPQCategoryLabel, getPQCategoryType } from '../prompt-quality-normalize.js'; + +describe('normalizePromptQualityCategory', () => { + // Rule 1: Exact match + it('returns canonical for exact match', () => { + expect(normalizePromptQualityCategory('vague-request')).toBe('vague-request'); + expect(normalizePromptQualityCategory('missing-context')).toBe('missing-context'); + expect(normalizePromptQualityCategory('late-constraint')).toBe('late-constraint'); + expect(normalizePromptQualityCategory('precise-request')).toBe('precise-request'); + expect(normalizePromptQualityCategory('effective-context')).toBe('effective-context'); + expect(normalizePromptQualityCategory('productive-correction')).toBe('productive-correction'); + }); + + it('matches case-insensitively', () => { + expect(normalizePromptQualityCategory('Vague-Request')).toBe('vague-request'); + expect(normalizePromptQualityCategory('MISSING-CONTEXT')).toBe('missing-context'); + }); + + // Rule 1.5: Aliases + it('remaps common LLM variants to canonical categories', () => { + expect(normalizePromptQualityCategory('vague-instructions')).toBe('vague-request'); + expect(normalizePromptQualityCategory('unclear-request')).toBe('vague-request'); + expect(normalizePromptQualityCategory('imprecise-prompting')).toBe('vague-request'); + expect(normalizePromptQualityCategory('missing-information')).toBe('missing-context'); + expect(normalizePromptQualityCategory('insufficient-context')).toBe('missing-context'); + expect(normalizePromptQualityCategory('late-context')).toBe('late-constraint'); + expect(normalizePromptQualityCategory('late-requirements')).toBe('late-constraint'); + expect(normalizePromptQualityCategory('piecemeal-requirements')).toBe('late-constraint'); + expect(normalizePromptQualityCategory('drip-fed-requirements')).toBe('late-constraint'); + expect(normalizePromptQualityCategory('unclear-feedback')).toBe('unclear-correction'); + 
expect(normalizePromptQualityCategory('vague-correction')).toBe('unclear-correction'); + expect(normalizePromptQualityCategory('context-drift')).toBe('scope-drift'); + expect(normalizePromptQualityCategory('objective-bloat')).toBe('scope-drift'); + expect(normalizePromptQualityCategory('session-bloat')).toBe('scope-drift'); + expect(normalizePromptQualityCategory('no-acceptance-criteria')).toBe('missing-acceptance-criteria'); + expect(normalizePromptQualityCategory('undefined-done')).toBe('missing-acceptance-criteria'); + expect(normalizePromptQualityCategory('hidden-assumption')).toBe('assumption-not-surfaced'); + expect(normalizePromptQualityCategory('unstated-assumption')).toBe('assumption-not-surfaced'); + expect(normalizePromptQualityCategory('clear-request')).toBe('precise-request'); + expect(normalizePromptQualityCategory('specific-request')).toBe('precise-request'); + expect(normalizePromptQualityCategory('good-context')).toBe('effective-context'); + expect(normalizePromptQualityCategory('upfront-context')).toBe('effective-context'); + expect(normalizePromptQualityCategory('clear-correction')).toBe('productive-correction'); + expect(normalizePromptQualityCategory('effective-feedback')).toBe('productive-correction'); + }); + + // Rule 2: Levenshtein + it('normalizes typos within Levenshtein distance 2', () => { + expect(normalizePromptQualityCategory('vague-requst')).toBe('vague-request'); + expect(normalizePromptQualityCategory('scope-drft')).toBe('scope-drift'); + }); + + // Rule 4: Novel category + it('returns original for novel categories', () => { + expect(normalizePromptQualityCategory('over-delegation')).toBe('over-delegation'); + expect(normalizePromptQualityCategory('micro-management')).toBe('micro-management'); + }); + + it('recognizes all 10 canonical categories', () => { + const all = [ + 'vague-request', 'missing-context', 'late-constraint', + 'unclear-correction', 'scope-drift', 'missing-acceptance-criteria', + 'assumption-not-surfaced', 
'precise-request', 'effective-context', + 'productive-correction', + ]; + for (const cat of all) { + expect(normalizePromptQualityCategory(cat)).toBe(cat); + } + }); +}); + +describe('getPQCategoryLabel', () => { + it('returns human label for canonical categories', () => { + expect(getPQCategoryLabel('vague-request')).toBe('Vague Request'); + expect(getPQCategoryLabel('late-constraint')).toBe('Late Constraint'); + expect(getPQCategoryLabel('precise-request')).toBe('Precise Request'); + }); + + it('converts novel categories to title case', () => { + expect(getPQCategoryLabel('over-delegation')).toBe('Over Delegation'); + }); +}); + +describe('getPQCategoryType', () => { + it('returns deficit for deficit categories', () => { + expect(getPQCategoryType('vague-request')).toBe('deficit'); + expect(getPQCategoryType('late-constraint')).toBe('deficit'); + }); + + it('returns strength for strength categories', () => { + expect(getPQCategoryType('precise-request')).toBe('strength'); + expect(getPQCategoryType('effective-context')).toBe('strength'); + }); + + it('returns deficit for unknown categories', () => { + expect(getPQCategoryType('over-delegation')).toBe('deficit'); + }); +}); diff --git a/cli/src/analysis/__tests__/prompts.test.ts b/cli/src/analysis/__tests__/prompts.test.ts new file mode 100644 index 0000000..7793df5 --- /dev/null +++ b/cli/src/analysis/__tests__/prompts.test.ts @@ -0,0 +1,702 @@ +import { describe, it, expect } from 'vitest'; +import { + classifyStoredUserMessage, + formatMessagesForAnalysis, + formatSessionMetaLine, +} from '../message-format.js'; +import { + parseAnalysisResponse, + parsePromptQualityResponse, +} from '../response-parsers.js'; +import { + SHARED_ANALYST_SYSTEM_PROMPT, + buildCacheableConversationBlock, + buildSessionAnalysisInstructions, + buildPromptQualityInstructions, + buildFacetOnlyInstructions, +} from '../prompts.js'; +import type { SQLiteMessageRow } from '../prompt-types.js'; + +// 
────────────────────────────────────────────────────── +// Helpers +// ────────────────────────────────────────────────────── + +function makeMessage(overrides: Partial = {}): SQLiteMessageRow { + return { + id: 'msg-1', + session_id: 'sess-1', + type: 'user', + content: 'Hello world', + thinking: null, + tool_calls: '', + tool_results: '', + usage: null, + timestamp: '2025-06-15T10:00:00Z', + parent_id: null, + ...overrides, + }; +} + +// ────────────────────────────────────────────────────── +// classifyStoredUserMessage +// ────────────────────────────────────────────────────── + +describe('classifyStoredUserMessage', () => { + it('classifies JSON array with tool_result as tool-result', () => { + const content = '[{"type":"tool_result","tool_use_id":"toolu_abc","content":"File written successfully"}]'; + expect(classifyStoredUserMessage(content)).toBe('tool-result'); + }); + + it('classifies JSON array with multiple items including tool_result as tool-result', () => { + const content = '[{"type":"tool_result","tool_use_id":"toolu_xyz","content":"ok"},{"type":"tool_result","tool_use_id":"toolu_123","content":"done"}]'; + expect(classifyStoredUserMessage(content)).toBe('tool-result'); + }); + + it('does NOT classify a JSON array without tool_result keyword as tool-result', () => { + // A human might paste a JSON array in a message + const content = '[{"name":"Alice"},{"name":"Bob"}]'; + expect(classifyStoredUserMessage(content)).toBe('human'); + }); + + it('classifies "Here is a summary of our conversation" prefix as system-artifact', () => { + const content = 'Here is a summary of our conversation so far:\n\nWe discussed auth middleware...'; + expect(classifyStoredUserMessage(content)).toBe('system-artifact'); + }); + + it('classifies "This session is being continued" prefix as system-artifact', () => { + const content = 'This session is being continued from a previous conversation that ran out of context...'; + 
expect(classifyStoredUserMessage(content)).toBe('system-artifact'); + }); + + it('classifies single-line slash command as system-artifact', () => { + expect(classifyStoredUserMessage('/compact')).toBe('system-artifact'); + expect(classifyStoredUserMessage('/review')).toBe('system-artifact'); + expect(classifyStoredUserMessage('/test --coverage')).toBe('system-artifact'); + }); + + it('classifies two-line slash command as system-artifact', () => { + const content = '/compact\nsome brief instruction'; + expect(classifyStoredUserMessage(content)).toBe('system-artifact'); + }); + + it('does NOT classify long slash content (>2 lines) as system-artifact — avoids false positives', () => { + // A human message starting with /usr/bin/... path in a longer paragraph + const content = '/usr/bin/node is the runtime I am using.\nPlease update the shebang in the file.\nAlso fix the permissions.'; + expect(classifyStoredUserMessage(content)).toBe('human'); + }); + + it('does NOT classify /UPPERCASE as system-artifact — only /[a-z] pattern', () => { + const content = '/NotACommand'; + expect(classifyStoredUserMessage(content)).toBe('human'); + }); + + it('classifies normal human text as human', () => { + expect(classifyStoredUserMessage('Fix the auth middleware to use Hono patterns')).toBe('human'); + expect(classifyStoredUserMessage('Can you help me debug this?')).toBe('human'); + expect(classifyStoredUserMessage('')).toBe('human'); + }); + + it('classifies human message starting with [ but no tool_result as human', () => { + const content = '[Step 1] First do X\n[Step 2] Then do Y'; + expect(classifyStoredUserMessage(content)).toBe('human'); + }); +}); + +// ────────────────────────────────────────────────────── +// formatSessionMetaLine +// ────────────────────────────────────────────────────── + +describe('formatSessionMetaLine', () => { + it('returns empty string when meta is undefined', () => { + expect(formatSessionMetaLine(undefined)).toBe(''); + }); + + it('returns empty 
string when all meta fields are zero/empty', () => { + expect(formatSessionMetaLine({ compactCount: 0, autoCompactCount: 0, slashCommands: [] })).toBe(''); + }); + + it('formats auto-compact only', () => { + const result = formatSessionMetaLine({ autoCompactCount: 2 }); + expect(result).toContain('2 context compaction'); + expect(result).toContain('2 auto'); + expect(result).toContain('session exceeded context window'); + expect(result.endsWith('\n')).toBe(true); + }); + + it('formats manual compact only', () => { + const result = formatSessionMetaLine({ compactCount: 1 }); + expect(result).toContain('1 context compaction'); + expect(result).toContain('1 manual'); + expect(result).not.toContain('auto'); + }); + + it('formats both auto and manual compacts', () => { + const result = formatSessionMetaLine({ compactCount: 1, autoCompactCount: 2 }); + expect(result).toContain('3 context compaction'); + expect(result).toContain('2 auto'); + expect(result).toContain('1 manual'); + }); + + it('uses singular "compaction" for count of 1', () => { + const result = formatSessionMetaLine({ autoCompactCount: 1 }); + expect(result).toContain('1 context compaction'); + expect(result).not.toContain('compactions'); + }); + + it('uses plural "compactions" for count > 1', () => { + const result = formatSessionMetaLine({ autoCompactCount: 3 }); + expect(result).toContain('3 context compactions'); + }); + + it('formats slash commands only', () => { + const result = formatSessionMetaLine({ slashCommands: ['/review', '/test'] }); + expect(result).toContain('slash commands used: /review, /test'); + expect(result).not.toContain('compaction'); + }); + + it('formats compacts and slash commands together', () => { + const result = formatSessionMetaLine({ + autoCompactCount: 1, + slashCommands: ['/compact', '/review'], + }); + expect(result).toContain('Context signals:'); + expect(result).toContain('context compaction'); + expect(result).toContain('slash commands used:'); + }); +}); + +// 
────────────────────────────────────────────────────── +// formatMessagesForAnalysis +// ────────────────────────────────────────────────────── + +describe('formatMessagesForAnalysis', () => { + it('produces readable text with role labels', () => { + const messages = [ + makeMessage({ type: 'user', content: 'Fix the bug' }), + makeMessage({ id: 'msg-2', type: 'assistant', content: 'Done!' }), + ]; + const result = formatMessagesForAnalysis(messages); + expect(result).toContain('### User#0:'); + expect(result).toContain('Fix the bug'); + expect(result).toContain('### Assistant#0:'); + expect(result).toContain('Done!'); + }); + + it('increments user and assistant indices independently', () => { + const messages = [ + makeMessage({ type: 'user', content: 'msg 1' }), + makeMessage({ id: 'msg-2', type: 'assistant', content: 'msg 2' }), + makeMessage({ id: 'msg-3', type: 'user', content: 'msg 3' }), + makeMessage({ id: 'msg-4', type: 'assistant', content: 'msg 4' }), + ]; + const result = formatMessagesForAnalysis(messages); + expect(result).toContain('User#0'); + expect(result).toContain('Assistant#0'); + expect(result).toContain('User#1'); + expect(result).toContain('Assistant#1'); + }); + + it('includes tool call names when present', () => { + const messages = [ + makeMessage({ + type: 'assistant', + content: 'Let me read the file', + tool_calls: JSON.stringify([{ name: 'Read' }, { name: 'Write' }]), + }), + ]; + const result = formatMessagesForAnalysis(messages); + expect(result).toContain('[Tools used: Read, Write]'); + }); + + it('includes thinking content when present', () => { + const messages = [ + makeMessage({ + type: 'assistant', + content: 'The answer is 42', + thinking: 'I need to calculate this carefully', + }), + ]; + const result = formatMessagesForAnalysis(messages); + expect(result).toContain('[Thinking: I need to calculate this carefully]'); + }); + + it('includes tool results when present', () => { + const messages = [ + makeMessage({ + type: 
'assistant', + content: 'Read the file', + tool_results: JSON.stringify([{ output: 'file contents here' }]), + }), + ]; + const result = formatMessagesForAnalysis(messages); + expect(result).toContain('[Tool results: file contents here]'); + }); + + it('handles empty messages array', () => { + const result = formatMessagesForAnalysis([]); + expect(result).toBe(''); + }); + + it('handles malformed JSON in tool_calls gracefully', () => { + const messages = [ + makeMessage({ + type: 'assistant', + content: 'oops', + tool_calls: 'not valid json', + }), + ]; + // Should not throw + const result = formatMessagesForAnalysis(messages); + expect(result).toContain('oops'); + // No [Tools used:] since parse failed + expect(result).not.toContain('[Tools used:'); + }); + + it('labels tool-result user messages as [tool-result] and does NOT increment User#N', () => { + const toolResultContent = '[{"type":"tool_result","tool_use_id":"toolu_abc","content":"ok"}]'; + const messages = [ + makeMessage({ id: 'msg-1', type: 'user', content: 'First human message' }), + makeMessage({ id: 'msg-2', type: 'user', content: toolResultContent }), + makeMessage({ id: 'msg-3', type: 'user', content: 'Second human message' }), + ]; + const result = formatMessagesForAnalysis(messages); + // First and second human messages get indices 0 and 1 (tool-result in between skipped) + expect(result).toContain('### User#0:'); + expect(result).toContain('### User#1:'); + // No User#2 should appear (only 2 human messages) + expect(result).not.toContain('User#2'); + // Tool-result gets [tool-result] label + expect(result).toContain('### [tool-result]:'); + }); + + it('labels auto-compact user messages as [auto-compact] and does NOT increment User#N', () => { + const autoCompactContent = 'Here is a summary of our conversation so far:\n\nWe implemented auth...'; + const messages = [ + makeMessage({ id: 'msg-1', type: 'user', content: 'Start work' }), + makeMessage({ id: 'msg-2', type: 'user', content: 
autoCompactContent }), + makeMessage({ id: 'msg-3', type: 'user', content: 'Continue work' }), + ]; + const result = formatMessagesForAnalysis(messages); + expect(result).toContain('### User#0:'); + expect(result).toContain('### [auto-compact]:'); + expect(result).toContain('### User#1:'); + expect(result).not.toContain('User#2'); + }); + + it('labels slash command user messages as [system] (not [auto-compact]) and does NOT increment User#N', () => { + // Slash commands are system artifacts but NOT compaction events — they get [system] label. + const messages = [ + makeMessage({ id: 'msg-1', type: 'user', content: 'Start work' }), + makeMessage({ id: 'msg-2', type: 'user', content: '/compact' }), + makeMessage({ id: 'msg-3', type: 'user', content: 'Continue work' }), + ]; + const result = formatMessagesForAnalysis(messages); + expect(result).toContain('### User#0:'); + expect(result).toContain('### [system]:'); + expect(result).not.toContain('[auto-compact]'); + expect(result).toContain('### User#1:'); + expect(result).not.toContain('User#2'); + }); + + it('distinguishes [auto-compact] from [system] when both appear in same session', () => { + const autoCompactContent = 'This session is being continued from a previous conversation...'; + const messages = [ + makeMessage({ id: '1', type: 'user', content: 'Do something' }), + makeMessage({ id: '2', type: 'user', content: '/review' }), + makeMessage({ id: '3', type: 'user', content: autoCompactContent }), + makeMessage({ id: '4', type: 'user', content: 'Continue' }), + ]; + const result = formatMessagesForAnalysis(messages); + expect(result).toContain('### [system]:'); + expect(result).toContain('### [auto-compact]:'); + // User index should still count only genuine human messages (2 of them: 'Do something' + 'Continue') + expect(result).toContain('### User#0:'); + expect(result).toContain('### User#1:'); + expect(result).not.toContain('User#2'); + }); + + it('preserves User#N counter continuity across mixed message 
types', () => { + const toolResult = '[{"type":"tool_result","tool_use_id":"toolu_1","content":"done"}]'; + const messages = [ + makeMessage({ id: '1', type: 'user', content: 'Human 0' }), + makeMessage({ id: '2', type: 'user', content: toolResult }), + makeMessage({ id: '3', type: 'user', content: toolResult }), + makeMessage({ id: '4', type: 'user', content: 'Human 1' }), + makeMessage({ id: '5', type: 'assistant', content: 'Reply' }), + makeMessage({ id: '6', type: 'user', content: 'Human 2' }), + ]; + const result = formatMessagesForAnalysis(messages); + expect(result).toContain('User#0'); + expect(result).toContain('User#1'); + expect(result).toContain('User#2'); + expect(result).not.toContain('User#3'); + // Two [tool-result] blocks appear + const toolResultCount = (result.match(/\[tool-result\]/g) ?? []).length; + expect(toolResultCount).toBe(2); + }); +}); + +// ────────────────────────────────────────────────────── +// buildCacheableConversationBlock +// ────────────────────────────────────────────────────── + +describe('buildCacheableConversationBlock', () => { + it('wraps formatted messages in conversation markers', () => { + const block = buildCacheableConversationBlock('### User#0:\nHello'); + expect(block.text).toContain('--- CONVERSATION ---'); + expect(block.text).toContain('--- END CONVERSATION ---'); + expect(block.text).toContain('### User#0:\nHello'); + }); + + it('sets cache_control to ephemeral', () => { + const block = buildCacheableConversationBlock('messages'); + expect(block.cache_control).toEqual({ type: 'ephemeral' }); + }); + + it('returns type text block', () => { + const block = buildCacheableConversationBlock('messages'); + expect(block.type).toBe('text'); + }); + + it('ends with double newline to separate instruction block', () => { + const block = buildCacheableConversationBlock('messages'); + expect(block.text.endsWith('\n\n')).toBe(true); + }); +}); + +// ────────────────────────────────────────────────────── +// 
buildSessionAnalysisInstructions +// ────────────────────────────────────────────────────── + +describe('buildSessionAnalysisInstructions', () => { + it('includes project name in the instructions', () => { + const result = buildSessionAnalysisInstructions('my-app', null); + expect(result).toContain('Project: my-app'); + }); + + it('includes session summary when provided', () => { + const result = buildSessionAnalysisInstructions('my-app', 'Fixed a critical bug'); + expect(result).toContain('Session Summary: Fixed a critical bug'); + }); + + it('omits session summary line when null', () => { + const result = buildSessionAnalysisInstructions('my-app', null); + expect(result).not.toContain('Session Summary:'); + }); + + it('contains the PART 1 and PART 2 section headers', () => { + const result = buildSessionAnalysisInstructions('my-app', null); + expect(result).toContain('=== PART 1: SESSION FACETS ==='); + expect(result).toContain('=== PART 2: INSIGHTS ==='); + }); + + it('ends with json tags instruction', () => { + const result = buildSessionAnalysisInstructions('proj', null); + expect(result).toContain('...'); + }); +}); + +// ────────────────────────────────────────────────────── +// buildPromptQualityInstructions +// ────────────────────────────────────────────────────── + +describe('buildPromptQualityInstructions', () => { + const sessionMeta = { + humanMessageCount: 8, + assistantMessageCount: 12, + toolExchangeCount: 31, + }; + + it('includes project name in the instructions', () => { + const result = buildPromptQualityInstructions('my-app', sessionMeta); + expect(result).toContain('Project: my-app'); + }); + + it('formats session shape header with structured counts', () => { + const result = buildPromptQualityInstructions('my-app', sessionMeta); + expect(result).toContain('Session shape: 8 user messages, 12 assistant messages, 31 tool exchanges'); + }); + + it('handles zero tool exchanges', () => { + const result = buildPromptQualityInstructions('proj', { + 
humanMessageCount: 2, + assistantMessageCount: 2, + toolExchangeCount: 0, + }); + expect(result).toContain('2 user messages, 2 assistant messages, 0 tool exchanges'); + }); + + it('omits Context signals line when meta is not provided', () => { + const result = buildPromptQualityInstructions('proj', sessionMeta); + expect(result).not.toContain('Context signals:'); + }); + + it('includes Context signals line when meta with compactions is provided', () => { + const result = buildPromptQualityInstructions('proj', sessionMeta, { + compactCount: 1, + autoCompactCount: 2, + }); + expect(result).toContain('Context signals:'); + expect(result).toContain('context compaction'); + }); + + it('includes slash commands in Context signals when meta has slash commands', () => { + const result = buildPromptQualityInstructions('proj', sessionMeta, { + slashCommands: ['/review', '/test'], + }); + expect(result).toContain('slash commands used: /review, /test'); + }); + + it('ends with json tags instruction', () => { + const result = buildPromptQualityInstructions('proj', sessionMeta); + expect(result).toContain('...'); + }); +}); + +// ────────────────────────────────────────────────────── +// buildFacetOnlyInstructions +// ────────────────────────────────────────────────────── + +describe('buildFacetOnlyInstructions', () => { + it('includes project name', () => { + const result = buildFacetOnlyInstructions('my-app', null); + expect(result).toContain('Project: my-app'); + }); + + it('includes session summary when provided', () => { + const result = buildFacetOnlyInstructions('my-app', 'Fixed auth bug'); + expect(result).toContain('Session Summary: Fixed auth bug'); + }); + + it('omits session summary when null', () => { + const result = buildFacetOnlyInstructions('my-app', null); + expect(result).not.toContain('Session Summary:'); + }); + + it('ends with json tags instruction', () => { + const result = buildFacetOnlyInstructions('proj', null); + expect(result).toContain('...'); + }); 
+}); + +// ────────────────────────────────────────────────────── +// parseAnalysisResponse +// ────────────────────────────────────────────────────── + +describe('parseAnalysisResponse', () => { + it('parses valid JSON in tags', () => { + const response = ` +{ + "summary": { + "title": "Implemented auth", + "content": "Added login and logout", + "bullets": ["Login flow", "Logout flow"] + }, + "decisions": [], + "learnings": [] +} +`; + const result = parseAnalysisResponse(response); + expect(result.success).toBe(true); + if (!result.success) return; + expect(result.data.summary.title).toBe('Implemented auth'); + expect(result.data.summary.bullets).toHaveLength(2); + expect(result.data.decisions).toEqual([]); + expect(result.data.learnings).toEqual([]); + }); + + it('parses raw JSON without tags', () => { + const response = `{ + "summary": { "title": "Test", "content": "Content", "bullets": [] }, + "decisions": [], + "learnings": [] +}`; + const result = parseAnalysisResponse(response); + expect(result.success).toBe(true); + if (!result.success) return; + expect(result.data.summary.title).toBe('Test'); + }); + + it('returns error for completely malformed response', () => { + const result = parseAnalysisResponse('This is not JSON at all'); + expect(result.success).toBe(false); + if (result.success) return; + expect(result.error.error_type).toBe('no_json_found'); + }); + + it('returns error for JSON missing required summary.title', () => { + const response = '{ "summary": { "content": "no title" }, "decisions": [], "learnings": [] }'; + const result = parseAnalysisResponse(response); + expect(result.success).toBe(false); + if (result.success) return; + expect(result.error.error_type).toBe('invalid_structure'); + }); + + it('defaults decisions and learnings to empty arrays when missing', () => { + const response = '{ "summary": { "title": "Test", "content": "c", "bullets": [] } }'; + const result = parseAnalysisResponse(response); + expect(result.success).toBe(true); 
+ if (!result.success) return; + expect(result.data.decisions).toEqual([]); + expect(result.data.learnings).toEqual([]); + }); + + // Fix 2: LLM response structure validation — array guard tests + it('coerces decisions to [] when LLM returns a non-array string value', () => { + // LLM returned "decisions": "none" — string is truthy so || [] would NOT catch this + const response = '{ "summary": { "title": "Test", "content": "c", "bullets": [] }, "decisions": "none", "learnings": [] }'; + const result = parseAnalysisResponse(response); + expect(result.success).toBe(true); + if (!result.success) return; + // Must be an array — not the string "none" + expect(Array.isArray(result.data.decisions)).toBe(true); + expect(result.data.decisions).toEqual([]); + }); + + it('coerces learnings to [] when LLM returns a non-array value', () => { + const response = '{ "summary": { "title": "Test", "content": "c", "bullets": [] }, "decisions": [], "learnings": {} }'; + const result = parseAnalysisResponse(response); + expect(result.success).toBe(true); + if (!result.success) return; + expect(Array.isArray(result.data.learnings)).toBe(true); + expect(result.data.learnings).toEqual([]); + }); + + it('coerces facet arrays to [] when LLM returns non-array facets', () => { + // LLM returned friction_points as a string instead of an array + const response = '{ "summary": { "title": "Test", "content": "c", "bullets": [] }, "decisions": [], "learnings": [], "facets": { "friction_points": "none", "effective_patterns": null } }'; + const result = parseAnalysisResponse(response); + expect(result.success).toBe(true); + if (!result.success) return; + // Both must be arrays — .some() calls on monitors must not throw + expect(Array.isArray(result.data.facets?.friction_points)).toBe(true); + expect(Array.isArray(result.data.facets?.effective_patterns)).toBe(true); + }); +}); + +// ────────────────────────────────────────────────────── +// parsePromptQualityResponse +// 
────────────────────────────────────────────────────── + +describe('parsePromptQualityResponse', () => { + it('parses valid response with findings and takeaways', () => { + const response = `{ + "efficiency_score": 85, + "message_overhead": 2, + "assessment": "Good prompting style overall", + "takeaways": [ + { + "type": "improve", + "category": "vague-request", + "label": "Add file path to requests", + "message_ref": "User#3", + "original": "fix the bug", + "better_prompt": "Fix the null pointer in cli/src/commands/sync.ts line 42", + "why": "The original lacked enough detail to act on without guessing" + } + ], + "findings": [ + { + "category": "vague-request", + "type": "deficit", + "description": "User#3 asked to fix a bug without specifying file, function, or error message", + "message_ref": "User#3", + "impact": "medium", + "confidence": 80 + } + ], + "dimension_scores": { + "context_provision": 70, + "request_specificity": 65, + "scope_management": 90, + "information_timing": 80, + "correction_quality": 75 + } + }`; + const result = parsePromptQualityResponse(response); + expect(result.success).toBe(true); + if (!result.success) return; + expect(result.data.efficiency_score).toBe(85); + expect(result.data.takeaways).toHaveLength(1); + expect(result.data.findings).toHaveLength(1); + expect(result.data.findings[0].category).toBe('vague-request'); + expect(result.data.dimension_scores.scope_management).toBe(90); + }); + + it('clamps efficiency_score to 0-100 range', () => { + const response = '{ "efficiency_score": 150, "message_overhead": 0, "assessment": "ok", "takeaways": [], "findings": [], "dimension_scores": { "context_provision": 50, "request_specificity": 50, "scope_management": 50, "information_timing": 50, "correction_quality": 50 } }'; + const result = parsePromptQualityResponse(response); + expect(result.success).toBe(true); + if (!result.success) return; + expect(result.data.efficiency_score).toBe(100); + }); + + it('defaults missing 
dimension_scores to 50s', () => { + const response = '{ "efficiency_score": 75, "message_overhead": 0, "assessment": "ok", "takeaways": [], "findings": [] }'; + const result = parsePromptQualityResponse(response); + expect(result.success).toBe(true); + if (!result.success) return; + expect(result.data.dimension_scores.context_provision).toBe(50); + expect(result.data.dimension_scores.correction_quality).toBe(50); + }); + + it('accepts empty arrays (well-prompted session)', () => { + const response = '{ "efficiency_score": 95, "message_overhead": 0, "assessment": "Excellent session", "takeaways": [], "findings": [], "dimension_scores": { "context_provision": 95, "request_specificity": 90, "scope_management": 95, "information_timing": 95, "correction_quality": 75 } }'; + const result = parsePromptQualityResponse(response); + expect(result.success).toBe(true); + if (!result.success) return; + expect(result.data.takeaways).toHaveLength(0); + expect(result.data.findings).toHaveLength(0); + }); + + it('returns error for missing efficiency_score', () => { + const response = '{ "assessment": "no score" }'; + const result = parsePromptQualityResponse(response); + expect(result.success).toBe(false); + if (result.success) return; + expect(result.error.error_type).toBe('invalid_structure'); + }); + + it('returns error for completely invalid response', () => { + const result = parsePromptQualityResponse('not json'); + expect(result.success).toBe(false); + if (result.success) return; + expect(result.error.error_type).toBe('no_json_found'); + }); + + // Fix 2: array guard tests for parsePromptQualityResponse + it('coerces takeaways to [] when LLM returns a non-array string value', () => { + // LLM returned "takeaways": "none" — truthy string bypasses || [] coercion + const response = '{ "efficiency_score": 80, "takeaways": "none", "findings": [] }'; + const result = parsePromptQualityResponse(response); + expect(result.success).toBe(true); + if (!result.success) return; + 
expect(Array.isArray(result.data.takeaways)).toBe(true); + expect(result.data.takeaways).toEqual([]); + }); + + it('coerces findings to [] when LLM returns a non-array value (prevents .some() TypeError)', () => { + // LLM returned "findings": "none" — monitor on line 166 calls .some(), would throw without guard + const response = '{ "efficiency_score": 80, "takeaways": [], "findings": "none" }'; + const result = parsePromptQualityResponse(response); + expect(result.success).toBe(true); + if (!result.success) return; + expect(Array.isArray(result.data.findings)).toBe(true); + expect(result.data.findings).toEqual([]); + }); +}); + +// ────────────────────────────────────────────────────── +// SHARED_ANALYST_SYSTEM_PROMPT +// ────────────────────────────────────────────────────── + +describe('SHARED_ANALYST_SYSTEM_PROMPT', () => { + it('is a non-empty string', () => { + expect(typeof SHARED_ANALYST_SYSTEM_PROMPT).toBe('string'); + expect(SHARED_ANALYST_SYSTEM_PROMPT.length).toBeGreaterThan(0); + }); + + it('instructs JSON output wrapped in json tags', () => { + expect(SHARED_ANALYST_SYSTEM_PROMPT).toContain(''); + }); +}); diff --git a/cli/src/analysis/friction-normalize.ts b/cli/src/analysis/friction-normalize.ts new file mode 100644 index 0000000..2ade90a --- /dev/null +++ b/cli/src/analysis/friction-normalize.ts @@ -0,0 +1,56 @@ +// Friction category normalization. +// Clusters similar free-form friction categories to canonical ones during aggregation. + +import { CANONICAL_FRICTION_CATEGORIES } from './prompt-constants.js'; +import { normalizeCategory } from './normalize-utils.js'; + +// Explicit alias map for clustering emergent category variants. +// Targets don't need to be in CANONICAL_FRICTION_CATEGORIES — +// this clusters semantically-equivalent novel categories together. +// Insert alias lookup runs AFTER exact canonical match but BEFORE Levenshtein, +// so well-known emergent variants are clustered deterministically. 
+const FRICTION_ALIASES: Record = { + // legacy canonical → new canonical (15→9 taxonomy revision) + 'missing-dependency': 'stale-assumptions', + 'config-drift': 'stale-assumptions', + 'stale-cache': 'stale-assumptions', + 'version-mismatch': 'stale-assumptions', + 'permission-issue': 'stale-assumptions', + 'environment-mismatch': 'stale-assumptions', + 'race-condition': 'wrong-approach', + 'circular-dependency': 'wrong-approach', + 'test-failure': 'wrong-approach', + 'type-error': 'knowledge-gap', + 'api-misunderstanding': 'knowledge-gap', + // agent orchestration variants → cluster under one emergent name + 'agent-lifecycle-issue': 'agent-orchestration-failure', + 'agent-communication-failure': 'agent-orchestration-failure', + 'agent-communication-breakdown': 'agent-orchestration-failure', + 'agent-lifecycle-management': 'agent-orchestration-failure', + 'agent-shutdown-failure': 'agent-orchestration-failure', + // rate limit variants → cluster under one emergent name + 'api-rate-limit': 'rate-limit-hit', + 'rate-limiting': 'rate-limit-hit', + 'rate-limited': 'rate-limit-hit', +}; + +/** + * Normalize a friction category to the closest canonical category. + * Returns the original category if no close match is found. + * + * Matching rules (in order): + * 1. Exact match against canonical list → return as-is + * 1.5. Explicit alias match → return alias target (may be non-canonical) + * 2. Levenshtein distance <= 2 → return canonical match + * 3. Substring match (category contains canonical or vice versa) → return canonical + * 4. No match → return original (novel category) + * + * Note: alias targets in FRICTION_ALIASES bypass the canonical check intentionally. + * e.g., "agent-orchestration-failure" is not canonical but is a valid cluster target. 
+ */ +export function normalizeFrictionCategory(category: string): string { + return normalizeCategory(category, { + canonicalCategories: CANONICAL_FRICTION_CATEGORIES, + aliases: FRICTION_ALIASES, + }); +} diff --git a/cli/src/analysis/message-format.ts b/cli/src/analysis/message-format.ts new file mode 100644 index 0000000..c659f3c --- /dev/null +++ b/cli/src/analysis/message-format.ts @@ -0,0 +1,142 @@ +// SQLite message formatting utilities for LLM prompt construction. +// Extracted from prompts.ts — used by prompt generator functions in prompts.ts. + +import type { SQLiteMessageRow, SessionMetadata } from './prompt-types.js'; + +// Safely parse a JSON-encoded string field from SQLite. +// Returns defaultValue if the field is null, empty, or invalid JSON. +// Mirrors server/src/utils.ts safeParseJson — keep in sync. +function safeParseJson(value: string | null | undefined, defaultValue: T): T { + if (!value) return defaultValue; + try { + return JSON.parse(value) as T; + } catch { + return defaultValue; + } +} + +// Internal types — only used within formatMessagesForAnalysis +interface ParsedToolCall { + name?: string; +} + +interface ParsedToolResult { + output?: string; +} + +/** + * Detect the class of a stored user message from its content string. + * Operates on the DB content field (stringified), not raw JSONL. + * + * This mirrors classifyUserMessage() in cli/src/parser/jsonl.ts but works on + * stored content strings instead of parsed JSONL message objects. The DB stores + * message content as a plain string — tool-results are JSON arrays stringified, + * human text is stored as-is. + * + * Order matters — most specific checks first. + */ +export function classifyStoredUserMessage(content: string): 'human' | 'tool-result' | 'system-artifact' { + // Tool-result: content is a JSON array containing tool_result blocks. + // The DB stores these as stringified JSON arrays starting with '['. 
+ if (content.startsWith('[') && content.includes('"tool_result"')) return 'tool-result'; + + // Auto-compact summary: Claude Code uses two known prefixes for LLM-initiated + // context compaction summaries. Both must be checked. + if (content.startsWith('Here is a summary of our conversation')) return 'system-artifact'; + if (content.startsWith('This session is being continued')) return 'system-artifact'; + + // Slash command or skill load: single-line starting with / followed by a lowercase letter. + // Requires content.trim() to be short (≤2 lines) to avoid false-positives on messages + // containing file paths like "/usr/bin/..." as part of a longer instruction. + const trimmed = content.trim(); + if (/^\/[a-z]/.test(trimmed) && trimmed.split('\n').length <= 2) return 'system-artifact'; + + return 'human'; +} + +/** + * Format SQLite message rows for LLM consumption. + * Handles snake_case fields and JSON-encoded tool_calls/tool_results. + * + * User#N indices only increment for genuine human messages. Tool-results and + * system artifacts (auto-compacts, slash commands) receive bracketed labels + * instead. This ensures User#N references in PQ takeaways and evidence fields + * align with actual human turns, not inflated by tool-result rows. + */ +export function formatMessagesForAnalysis(messages: SQLiteMessageRow[]): string { + let userIndex = 0; + let assistantIndex = 0; + + return messages + .map((m) => { + let roleLabel: string; + + if (m.type === 'user') { + const msgClass = classifyStoredUserMessage(m.content); + if (msgClass === 'tool-result') { + roleLabel = '[tool-result]'; + } else if (msgClass === 'system-artifact') { + // Auto-compact summaries use two known prefixes — everything else (slash commands, + // skill loads) is a generic system artifact, not a compaction event. + const isAutoCompact = m.content.startsWith('Here is a summary of our conversation') + || m.content.startsWith('This session is being continued'); + roleLabel = isAutoCompact ? 
'[auto-compact]' : '[system]'; + } else { + // Genuine human message — increment counter + roleLabel = `User#${userIndex++}`; + } + } else if (m.type === 'assistant') { + roleLabel = `Assistant#${assistantIndex++}`; + } else { + roleLabel = 'System'; + } + + // Parse JSON-encoded tool_calls and tool_results via safeParseJson + const toolCalls = safeParseJson(m.tool_calls, []); + const toolResults = safeParseJson(m.tool_results, []); + + const toolInfo = toolCalls.length > 0 + ? `\n[Tools used: ${toolCalls.map(t => t.name || 'unknown').join(', ')}]` + : ''; + + // Include thinking content — capped at 1000 chars to stay within token budget + const thinkingInfo = m.thinking + ? `\n[Thinking: ${m.thinking.slice(0, 1000)}]` + : ''; + + // Include tool results for context — 500 chars per result (error messages need ~300-400 chars) + const resultInfo = toolResults.length > 0 + ? `\n[Tool results: ${toolResults.map(r => (r.output || '').slice(0, 500)).join(' | ')}]` + : ''; + + return `### ${roleLabel}:\n${m.content}${thinkingInfo}${toolInfo}${resultInfo}`; + }) + .join('\n\n'); +} + +/** + * Format a one-line context signals header from V6 session metadata. + * Returns empty string when no signals are present (pre-V6 sessions with NULL columns). + * + * Example output: + * "Context signals: 3 context compactions (2 auto, 1 manual) — session exceeded context window; slash commands used: /review, /test\n" + */ +export function formatSessionMetaLine(meta?: SessionMetadata): string { + if (!meta) return ''; + const parts: string[] = []; + + const totalCompacts = (meta.compactCount ?? 0) + (meta.autoCompactCount ?? 0); + if (totalCompacts > 0) { + const breakdown: string[] = []; + if (meta.autoCompactCount) breakdown.push(`${meta.autoCompactCount} auto`); + if (meta.compactCount) breakdown.push(`${meta.compactCount} manual`); + parts.push(`${totalCompacts} context compaction${totalCompacts > 1 ? 
's' : ''} (${breakdown.join(', ')}) — session exceeded context window`);
+  }
+
+  if (meta.slashCommands?.length) {
+    parts.push(`slash commands used: ${meta.slashCommands.join(', ')}`);
+  }
+
+  if (parts.length === 0) return '';
+  return `Context signals: ${parts.join('; ')}\n`;
+}
diff --git a/cli/src/analysis/normalize-utils.ts b/cli/src/analysis/normalize-utils.ts
new file mode 100644
index 0000000..d86e598
--- /dev/null
+++ b/cli/src/analysis/normalize-utils.ts
@@ -0,0 +1,87 @@
+// Shared normalization infrastructure for friction, pattern, and prompt-quality categories.
+// Each domain provides its own canonical list, alias map, and label map.
+
+/** Standard Levenshtein distance between two strings */
+export function levenshtein(a: string, b: string): number {
+  const m = a.length;
+  const n = b.length;
+  const dp: number[][] = Array.from({ length: m + 1 }, () => Array(n + 1).fill(0) as number[]);
+
+  for (let i = 0; i <= m; i++) dp[i][0] = i;
+  for (let j = 0; j <= n; j++) dp[0][j] = j;
+
+  for (let i = 1; i <= m; i++) {
+    for (let j = 1; j <= n; j++) {
+      const cost = a[i - 1] === b[j - 1] ? 0 : 1;
+      dp[i][j] = Math.min(
+        dp[i - 1][j] + 1,
+        dp[i][j - 1] + 1,
+        dp[i - 1][j - 1] + cost
+      );
+    }
+  }
+
+  return dp[m][n];
+}
+
+export interface NormalizerConfig {
+  /** Canonical category strings (lowercase kebab-case) */
+  canonicalCategories: readonly string[];
+  /** Maps known aliases to their target (may be non-canonical cluster targets) */
+  aliases: Record<string, string>;
+}
+
+/**
+ * Generic category normalizer. Matching rules (in order):
+ * 1. Exact match against canonical list → return as-is
+ * 1.5. Explicit alias match → return alias target (may be non-canonical)
+ * 2. Levenshtein distance <= 2 → return canonical match
+ * 3. Substring match (shorter >= 5 chars, >= 50% of longer) → return canonical
+ * 4. No match → return original (novel category)
+ */
+export function normalizeCategory(category: string, config: NormalizerConfig): string {
+  const lower = category.toLowerCase();
+
+  // 1. Exact match
+  for (const canonical of config.canonicalCategories) {
+    if (lower === canonical) return canonical;
+  }
+
+  // 1.5. Explicit alias match
+  if (config.aliases[lower]) return config.aliases[lower];
+
+  // 2. Levenshtein distance <= 2
+  let bestMatch: string | null = null;
+  let bestDistance = Infinity;
+  for (const canonical of config.canonicalCategories) {
+    const dist = levenshtein(lower, canonical);
+    if (dist <= 2 && dist < bestDistance) {
+      bestDistance = dist;
+      bestMatch = canonical;
+    }
+  }
+  if (bestMatch) return bestMatch;
+
+  // 3. Substring match — only if the shorter string is a significant portion of the longer
+  // to avoid false positives like "type" matching "type-error"
+  for (const canonical of config.canonicalCategories) {
+    const shorter = lower.length < canonical.length ? lower : canonical;
+    const longer = lower.length < canonical.length ? canonical : lower;
+    if (shorter.length >= 5 && shorter.length / longer.length >= 0.5 && longer.includes(shorter)) {
+      return canonical;
+    }
+  }
+
+  // 4. No match — novel category
+  return category;
+}
+
+/**
+ * Convert kebab-case to Title Case. Shared fallback for category label functions.
+ */
+export function kebabToTitleCase(kebab: string): string {
+  return kebab
+    .split('-')
+    .map(word => word.charAt(0).toUpperCase() + word.slice(1))
+    .join(' ');
+}
diff --git a/cli/src/analysis/pattern-normalize.ts b/cli/src/analysis/pattern-normalize.ts
new file mode 100644
index 0000000..ec32d81
--- /dev/null
+++ b/cli/src/analysis/pattern-normalize.ts
@@ -0,0 +1,101 @@
+// Effective pattern category normalization.
+// Clusters similar free-form pattern categories to canonical ones during aggregation.
+// Delegates to normalize-utils.ts for the shared levenshtein/normalizeCategory algorithm.
+
+import { CANONICAL_PATTERN_CATEGORIES } from './prompt-constants.js';
+import { normalizeCategory, kebabToTitleCase } from './normalize-utils.js';
+
+// Human-readable labels for each canonical category.
+// Used in dashboard display (e.g., "structured-planning" → "Structured Planning").
+export const PATTERN_CATEGORY_LABELS: Record<string, string> = {
+  'structured-planning': 'Structured Planning',
+  'incremental-implementation': 'Incremental Implementation',
+  'verification-workflow': 'Verification Workflow',
+  'systematic-debugging': 'Systematic Debugging',
+  'self-correction': 'Self-Correction',
+  'context-gathering': 'Context Gathering',
+  'domain-expertise': 'Domain Expertise',
+  'effective-tooling': 'Effective Tooling',
+};
+
+// Explicit alias map for clustering emergent category variants.
+// Targets don't need to be in CANONICAL_PATTERN_CATEGORIES —
+// this clusters semantically-equivalent novel categories together.
+// Alias lookup runs AFTER exact canonical match but BEFORE Levenshtein,
+// so well-known emergent variants are clustered deterministically.
+const PATTERN_ALIASES: Record<string, string> = {
+  // structured-planning variants
+  'task-decomposition': 'structured-planning',
+  'plan-first': 'structured-planning',
+  'upfront-planning': 'structured-planning',
+  'phased-approach': 'structured-planning',
+  'task-breakdown': 'structured-planning',
+  'planning-before-implementation': 'structured-planning',
+
+  // effective-tooling variants
+  'agent-delegation': 'effective-tooling',
+  'agent-orchestration': 'effective-tooling',
+  'specialized-agents': 'effective-tooling',
+  'multi-agent': 'effective-tooling',
+  'tool-leverage': 'effective-tooling',
+
+  // verification-workflow variants
+  'build-test-verify': 'verification-workflow',
+  'test-driven-development': 'verification-workflow',
+  'tdd': 'verification-workflow',
+  'test-first': 'verification-workflow',
+  'pre-commit-checks': 'verification-workflow',
+
+  // systematic-debugging variants
+  'binary-search-debugging': 'systematic-debugging',
+  'methodical-debugging': 'systematic-debugging',
+  'log-based-debugging': 'systematic-debugging',
+  'debugging-methodology': 'systematic-debugging',
+
+  // self-correction variants
+  'course-correction': 'self-correction',
+  'pivot-on-failure': 'self-correction',
+  'backtracking': 'self-correction',
+
+  // context-gathering variants
+  'code-reading-first': 'context-gathering',
+  'codebase-exploration': 'context-gathering',
+  'understanding-before-changing': 'context-gathering',
+
+  // domain-expertise variants
+  'framework-knowledge': 'domain-expertise',
+  'types-first': 'domain-expertise',
+  'type-driven-development': 'domain-expertise',
+  'schema-first': 'domain-expertise',
+
+  // incremental-implementation variants
+  'small-steps': 'incremental-implementation',
+  'iterative-building': 'incremental-implementation',
+  'iterative-development': 'incremental-implementation',
+};
+
+/**
+ * Normalize a pattern category to the closest canonical category.
+ * Returns the original category if no close match is found.
+ * + * Matching rules (in order): + * 1. Exact match against canonical list → return as-is + * 1.5. Explicit alias match → return alias target (may be non-canonical) + * 2. Levenshtein distance <= 2 → return canonical match + * 3. Substring match (category contains canonical or vice versa) → return canonical + * 4. No match → return original (novel category) + */ +export function normalizePatternCategory(category: string): string { + return normalizeCategory(category, { + canonicalCategories: CANONICAL_PATTERN_CATEGORIES, + aliases: PATTERN_ALIASES, + }); +} + +/** + * Get a human-readable label for a pattern category. + * Falls back to Title Case conversion for novel categories. + */ +export function getPatternCategoryLabel(category: string): string { + return PATTERN_CATEGORY_LABELS[category] ?? kebabToTitleCase(category); +} diff --git a/cli/src/analysis/prompt-constants.ts b/cli/src/analysis/prompt-constants.ts new file mode 100644 index 0000000..40d4f8c --- /dev/null +++ b/cli/src/analysis/prompt-constants.ts @@ -0,0 +1,189 @@ +// Canonical category arrays and classification guidance strings for LLM analysis. +// Extracted from prompts.ts — imported by normalizers and prompt generators. + +// Shared guidance for friction category and attribution classification. +// Actor-neutral category definitions describe the gap, not the actor. +// Attribution field captures who contributed to the friction for actionability. +export const FRICTION_CLASSIFICATION_GUIDANCE = ` +FRICTION CLASSIFICATION GUIDANCE: + +Each friction point captures WHAT went wrong (category + description), WHO contributed (attribution), and WHY you classified it that way (_reasoning). + +CATEGORIES — classify the TYPE of gap or obstacle: +- "wrong-approach": A strategy was pursued that didn't fit the task — wrong architecture, wrong tool, wrong pattern. Includes choosing a suboptimal tool when a better one was available. 
+- "knowledge-gap": Incorrect knowledge was applied about a library, API, framework, or language feature. The capability existed but was used wrong. +- "stale-assumptions": Work proceeded from assumptions about current state that were incorrect (stale files, changed config, different environment, tool behavior changed between versions). +- "incomplete-requirements": Instructions were missing critical context, constraints, or acceptance criteria needed to proceed correctly. +- "context-loss": Prior decisions or constraints established earlier in the session were lost or forgotten. +- "scope-creep": Work expanded beyond the boundaries of the stated task. +- "repeated-mistakes": The same or similar error occurred multiple times despite earlier correction. +- "documentation-gap": Relevant docs existed but were inaccessible or unfindable during the session. +- "tooling-limitation": The AI coding tool or its underlying model genuinely could not perform a needed action — missing file system access, unsupported language feature, context window overflow, inability to run a specific command type. Diagnostic: Could a reasonable user prompt or approach have achieved the same result? If the only workaround is unreasonably complex or loses significant fidelity, this IS a tooling-limitation. If a straightforward alternative existed → it is NOT tooling-limitation. + RECLASSIFY if any of these apply: + - Rate-limited or throttled → create "rate-limit-hit" instead + - Agent crashed or lost state → use "wrong-approach" or create "agent-orchestration-failure" + - Wrong tool chosen when a better one existed → "wrong-approach" + - User didn't know the tool could do something → "knowledge-gap" + - Tool worked differently than expected → "stale-assumptions" + +DISAMBIGUATION — use these to break ties when two categories seem to fit: +- tooling-limitation vs wrong-approach: Limitation = the tool CANNOT do it (no workaround exists). 
Wrong-approach = the tool CAN do it but a suboptimal method was chosen. +- tooling-limitation vs knowledge-gap: Limitation = the capability genuinely does not exist. Knowledge-gap = the capability exists but was applied incorrectly. +- tooling-limitation vs stale-assumptions: Limitation = permanent gap in the tool. Stale-assumptions = the tool USED TO work differently or the assumption about current behavior was wrong. +- wrong-approach vs knowledge-gap: Wrong-approach = strategic choice (chose library X over Y). Knowledge-gap = factual error (used library X's API incorrectly). +- incomplete-requirements vs context-loss: Incomplete = the information was NEVER provided. Context-loss = it WAS provided earlier but was forgotten or dropped. + +When no category fits, create a specific kebab-case category. A precise novel category is better than a vague canonical one. + +ATTRIBUTION — 3-step decision tree (follow IN ORDER): +Step 1: Is the cause external to the user-AI interaction? (missing docs, broken tooling, infra outage) → "environmental" +Step 2: Could the USER have prevented this with better input? Evidence: vague prompt, missing context, no constraints, late requirements, ambiguous correction → "user-actionable" +Step 3: User input was clear and the AI still failed → "ai-capability" +When genuinely mixed between user-actionable and ai-capability, lean "user-actionable" — this tool helps users improve. 
+ +DESCRIPTION RULES: +- One neutral sentence describing the GAP, not the actor +- Include specific details (file names, APIs, error messages) +- Frame as "Missing X caused Y" NOT "The AI failed to X" or "The user forgot to X" +- Let the attribution field carry the who`; + +export const CANONICAL_FRICTION_CATEGORIES = [ + 'wrong-approach', + 'knowledge-gap', + 'stale-assumptions', + 'incomplete-requirements', + 'context-loss', + 'scope-creep', + 'repeated-mistakes', + 'documentation-gap', + 'tooling-limitation', +] as const; + +export const CANONICAL_PATTERN_CATEGORIES = [ + 'structured-planning', + 'incremental-implementation', + 'verification-workflow', + 'systematic-debugging', + 'self-correction', + 'context-gathering', + 'domain-expertise', + 'effective-tooling', +] as const; + +export const CANONICAL_PQ_DEFICIT_CATEGORIES = [ + 'vague-request', + 'missing-context', + 'late-constraint', + 'unclear-correction', + 'scope-drift', + 'missing-acceptance-criteria', + 'assumption-not-surfaced', +] as const; + +export const CANONICAL_PQ_STRENGTH_CATEGORIES = [ + 'precise-request', + 'effective-context', + 'productive-correction', +] as const; + +export const CANONICAL_PQ_CATEGORIES = [ + ...CANONICAL_PQ_DEFICIT_CATEGORIES, + ...CANONICAL_PQ_STRENGTH_CATEGORIES, +] as const; + +export const PROMPT_QUALITY_CLASSIFICATION_GUIDANCE = ` +PROMPT QUALITY CLASSIFICATION GUIDANCE: + +Each finding captures a specific moment where the user's prompting either caused friction (deficit) or enabled productivity (strength). + +DEFICIT CATEGORIES — classify prompting problems: +- "vague-request": Request lacked specificity needed for the AI to act without guessing. Missing file paths, function names, expected behavior, or concrete details. + NOT this category if the AI had enough context to succeed but failed anyway — that is an AI capability issue, not a prompting issue. 
+ +- "missing-context": Critical background knowledge about architecture, conventions, dependencies, or current state was not provided. + NOT this category if the information was available in the codebase and the AI could have found it by reading files — that is an AI context-gathering failure. + +- "late-constraint": A requirement or constraint was provided AFTER the AI had already started implementing a different approach, causing rework. + NOT this category if the constraint was genuinely discovered during implementation (requirements changed). Only classify if the user KNEW the constraint before the session started. + +- "unclear-correction": The user told the AI its output was wrong without explaining what was wrong or why. "That's not right", "try again", "no" without context. + NOT this category if the user gave a brief but sufficient correction ("use map instead of forEach" is clear enough). + +- "scope-drift": The session objective shifted mid-conversation, or multiple unrelated objectives were addressed in one session. + NOT this category if the user is working through logically connected subtasks of one objective. + +- "missing-acceptance-criteria": The user did not define what successful completion looks like, leading to back-and-forth about whether the output meets expectations. + NOT this category for exploratory sessions where the user is discovering what they want. + +- "assumption-not-surfaced": The user held an unstated assumption that the AI could not reasonably infer from code or conversation. + NOT this category if the assumption was reasonable for the AI to make (e.g., standard coding conventions). + +STRENGTH CATEGORIES — classify prompting successes (only when notably above average): +- "precise-request": Request included enough specificity (file paths, function names, expected behavior, error messages) that the AI could act correctly on the first attempt. 
+ +- "effective-context": User proactively shared architecture, conventions, prior decisions, or current state that the AI demonstrably used to make better decisions. + +- "productive-correction": When the AI went off track, the user provided a correction that included WHAT was wrong, WHY, and enough context for the AI to redirect effectively on the next response. + +CONTRASTIVE PAIRS: +- vague-request vs missing-context: Was the problem in HOW THE TASK WAS DESCRIBED (vague-request) or WHAT BACKGROUND KNOWLEDGE WAS ABSENT (missing-context)? +- late-constraint vs missing-context: Did the user EVENTUALLY provide it in the same session? Yes → late-constraint. Never → missing-context. +- missing-context vs assumption-not-surfaced: Is this a FACT the user could have copy-pasted (missing-context), or a BELIEF/PREFERENCE they held (assumption-not-surfaced)? +- scope-drift vs missing-acceptance-criteria: Did the user try to do TOO MANY THINGS (scope-drift) or ONE THING WITHOUT DEFINING SUCCESS (missing-acceptance-criteria)? +- unclear-correction vs vague-request: Was this the user's FIRST MESSAGE about this task (vague-request) or a RESPONSE TO AI OUTPUT (unclear-correction)? + +DIMENSION SCORING (0-100): +- context_provision: How well did the user provide relevant background upfront? + 90+: Proactively shared architecture, constraints, conventions. 50-69: Notable gaps causing detours. <30: No context, AI working blind. +- request_specificity: How precise were task requests? + 90+: File paths, expected behavior, scope boundaries. 50-69: Mix of specific and vague. <30: Nearly all requests lacked detail. +- scope_management: How focused was the session? + 90+: Single clear objective, logical progression. 50-69: Some drift but primary goal met. <30: Unfocused, no clear objective. +- information_timing: Were requirements provided when needed? + 90+: All constraints front-loaded before implementation. 50-69: Some important requirements late. 
<30: Requirements drip-fed, constant corrections. +- correction_quality: How well did the user redirect the AI? + 90+: Corrections included what, why, and context. 50-69: Mix of clear and unclear. <30: Corrections gave almost no signal. + Score 75 if no corrections were needed (absence of corrections in a successful session = good prompting). + +EDGE CASES: +- Short sessions (<5 user messages): Score conservatively. Do not penalize for missing elements unnecessary in quick tasks. +- Exploration sessions: Do not penalize for missing acceptance criteria or scope drift. +- Sessions where AI performed well despite vague prompts: Still classify deficits. Impact should be "low" since no visible cost. +- Agentic/delegation sessions: If the user gave a clear high-level directive and the AI autonomously planned and executed successfully, do not penalize for low message count or lack of micro-level specificity. Effective delegation IS good prompting. Focus on the quality of the initial delegation prompt.`; + +export const EFFECTIVE_PATTERN_CLASSIFICATION_GUIDANCE = ` +EFFECTIVE PATTERN CLASSIFICATION GUIDANCE: + +Each effective pattern captures a technique or approach that contributed to a productive session outcome. + +BASELINE EXCLUSION — do NOT classify these as patterns: +- Routine file reads at session start (Read/Glob/Grep on <5 files before editing) +- Following explicit user instructions (user said "run tests" → running tests is not a pattern) +- Basic tool usage (single file edits, standard CLI commands) +- Trivial self-corrections (typo fixes, minor syntax errors caught immediately) +Only classify behavior that is NOTABLY thorough, strategic, or beyond baseline expectations. + +CATEGORIES — classify the TYPE of effective pattern: +- "structured-planning": Decomposed the task into explicit steps, defined scope boundaries, or established a plan BEFORE writing code. Signal: plan/task-list/scope-definition appears before implementation. 
+- "incremental-implementation": Work progressed in small, verifiable steps with validation between them. Signal: multiple small edits with checks between, not one large batch. +- "verification-workflow": Proactive correctness checks (builds, tests, linters, types) BEFORE considering work complete. Signal: test/build/lint commands when nothing was known broken. +- "systematic-debugging": Methodical investigation using structured techniques (binary search, log insertion, reproduction isolation). Signal: multiple targeted diagnostic steps, not random guessing. +- "self-correction": Recognized a wrong path and pivoted WITHOUT user correction. Signal: explicit acknowledgment of mistake + approach change. NOT this if the user pointed out the error. +- "context-gathering": NOTABLY thorough investigation before changes — reading 5+ files, cross-module exploration, schema/type/config review. Signal: substantial Read/Grep/Glob usage spanning multiple directories before any Edit/Write. +- "domain-expertise": Applied specific framework/API/language knowledge correctly on first attempt without searching. Signal: correct non-obvious API usage with no preceding search and no subsequent error. NOT this if files were read first — that is context-gathering. +- "effective-tooling": Leveraged advanced tool capabilities that multiplied productivity — agent delegation, parallel work, multi-file coordination, strategic mode selection. Signal: use of tool features beyond basic read/write/edit. + +CONTRASTIVE PAIRS: +- structured-planning vs incremental-implementation: Planning = DECIDING what to do (before). Incremental = HOW you execute (during). Can have one without the other. +- context-gathering vs domain-expertise: Gathering = ACTIVE INVESTIGATION (reading files). Expertise = APPLYING EXISTING KNOWLEDGE without investigation. If files were read first → context-gathering. +- verification-workflow vs systematic-debugging: Verification = PROACTIVE (checking working code). 
Debugging = REACTIVE (investigating a failure). +- self-correction vs user-directed: Self-correction = AI caught own mistake unprompted. User said "that's wrong" → NOT self-correction. + +DRIVER — 4-step decision tree (follow IN ORDER): +Step 1: Did user infrastructure enable this? (CLAUDE.md rules, agent configs, hookify hooks, custom commands, system prompts) → "user-driven" +Step 2: Did the user explicitly request this behavior? (asked for plan, requested tests, directed investigation) → "user-driven" +Step 3: Did the AI exhibit this without any user prompting or infrastructure? → "ai-driven" +Step 4: Both made distinct, identifiable contributions → "collaborative" +Use "collaborative" ONLY when you can name what EACH party contributed. If uncertain, prefer the more specific label. + +When no canonical category fits, create a specific kebab-case category (a precise novel category is better than forcing a poor fit).`; diff --git a/cli/src/analysis/prompt-quality-normalize.ts b/cli/src/analysis/prompt-quality-normalize.ts new file mode 100644 index 0000000..9dfa0b2 --- /dev/null +++ b/cli/src/analysis/prompt-quality-normalize.ts @@ -0,0 +1,131 @@ +// Prompt quality category normalization. +// Clusters similar free-form categories to canonical ones during aggregation. +// Delegates to normalize-utils.ts for the shared levenshtein/normalizeCategory algorithm. + +import { CANONICAL_PQ_CATEGORIES, CANONICAL_PQ_STRENGTH_CATEGORIES } from './prompt-constants.js'; +import { normalizeCategory, kebabToTitleCase } from './normalize-utils.js'; + +// Human-readable labels for each canonical category. 
+export const PQ_CATEGORY_LABELS: Record<string, string> = {
+  'vague-request': 'Vague Request',
+  'missing-context': 'Missing Context',
+  'late-constraint': 'Late Constraint',
+  'unclear-correction': 'Unclear Correction',
+  'scope-drift': 'Scope Drift',
+  'missing-acceptance-criteria': 'Missing Acceptance Criteria',
+  'assumption-not-surfaced': 'Assumption Not Surfaced',
+  'precise-request': 'Precise Request',
+  'effective-context': 'Effective Context',
+  'productive-correction': 'Productive Correction',
+};
+
+const STRENGTH_SET = new Set<string>(CANONICAL_PQ_STRENGTH_CATEGORIES);
+
+// Explicit alias map for clustering emergent category variants.
+// Targets don't need to be in CANONICAL_PQ_CATEGORIES —
+// this clusters semantically-equivalent novel categories together.
+// Alias lookup runs AFTER exact canonical match but BEFORE Levenshtein,
+// so well-known emergent variants are clustered deterministically.
+const PQ_ALIASES: Record<string, string> = {
+  // vague-request variants
+  'vague-instructions': 'vague-request',
+  'unclear-request': 'vague-request',
+  'imprecise-prompting': 'vague-request',
+  'ambiguous-request': 'vague-request',
+  'incomplete-request': 'vague-request',
+  'generic-request': 'vague-request',
+
+  // missing-context variants
+  'missing-information': 'missing-context',
+  'insufficient-context': 'missing-context',
+  'no-context': 'missing-context',
+  'lack-of-context': 'missing-context',
+  'missing-background': 'missing-context',
+
+  // late-constraint variants
+  'late-context': 'late-constraint',
+  'late-requirements': 'late-constraint',
+  'piecemeal-requirements': 'late-constraint',
+  'drip-fed-requirements': 'late-constraint',
+  'incremental-requirements': 'late-constraint',
+  'late-specification': 'late-constraint',
+
+  // unclear-correction variants
+  'unclear-feedback': 'unclear-correction',
+  'vague-correction': 'unclear-correction',
+  'unhelpful-correction': 'unclear-correction',
+  'vague-feedback': 'unclear-correction',
+
+  // scope-drift variants
+  'context-drift': 'scope-drift',
+  'objective-bloat': 'scope-drift',
+  'session-bloat': 'scope-drift',
+  'topic-switching': 'scope-drift',
+  'scope-creep': 'scope-drift',
+
+  // missing-acceptance-criteria variants
+  'no-acceptance-criteria': 'missing-acceptance-criteria',
+  'undefined-done': 'missing-acceptance-criteria',
+  'no-definition-of-done': 'missing-acceptance-criteria',
+  'unclear-success-criteria': 'missing-acceptance-criteria',
+
+  // assumption-not-surfaced variants
+  'hidden-assumption': 'assumption-not-surfaced',
+  'unstated-assumption': 'assumption-not-surfaced',
+  'implicit-assumption': 'assumption-not-surfaced',
+  'unspoken-expectation': 'assumption-not-surfaced',
+
+  // precise-request variants (strengths)
+  'clear-request': 'precise-request',
+  'specific-request': 'precise-request',
+  'well-specified-request': 'precise-request',
+  'detailed-request': 'precise-request',
+
+  // effective-context variants (strengths)
+  'good-context': 'effective-context',
+  'upfront-context': 'effective-context',
+  'proactive-context': 'effective-context',
+  'rich-context': 'effective-context',
+
+  // productive-correction variants (strengths)
+  'clear-correction': 'productive-correction',
+  'effective-feedback': 'productive-correction',
+  'helpful-correction': 'productive-correction',
+  'constructive-feedback': 'productive-correction',
+};
+
+/**
+ * Normalize a prompt quality category to the closest canonical category.
+ * Returns the original category if no close match is found.
+ *
+ * Matching rules (in order):
+ * 1. Exact match against canonical list → return as-is
+ * 1.5. Explicit alias match → return alias target (may be non-canonical)
+ * 2. Levenshtein distance <= 2 → return canonical match
+ * 3. Substring match (category contains canonical or vice versa) → return canonical
+ * 4. No match → return original (novel category)
+ *
+ * Note: alias targets in PQ_ALIASES bypass the canonical check intentionally.
+ */ +export function normalizePromptQualityCategory(category: string): string { + return normalizeCategory(category, { + canonicalCategories: CANONICAL_PQ_CATEGORIES, + aliases: PQ_ALIASES, + }); +} + +/** + * Get a human-readable label for a prompt quality category. + * Falls back to Title Case conversion for novel categories. + */ +export function getPQCategoryLabel(category: string): string { + return PQ_CATEGORY_LABELS[category] ?? kebabToTitleCase(category); +} + +/** + * Get the type (deficit or strength) for a prompt quality category. + * Novel categories default to deficit. + */ +export function getPQCategoryType(category: string): 'deficit' | 'strength' { + return STRENGTH_SET.has(category) ? 'strength' : 'deficit'; +} diff --git a/cli/src/analysis/prompt-types.ts b/cli/src/analysis/prompt-types.ts new file mode 100644 index 0000000..dfb5fd8 --- /dev/null +++ b/cli/src/analysis/prompt-types.ts @@ -0,0 +1,143 @@ +// Type definitions for LLM prompt analysis. +// Extracted from prompts.ts — shared by message-format.ts, response-parsers.ts, and analysis.ts. + +// SQLite row format for messages — snake_case with JSON-encoded arrays. +// This matches the shape returned by server/src/routes/messages.ts. +export interface SQLiteMessageRow { + id: string; + session_id: string; + type: 'user' | 'assistant' | 'system'; + content: string; + thinking: string | null; + tool_calls: string; // JSON-encoded ToolCall[] + tool_results: string; // JSON-encoded ToolResult[] + usage: string | null; + timestamp: string; + parent_id: string | null; +} + +/** + * Optional session metadata from V6 columns. + * Passed to prompt generators to add context signals about context compaction + * and slash command usage. Only present when at least one V6 field is non-empty. 
+ */
+export interface SessionMetadata {
+  compactCount?: number; // from sessions.compact_count (user-initiated /compact)
+  autoCompactCount?: number; // from sessions.auto_compact_count (LLM-initiated compaction)
+  slashCommands?: string[]; // from sessions.slash_commands (JSON array of command names)
+}
+
+/**
+ * A structured content block for LLM messages.
+ * Used to enable prompt caching (Anthropic ephemeral cache) and structured multi-part messages.
+ * The `cache_control` field instructs Anthropic to cache everything up to and including this block.
+ * Mirrors server/src/llm/types.ts ContentBlock — keep in sync.
+ */
+export interface ContentBlock {
+  type: 'text';
+  text: string;
+  cache_control?: { type: 'ephemeral' };
+}
+
+export interface AnalysisResponse {
+  facets?: {
+    outcome_satisfaction: string;
+    workflow_pattern: string | null;
+    had_course_correction: boolean;
+    course_correction_reason: string | null;
+    iteration_count: number;
+    friction_points: Array<{
+      _reasoning?: string;
+      category: string;
+      attribution?: string;
+      description: string;
+      severity: string;
+      resolution: string;
+    }>;
+    effective_patterns: Array<{
+      _reasoning?: string;
+      category: string;
+      description: string;
+      confidence: number;
+      driver?: 'user-driven' | 'ai-driven' | 'collaborative';
+    }>;
+  };
+  summary: {
+    title: string;
+    content: string;
+    outcome?: 'success' | 'partial' | 'abandoned' | 'blocked';
+    bullets: string[];
+  };
+  decisions: Array<{
+    title: string;
+    situation?: string;
+    choice?: string;
+    reasoning: string;
+    alternatives?: Array<{ option: string; rejected_because: string }>;
+    trade_offs?: string;
+    revisit_when?: string;
+    confidence?: number;
+    evidence?: string[];
+  }>;
+  learnings: Array<{
+    title: string;
+    symptom?: string;
+    root_cause?: string;
+    takeaway?: string;
+    applies_when?: string;
+    confidence?: number;
+    evidence?: string[];
+  }>;
+}
+
+export interface ParseError {
+  error_type: 'json_parse_error' | 'no_json_found' | 'invalid_structure';
+  error_message: string;
+  response_length: number;
+  response_preview: string;
+}
+
+export type ParseResult<T> =
+  | { success: true; data: T }
+  | { success: false; error: ParseError };
+
+export interface PromptQualityFinding {
+  category: string;
+  type: 'deficit' | 'strength';
+  description: string;
+  message_ref: string;
+  impact: 'high' | 'medium' | 'low';
+  confidence: number;
+  suggested_improvement?: string;
+}
+
+export interface PromptQualityTakeaway {
+  type: 'improve' | 'reinforce';
+  category: string;
+  label: string;
+  message_ref: string;
+  // improve fields
+  original?: string;
+  better_prompt?: string;
+  why?: string;
+  // reinforce fields
+  what_worked?: string;
+  why_effective?: string;
+}
+
+export interface PromptQualityDimensionScores {
+  context_provision: number;
+  request_specificity: number;
+  scope_management: number;
+  information_timing: number;
+  correction_quality: number;
+}
+
+export interface PromptQualityResponse {
+  efficiency_score: number;
+  message_overhead: number;
+  assessment: string;
+  takeaways: PromptQualityTakeaway[];
+  findings: PromptQualityFinding[];
+  dimension_scores: PromptQualityDimensionScores;
+}
diff --git a/cli/src/analysis/prompts.ts b/cli/src/analysis/prompts.ts
new file mode 100644
index 0000000..6d9e11c
--- /dev/null
+++ b/cli/src/analysis/prompts.ts
@@ -0,0 +1,423 @@
+// Prompt template strings and generator functions for LLM session analysis.
+// Types → prompt-types.ts, constants → prompt-constants.ts,
+// formatting → message-format.ts, parsers → response-parsers.ts.
+ +import type { SessionMetadata, ContentBlock } from './prompt-types.js'; +import { + FRICTION_CLASSIFICATION_GUIDANCE, + CANONICAL_FRICTION_CATEGORIES, + CANONICAL_PATTERN_CATEGORIES, + CANONICAL_PQ_DEFICIT_CATEGORIES, + CANONICAL_PQ_STRENGTH_CATEGORIES, + PROMPT_QUALITY_CLASSIFICATION_GUIDANCE, + EFFECTIVE_PATTERN_CLASSIFICATION_GUIDANCE, +} from './prompt-constants.js'; +import { formatSessionMetaLine } from './message-format.js'; + +// ============================================================================= +// SHARED SYSTEM PROMPT +// A minimal (~100 token) system prompt shared by all analysis calls. +// The full classification guidance and schema examples live in the instruction +// suffix (user[1]), keeping the system prompt cacheable across calls. +// ============================================================================= + +/** + * Shared system prompt for all LLM analysis calls. + * Paired with buildCacheableConversationBlock() + an analysis-specific instruction block. + */ +export const SHARED_ANALYST_SYSTEM_PROMPT = `You are a senior staff engineer analyzing an AI coding session. You will receive the conversation transcript followed by specific extraction instructions. Respond with valid JSON only, wrapped in <json>...</json> tags.`; + +// ============================================================================= +// CACHEABLE CONVERSATION BLOCK +// Wraps the formatted conversation in an Anthropic ephemeral cache block. +// CRITICAL: Must contain ONLY the formatted messages — no project name, no session +// metadata, no per-session variables. This ensures cache hits across sessions. +// ============================================================================= + +/** + * Wrap formatted conversation messages in a cacheable content block. + * The cache_control field instructs Anthropic to cache everything up to
+ * + * Non-Anthropic providers receive this as a ContentBlock[] and use + * flattenContent() to convert it to a plain string. + * + * @param formattedMessages - Output of formatMessagesForAnalysis() + */ +export function buildCacheableConversationBlock(formattedMessages: string): ContentBlock { + return { + type: 'text', + // Trailing double newline ensures the instruction block (user[1]) reads as a + // distinct section when providers flatten content blocks to a single string. + text: `--- CONVERSATION ---\n${formattedMessages}\n--- END CONVERSATION ---\n\n`, + cache_control: { type: 'ephemeral' }, + }; +} + +// ============================================================================= +// SESSION ANALYSIS INSTRUCTIONS +// The instruction suffix for session analysis calls (user[1]). +// Contains the full analyst persona, schema, and quality guidance. +// Per-session variables (project name, summary, meta) go here — NOT in the +// cached conversation block. +// ============================================================================= + +/** + * Build the instruction suffix for session analysis. + * Used as the second content block in the user message, after the cached conversation. + */ +export function buildSessionAnalysisInstructions( + projectName: string, + sessionSummary: string | null, + meta?: SessionMetadata +): string { + return `You are a senior staff engineer writing entries for a team's engineering knowledge base. You've just observed an AI-assisted coding session and your job is to extract the insights that would save another engineer time if they encountered a similar situation 6 months from now. + +Your audience is a developer who has never seen this session but works on the same codebase. They need enough context to understand WHY a decision was made, WHAT specific gotcha was discovered, and WHEN this knowledge applies. + +Project: ${projectName} +${sessionSummary ? 
`Session Summary: ${sessionSummary}\n` : ''}${formatSessionMetaLine(meta)} +=== PART 1: SESSION FACETS === +Extract these FIRST as a holistic session assessment: + +1. outcome_satisfaction: Rate the session outcome. + - "high": Task completed successfully, user satisfied + - "medium": Partial completion or minor issues + - "low": Significant problems, user frustrated + - "abandoned": Session ended without achieving the goal + +2. workflow_pattern: Identify the dominant workflow pattern (or null if unclear). + Recommended values: "plan-then-implement", "iterative-refinement", "debug-fix-verify", "explore-then-build", "direct-execution" + +3. friction_points: Identify up to 5 moments where progress was blocked or slowed (array, max 5). + Each friction point has: + - _reasoning: (REQUIRED) Your reasoning chain for category + attribution. 2-3 sentences max. Walk through the decision tree steps. This field is saved but not shown to users — use it to think before classifying. + - category: Use one of these PREFERRED categories when applicable: ${CANONICAL_FRICTION_CATEGORIES.join(', ')}. Create a new kebab-case category only when none of these fit. + - attribution: "user-actionable" (better user input would have prevented this), "ai-capability" (AI failed despite adequate input), or "environmental" (external constraint) + - description: One neutral sentence describing what happened, with specific details (file names, APIs, errors) + - severity: "high" (blocked progress for multiple turns), "medium" (caused a detour), "low" (minor hiccup) + - resolution: "resolved" (fixed in session), "workaround" (bypassed), "unresolved" (still broken) +${FRICTION_CLASSIFICATION_GUIDANCE} + +4. effective_patterns: Up to 3 techniques or approaches that worked particularly well (array, max 3). + Each has: + - _reasoning: (REQUIRED) Your reasoning chain for category + driver. 2-3 sentences max. Walk through the decision tree steps and baseline exclusion check. 
This field is saved but not shown to users — use it to think before classifying. + - category: Use one of these PREFERRED categories when applicable: structured-planning, incremental-implementation, verification-workflow, systematic-debugging, self-correction, context-gathering, domain-expertise, effective-tooling. Create a new kebab-case category only when none fit. + - description: Specific technique worth repeating (1-2 sentences with concrete detail) + - confidence: 0-100 how confident you are this is genuinely effective + - driver: Who drove this pattern — "user-driven" (user explicitly requested it), "ai-driven" (AI exhibited it without prompting), or "collaborative" (both contributed or emerged from interaction) +${EFFECTIVE_PATTERN_CLASSIFICATION_GUIDANCE} + +5. had_course_correction: true if the user redirected the AI from a wrong approach, false otherwise +6. course_correction_reason: If had_course_correction is true, briefly explain what was corrected (or null) +7. iteration_count: Number of times the user had to clarify, correct, or re-explain something + +If the session has minimal friction and straightforward execution, use empty arrays for friction_points, set outcome_satisfaction to "high", and iteration_count to 0. + +=== PART 2: INSIGHTS === +Then extract these: + +You will extract: +1. **Summary**: A narrative of what was accomplished and the outcome +2. **Decisions**: Technical choices made — with full situation context, reasoning, rejected alternatives, trade-offs, and conditions for revisiting (max 3) +3. 
**Learnings**: Technical discoveries, gotchas, debugging breakthroughs — with the observable symptom, root cause, and a transferable takeaway (max 5) + +Quality Standards: +- Only include insights you would write in a team knowledge base for future reference +- Each insight MUST reference concrete details: specific file names, library names, error messages, API endpoints, or code patterns +- Do not invent file names, APIs, errors, or details not present in the conversation +- Rate your confidence in each insight's value (0-100). Only include insights you rate 70+. +- It is better to return 0 insights in a category than to include generic or trivial ones +- If a session is straightforward with no notable decisions or learnings, say so in the summary and leave other categories empty + +Length Guidance: +- Fill every field in the schema. An empty "trade_offs" or "revisit_when" is worse than a longer response. +- Total response: stay under 2000 tokens. If you must cut, drop lower-confidence insights rather than compressing high-confidence ones. +- Evidence: 1-3 short quotes per insight, referencing turn labels. +- Prefer precision over brevity — a specific 3-sentence insight beats a vague 1-sentence insight. + +DO NOT include insights like these (too generic/trivial): +- "Used debugging techniques to fix an issue" +- "Made architectural decisions about the codebase" +- "Implemented a new feature" (the summary already covers this) +- "Used React hooks for state management" (too generic without specifics) +- "Fixed a bug in the code" (what bug? what was the root cause?) +- Anything that restates the task without adding transferable knowledge + +Here is an example of an EXCELLENT insight — this is the quality bar: + +EXCELLENT learning: +{ + "title": "Tailwind v4 requires @theme inline{} for CSS variable utilities", + "symptom": "After Tailwind v3→v4 upgrade, custom utilities like bg-primary stopped working. 
Classes present in HTML but no styles applied.", + "root_cause": "Tailwind v4 removed tailwind.config.js theme extension. CSS variables in :root are not automatically available as utilities — must be registered via @theme inline {} in the CSS file.", + "takeaway": "When migrating Tailwind v3→v4 with shadcn/ui: add @theme inline {} mapping CSS variables, add @custom-variant dark for class-based dark mode, replace tailwindcss-animate with tw-animate-css.", + "applies_when": "Any Tailwind v3→v4 migration using CSS variables for theming, especially with shadcn/ui.", + "confidence": 95, + "evidence": ["User#12: 'The colors are all gone after the upgrade'", "Assistant#13: 'Tailwind v4 requires explicit @theme inline registration...'"] +} + +Extract insights in this JSON format: +{ + "facets": { + "outcome_satisfaction": "high | medium | low | abandoned", + "workflow_pattern": "plan-then-implement | iterative-refinement | debug-fix-verify | explore-then-build | direct-execution | null", + "had_course_correction": false, + "course_correction_reason": null, + "iteration_count": 0, + "friction_points": [ + { + "_reasoning": "User said 'fix the auth' without specifying OAuth vs session-based or which file. Step 1: not external — this is about the prompt, not infrastructure. Step 2: user could have specified which auth flow → user-actionable. Category: incomplete-requirements fits better than vague-request because specific constraints (which flow, which file) were missing, not the overall task description.", + "category": "incomplete-requirements", + "attribution": "user-actionable", + "description": "Missing specification of which auth flow (OAuth vs session) caused implementation of wrong provider in auth.ts", + "severity": "medium", + "resolution": "resolved" + }, + { + "_reasoning": "AI applied Express middleware pattern to a Hono route despite conversation showing Hono imports. Step 1: not external. Step 2: user provided clear Hono context in prior messages. 
Step 3: AI failed despite adequate input → ai-capability. Category: knowledge-gap — incorrect framework API knowledge was applied.", + "category": "knowledge-gap", + "attribution": "ai-capability", + "description": "Express-style middleware pattern applied to Hono route despite Hono imports visible in conversation context", + "severity": "high", + "resolution": "resolved" + } + ], + "effective_patterns": [ + { + "_reasoning": "Before editing, AI read 8 files across server/src/routes/ and server/src/llm/ to understand the data flow. Baseline check: 8 files across 2 directories = beyond routine (<5 file) reads. Step 1: no CLAUDE.md rule requiring this. Step 2: user didn't ask for investigation. Step 3: AI explored autonomously → ai-driven. Category: context-gathering (active investigation, not pre-existing knowledge).", + "category": "context-gathering", + "description": "Read 8 files across routes/ and llm/ directories to map the data flow before modifying the aggregation query, preventing a type mismatch that would have required rework", + "confidence": 88, + "driver": "ai-driven" + } + ] + }, + "summary": { + "title": "Brief title describing main accomplishment (max 80 chars)", + "content": "2-4 sentence narrative: what was the goal, what was done, what was the outcome. 
Mention the primary file or component changed.", + "outcome": "success | partial | abandoned | blocked", + "bullets": ["Each bullet names a specific artifact (file, function, endpoint) and what changed"] + }, + "decisions": [ + { + "title": "The specific technical choice made (max 80 chars)", + "situation": "What problem or requirement led to this decision point", + "choice": "What was chosen and how it was implemented", + "reasoning": "Why this choice was made — the key factors that tipped the decision", + "alternatives": [ + {"option": "Name of alternative", "rejected_because": "Why it was not chosen"} + ], + "trade_offs": "What downsides were accepted, what was given up", + "revisit_when": "Under what conditions this decision should be reconsidered (or 'N/A' if permanent)", + "confidence": 85, + "evidence": ["User#4: quoted text...", "Assistant#5: quoted text..."] + } + ], + "learnings": [ + { + "title": "Specific technical discovery or gotcha (max 80 chars)", + "symptom": "What went wrong or was confusing — the observable behavior that triggered investigation", + "root_cause": "The underlying technical reason — why it happened", + "takeaway": "The transferable lesson — what to do or avoid in similar situations, useful outside this project", + "applies_when": "Conditions under which this knowledge is relevant (framework version, configuration, etc.)", + "confidence": 80, + "evidence": ["User#7: quoted text...", "Assistant#8: quoted text..."] + } + ] +} + +Only include insights rated 70+ confidence. If you cannot cite evidence, drop the insight. Return empty arrays for categories with no strong insights. Max 3 decisions, 5 learnings. +Evidence should reference the labeled turns in the conversation (e.g., "User#2", "Assistant#5"). + +Respond with valid JSON only, wrapped in ... tags. 
Do not include any other text.`; +} + +// ============================================================================= +// PROMPT QUALITY INSTRUCTIONS +// The instruction suffix for prompt quality analysis calls (user[1]). +// ============================================================================= + +/** + * Build the instruction suffix for prompt quality analysis. + * Used as the second content block in the user message, after the cached conversation. + */ +export function buildPromptQualityInstructions( + projectName: string, + sessionMeta: { + humanMessageCount: number; + assistantMessageCount: number; + toolExchangeCount: number; + }, + meta?: SessionMetadata +): string { + return `You are a prompt engineering coach helping developers communicate more effectively with AI coding assistants. You review conversations and identify specific moments where better prompting would have saved time — AND moments where the user prompted particularly well. + +You will produce: +1. **Takeaways**: Concrete before/after examples the user can learn from (max 4) +2. **Findings**: Categorized findings for cross-session aggregation (max 8) +3. **Dimension scores**: 5 numeric dimensions for progress tracking +4. **Efficiency score**: 0-100 overall rating +5. **Assessment**: 2-3 sentence summary + +Project: ${projectName} +Session shape: ${sessionMeta.humanMessageCount} user messages, ${sessionMeta.assistantMessageCount} assistant messages, ${sessionMeta.toolExchangeCount} tool exchanges +${formatSessionMetaLine(meta)} +Before evaluating, mentally walk through the conversation and identify: +1. Each time the assistant asked for clarification that could have been avoided +2. Each time the user corrected the assistant's interpretation +3. Each time the user repeated an instruction they gave earlier +4. Whether critical context or requirements were provided late +5. Whether the user discussed the plan/approach before implementation +6. 
Moments where the user's prompt was notably well-crafted +7. If context compactions occurred, note that the AI may have lost context — repeated instructions IMMEDIATELY after a compaction are NOT a user prompting deficit +These are your candidate findings. Only include them if they are genuinely actionable. + +${PROMPT_QUALITY_CLASSIFICATION_GUIDANCE} + +Guidelines: +- Focus on USER messages only — don't critique the assistant's responses +- Be constructive, not judgmental — the goal is to help users improve +- A score of 100 means every user message was perfectly clear and complete +- A score of 50 means about half the messages could have been more efficient +- Include BOTH deficits and strengths — what went right matters as much as what went wrong +- If the user prompted well, say so — don't manufacture issues +- If the session had context compactions, do NOT penalize the user for repeating instructions immediately after a compaction — the AI lost context, not the user. Repetition unrelated to compaction events should still be flagged. 
+ +Length Guidance: +- Max 4 takeaways (ordered: improve first, then reinforce), max 8 findings +- better_prompt must be a complete, usable prompt — not vague meta-advice +- assessment: 2-3 sentences +- Total response: stay under 2500 tokens + +Evaluate the user's prompting quality and respond with this JSON format: +{ + "efficiency_score": 75, + "message_overhead": 3, + "assessment": "2-3 sentence summary of prompting style and efficiency", + "takeaways": [ + { + "type": "improve", + "category": "late-constraint", + "label": "Short human-readable heading", + "message_ref": "User#5", + "original": "The user's original message (abbreviated)", + "better_prompt": "A concrete rewrite with the missing context included", + "why": "One sentence: why the original caused friction" + }, + { + "type": "reinforce", + "category": "precise-request", + "label": "Short human-readable heading", + "message_ref": "User#0", + "what_worked": "What the user did well", + "why_effective": "Why it led to a good outcome" + } + ], + "findings": [ + { + "category": "late-constraint", + "type": "deficit", + "description": "One neutral sentence with specific details", + "message_ref": "User#5", + "impact": "high", + "confidence": 90, + "suggested_improvement": "Concrete rewrite or behavioral change" + }, + { + "category": "precise-request", + "type": "strength", + "description": "One sentence describing what the user did well", + "message_ref": "User#0", + "impact": "medium", + "confidence": 85 + } + ], + "dimension_scores": { + "context_provision": 70, + "request_specificity": 65, + "scope_management": 80, + "information_timing": 55, + "correction_quality": 75 + } +} + +Category values — use these PREFERRED categories: +Deficits: ${CANONICAL_PQ_DEFICIT_CATEGORIES.join(', ')} +Strengths: ${CANONICAL_PQ_STRENGTH_CATEGORIES.join(', ')} +Create a new kebab-case category only when none of these fit. 
+ +Rules: +- message_ref uses the labeled turns in the conversation (e.g., "User#0", "User#5") +- Only include genuinely notable findings, not normal back-and-forth +- Takeaways are the user-facing highlights — max 4, ordered: improve first, then reinforce +- Findings are the full categorized set for aggregation — max 8 +- If the user prompted well, include strength findings and reinforce takeaways — don't manufacture issues +- message_overhead is how many fewer messages the session could have taken with better prompts +- dimension_scores: each 0-100. Score correction_quality as 75 if no corrections were needed. + +Respond with valid JSON only, wrapped in <json>...</json> tags. Do not include any other text.`; +} + +// ============================================================================= +// FACET-ONLY INSTRUCTIONS +// The instruction suffix for facet-only extraction calls (user[1]). +// ============================================================================= + +/** + * Build the instruction suffix for facet-only extraction (backfill path). + * Used as the second content block in the user message, after the cached conversation. + */ +export function buildFacetOnlyInstructions( + projectName: string, + sessionSummary: string | null, + meta?: SessionMetadata +): string { + return `You are assessing an AI coding session to extract structured metadata for cross-session pattern analysis. + +Project: ${projectName} +${sessionSummary ? `Session Summary: ${sessionSummary}\n` : ''}${formatSessionMetaLine(meta)} +Extract session facets — a holistic assessment of how the session went: + +1. outcome_satisfaction: "high" (completed successfully), "medium" (partial), "low" (problems), "abandoned" (gave up) +2. workflow_pattern: The dominant pattern, or null. Values: "plan-then-implement", "iterative-refinement", "debug-fix-verify", "explore-then-build", "direct-execution" +3. friction_points: Up to 5 moments where progress stalled (array).
+ Each: { _reasoning (3-step attribution decision tree reasoning), category (kebab-case, prefer: ${CANONICAL_FRICTION_CATEGORIES.join(', ')}), attribution ("user-actionable"|"ai-capability"|"environmental"), description (one neutral sentence with specific details), severity ("high"|"medium"|"low"), resolution ("resolved"|"workaround"|"unresolved") } +${FRICTION_CLASSIFICATION_GUIDANCE} +4. effective_patterns: Up to 3 things that worked well (array). + Each: { _reasoning (driver decision tree reasoning — check user infrastructure first), category (kebab-case, prefer: ${CANONICAL_PATTERN_CATEGORIES.join(', ')}), description (specific technique, 1-2 sentences), confidence (0-100), driver ("user-driven"|"ai-driven"|"collaborative") } +${EFFECTIVE_PATTERN_CLASSIFICATION_GUIDANCE} +5. had_course_correction: true/false — did the user redirect the AI? +6. course_correction_reason: Brief explanation if true, null otherwise +7. iteration_count: How many user clarification/correction cycles occurred + +Extract facets in this JSON format: +{ + "outcome_satisfaction": "high | medium | low | abandoned", + "workflow_pattern": "string or null", + "had_course_correction": false, + "course_correction_reason": null, + "iteration_count": 0, + "friction_points": [ + { + "_reasoning": "Reasoning for category + attribution classification", + "category": "kebab-case-category", + "attribution": "user-actionable | ai-capability | environmental", + "description": "One neutral sentence about the gap, with specific details", + "severity": "high | medium | low", + "resolution": "resolved | workaround | unresolved" + } + ], + "effective_patterns": [ + { + "_reasoning": "Reasoning for category + driver classification, including baseline check", + "category": "kebab-case-category", + "description": "technique", + "confidence": 85, + "driver": "user-driven | ai-driven | collaborative" + } + ] +} + +Respond with valid JSON only, wrapped in ... 
tags.`; +} diff --git a/cli/src/analysis/response-parsers.ts b/cli/src/analysis/response-parsers.ts new file mode 100644 index 0000000..542b5d6 --- /dev/null +++ b/cli/src/analysis/response-parsers.ts @@ -0,0 +1,200 @@ +// LLM response parsing utilities. +// Extracted from prompts.ts — handles JSON extraction, repair, and validation. + +import { jsonrepair } from 'jsonrepair'; +import type { AnalysisResponse, ParseError, ParseResult, PromptQualityResponse, PromptQualityDimensionScores } from './prompt-types.js'; + +function buildResponsePreview(text: string, head = 200, tail = 200): string { + if (text.length <= head + tail + 20) return text; + return `${text.slice(0, head)}\n...[${text.length - head - tail} chars omitted]...\n${text.slice(-tail)}`; +} + +export function extractJsonPayload(response: string): string | null { + const tagged = response.match(/<json>\s*([\s\S]*?)\s*<\/json>/i); + if (tagged?.[1]) return tagged[1].trim(); + const jsonMatch = response.match(/\{[\s\S]*\}/); + return jsonMatch ? jsonMatch[0] : null; +} + +/** + * Parse the LLM response into structured insights. + */ +export function parseAnalysisResponse(response: string): ParseResult<AnalysisResponse> { + const response_length = response.length; + + const preview = buildResponsePreview(response); + + const jsonPayload = extractJsonPayload(response); + if (!jsonPayload) { + console.error('No JSON found in analysis response'); + return { + success: false, + error: { error_type: 'no_json_found', error_message: 'No JSON found in analysis response', response_length, response_preview: preview }, + }; + } + + let parsed: AnalysisResponse; + try { + parsed = JSON.parse(jsonPayload) as AnalysisResponse; + } catch { + // Attempt repair — handles trailing commas, unclosed braces, truncated output + try { + parsed = JSON.parse(jsonrepair(jsonPayload)) as AnalysisResponse; + } catch (err) { + const msg = err instanceof Error ?
err.message : String(err); + console.error('Failed to parse analysis response (after jsonrepair):', err); + return { + success: false, + error: { error_type: 'json_parse_error', error_message: msg, response_length, response_preview: preview }, + }; + } + } + + if (!parsed.summary || typeof parsed.summary.title !== 'string') { + console.error('Invalid analysis response structure'); + return { + success: false, + error: { error_type: 'invalid_structure', error_message: 'Missing or invalid summary field', response_length, response_preview: preview }, + }; + } + + // Guard against LLM returning non-array values (e.g. "decisions": "none"). + // || [] alone won't catch truthy non-arrays — Array.isArray is required. + parsed.decisions = Array.isArray(parsed.decisions) ? parsed.decisions : []; + parsed.learnings = Array.isArray(parsed.learnings) ? parsed.learnings : []; + + // Normalize facet arrays before monitors access .some() — a non-array truthy value + // (e.g. LLM returns "friction_points": "none") would throw a TypeError on .some(). + if (parsed.facets) { + if (!Array.isArray(parsed.facets.friction_points)) parsed.facets.friction_points = []; + if (!Array.isArray(parsed.facets.effective_patterns)) parsed.facets.effective_patterns = []; + } + + // Observability: two-tier tooling-limitation monitor. + // Tier 1: _reasoning contains misclassification signals NOT in a negation context → likely wrong category. + // Tier 2: no conflicting signals (or signal was negated) → generic reminder to verify. + // Re-evaluate after ~30 sessions with improved FRICTION_CLASSIFICATION_GUIDANCE. 
+ if (parsed.facets?.friction_points?.some(fp => fp.category === 'tooling-limitation')) { + // Expanded regex covers both literal terms and GPT-4o paraphrasing patterns + const MISCLASS_SIGNALS = /rate.?limit|throttl|quota.?exceed|crash|fail.{0,10}unexpect|lost.?state|context.{0,10}(?:drop|lost|unavail)|wrong.?tool|different.?(?:approach|method)|(?:didn.t|did not|unaware).{0,10}(?:know|capabil)|(?:older|previous).?version|used to (?:work|be)|behavio.?r.?change/i; + const NEGATION_CONTEXT = /\bnot\b|\bnor\b|\bisn.t\b|\bwasn.t\b|\brule[d]? out\b|\brejected?\b|\beliminated?\b|\breclassif/i; + const toolingFps = parsed.facets.friction_points.filter(fp => fp.category === 'tooling-limitation'); + for (const fp of toolingFps) { + if (!fp._reasoning) { + console.warn('[friction-monitor] LLM classified friction as "tooling-limitation" without _reasoning — cannot verify'); + continue; + } + const matchResult = fp._reasoning.match(MISCLASS_SIGNALS); + if (matchResult) { + // Check if the signal appears in a negation context (model correctly eliminating the alternative) + const matchIdx = fp._reasoning.search(MISCLASS_SIGNALS); + const preceding = fp._reasoning.slice(Math.max(0, matchIdx - 40), matchIdx); + if (!NEGATION_CONTEXT.test(preceding)) { + console.warn(`[friction-monitor] Likely misclassification: "tooling-limitation" with reasoning mentioning "${matchResult[0]}" — review category`); + } + // If negated, the model correctly considered and rejected the alternative — no warning + } else { + console.warn('[friction-monitor] LLM classified friction as "tooling-limitation" — verify genuine tool limitation'); + } + } + } + + // Observability: warn when LLM returns effective_pattern without category or driver field, + // or with an unrecognized driver value. + // Catches models that ignore the classification instructions (especially smaller Ollama models). + // Remove after confirming classification quality over ~20 new sessions. 
+ if (parsed.facets?.effective_patterns?.some(ep => !ep.category)) { + console.warn('[pattern-monitor] LLM returned effective_pattern without category field'); + } + if (parsed.facets?.effective_patterns?.some(ep => !ep.driver)) { + console.warn('[pattern-monitor] LLM returned effective_pattern without driver field — driver classification may be incomplete'); + } + const VALID_DRIVERS = new Set(['user-driven', 'ai-driven', 'collaborative']); + if (parsed.facets?.effective_patterns?.some(ep => ep.driver && !VALID_DRIVERS.has(ep.driver))) { + console.warn('[pattern-monitor] LLM returned unexpected driver value — check classification quality'); + } + + // Validation: check for missing _reasoning CoT scratchpad fields. + // These fields ensure the model walks through the attribution/driver decision trees + // before committing to classification values. + // (Monitoring period complete — warn calls removed after confirming CoT compliance) + if (parsed.facets?.friction_points?.some(fp => !fp._reasoning)) { + // Missing _reasoning: classification may lack decision-tree rigor + } + if (parsed.facets?.effective_patterns?.some(ep => !ep._reasoning)) { + // Missing _reasoning: classification may lack decision-tree rigor + } + + return { success: true, data: parsed }; +} + +export function parsePromptQualityResponse(response: string): ParseResult<PromptQualityResponse> { + const response_length = response.length; + const preview = buildResponsePreview(response); + + const jsonPayload = extractJsonPayload(response); + if (!jsonPayload) { + console.error('No JSON found in prompt quality response'); + return { + success: false, + error: { error_type: 'no_json_found', error_message: 'No JSON found in prompt quality response', response_length, response_preview: preview }, + }; + } + + let parsed: PromptQualityResponse; + try { + parsed = JSON.parse(jsonPayload) as PromptQualityResponse; + } catch { + try { + parsed = JSON.parse(jsonrepair(jsonPayload)) as PromptQualityResponse; + } catch (err) { + const
msg = err instanceof Error ? err.message : String(err); + console.error('Failed to parse prompt quality response (after jsonrepair):', msg); + return { + success: false, + error: { error_type: 'json_parse_error', error_message: msg, response_length, response_preview: preview }, + }; + } + } + + if (typeof parsed.efficiency_score !== 'number') { + console.error('Invalid prompt quality response: missing efficiency_score'); + return { + success: false, + error: { error_type: 'invalid_structure', error_message: 'Missing or invalid efficiency_score field', response_length, response_preview: preview }, + }; + } + + // Clamp and default + parsed.efficiency_score = Math.max(0, Math.min(100, Math.round(parsed.efficiency_score))); + parsed.message_overhead = parsed.message_overhead ?? 0; + parsed.assessment = parsed.assessment || ''; + // Guard against LLM returning non-array values (e.g. "findings": "none") — + // || [] alone won't catch truthy non-arrays, and .some() on line 166 would throw. + parsed.takeaways = Array.isArray(parsed.takeaways) ? parsed.takeaways : []; + parsed.findings = Array.isArray(parsed.findings) ? parsed.findings : []; + parsed.dimension_scores = parsed.dimension_scores || { + context_provision: 50, + request_specificity: 50, + scope_management: 50, + information_timing: 50, + correction_quality: 50, + }; + + // Clamp dimension scores + for (const key of Object.keys(parsed.dimension_scores) as Array<keyof PromptQualityDimensionScores>) { + parsed.dimension_scores[key] = Math.max(0, Math.min(100, Math.round(parsed.dimension_scores[key] ?? 50))); + } + + // Validation: check for missing category or unexpected type values in findings.
+ // (Monitoring period complete — warn calls removed after confirming classification quality) + if (parsed.findings.some(f => !f.category)) { + // Finding missing category field + } + + if (parsed.findings.some(f => f.type && f.type !== 'deficit' && f.type !== 'strength')) { + // Finding has unexpected type value — expected deficit or strength + } + + return { success: true, data: parsed }; +} From 09e0806b48eb92fbbfb62d0dd7b67ea8e48db8e1 Mon Sep 17 00:00:00 2001 From: Srikanth Rao M Date: Sat, 28 Mar 2026 17:10:48 +0530 Subject: [PATCH 2/2] refactor(server): convert 9 llm/ modules to re-exports from @code-insights/cli/analysis/* Server files now re-export from the CLI analysis/ package, preserving the exact same public API surface with zero breaking changes. Converted to re-exports: - server/src/llm/prompt-types.ts - server/src/llm/prompt-constants.ts - server/src/llm/prompts.ts - server/src/llm/message-format.ts - server/src/llm/response-parsers.ts - server/src/llm/normalize-utils.ts - server/src/llm/friction-normalize.ts - server/src/llm/pattern-normalize.ts - server/src/llm/prompt-quality-normalize.ts All existing server imports (analysis.ts, facet-extraction.ts, prompt-quality-analysis.ts, analysis-db.ts, shared-aggregation.ts) continue to work unchanged via re-exports. 
Co-Authored-By: Claude Sonnet 4.6 --- server/src/llm/friction-normalize.ts | 59 +-- server/src/llm/message-format.ts | 138 +------ server/src/llm/normalize-utils.ts | 91 +---- server/src/llm/pattern-normalize.ts | 104 +---- server/src/llm/prompt-constants.ts | 201 +--------- server/src/llm/prompt-quality-normalize.ts | 139 +------ server/src/llm/prompt-types.ts | 145 +------ server/src/llm/prompts.ts | 434 +-------------------- server/src/llm/response-parsers.ts | 207 +--------- 9 files changed, 67 insertions(+), 1451 deletions(-) diff --git a/server/src/llm/friction-normalize.ts b/server/src/llm/friction-normalize.ts index 2ade90a..ba129a6 100644 --- a/server/src/llm/friction-normalize.ts +++ b/server/src/llm/friction-normalize.ts @@ -1,56 +1,3 @@ -// Friction category normalization. -// Clusters similar free-form friction categories to canonical ones during aggregation. - -import { CANONICAL_FRICTION_CATEGORIES } from './prompt-constants.js'; -import { normalizeCategory } from './normalize-utils.js'; - -// Explicit alias map for clustering emergent category variants. -// Targets don't need to be in CANONICAL_FRICTION_CATEGORIES — -// this clusters semantically-equivalent novel categories together. -// Insert alias lookup runs AFTER exact canonical match but BEFORE Levenshtein, -// so well-known emergent variants are clustered deterministically. 
-const FRICTION_ALIASES: Record = { - // legacy canonical → new canonical (15→9 taxonomy revision) - 'missing-dependency': 'stale-assumptions', - 'config-drift': 'stale-assumptions', - 'stale-cache': 'stale-assumptions', - 'version-mismatch': 'stale-assumptions', - 'permission-issue': 'stale-assumptions', - 'environment-mismatch': 'stale-assumptions', - 'race-condition': 'wrong-approach', - 'circular-dependency': 'wrong-approach', - 'test-failure': 'wrong-approach', - 'type-error': 'knowledge-gap', - 'api-misunderstanding': 'knowledge-gap', - // agent orchestration variants → cluster under one emergent name - 'agent-lifecycle-issue': 'agent-orchestration-failure', - 'agent-communication-failure': 'agent-orchestration-failure', - 'agent-communication-breakdown': 'agent-orchestration-failure', - 'agent-lifecycle-management': 'agent-orchestration-failure', - 'agent-shutdown-failure': 'agent-orchestration-failure', - // rate limit variants → cluster under one emergent name - 'api-rate-limit': 'rate-limit-hit', - 'rate-limiting': 'rate-limit-hit', - 'rate-limited': 'rate-limit-hit', -}; - -/** - * Normalize a friction category to the closest canonical category. - * Returns the original category if no close match is found. - * - * Matching rules (in order): - * 1. Exact match against canonical list → return as-is - * 1.5. Explicit alias match → return alias target (may be non-canonical) - * 2. Levenshtein distance <= 2 → return canonical match - * 3. Substring match (category contains canonical or vice versa) → return canonical - * 4. No match → return original (novel category) - * - * Note: alias targets in FRICTION_ALIASES bypass the canonical check intentionally. - * e.g., "agent-orchestration-failure" is not canonical but is a valid cluster target. 
- */ -export function normalizeFrictionCategory(category: string): string { - return normalizeCategory(category, { - canonicalCategories: CANONICAL_FRICTION_CATEGORIES, - aliases: FRICTION_ALIASES, - }); -} +// Re-exports from @code-insights/cli/analysis/friction-normalize. +// Moved to CLI package so the CLI can use friction normalization for native analysis (--native mode). +export { normalizeFrictionCategory } from '@code-insights/cli/analysis/friction-normalize'; diff --git a/server/src/llm/message-format.ts b/server/src/llm/message-format.ts index d7792ab..3e0d7e4 100644 --- a/server/src/llm/message-format.ts +++ b/server/src/llm/message-format.ts @@ -1,131 +1,7 @@ -// SQLite message formatting utilities for LLM prompt construction. -// Extracted from prompts.ts — used by prompt generator functions in prompts.ts. - -import type { SQLiteMessageRow, SessionMetadata } from './prompt-types.js'; -import { safeParseJson } from '../utils.js'; - -// Internal types — only used within formatMessagesForAnalysis -interface ParsedToolCall { - name?: string; -} - -interface ParsedToolResult { - output?: string; -} - -/** - * Detect the class of a stored user message from its content string. - * Operates on the DB content field (stringified), not raw JSONL. - * - * This mirrors classifyUserMessage() in cli/src/parser/jsonl.ts but works on - * stored content strings instead of parsed JSONL message objects. The DB stores - * message content as a plain string — tool-results are JSON arrays stringified, - * human text is stored as-is. - * - * Order matters — most specific checks first. - */ -export function classifyStoredUserMessage(content: string): 'human' | 'tool-result' | 'system-artifact' { - // Tool-result: content is a JSON array containing tool_result blocks. - // The DB stores these as stringified JSON arrays starting with '['. 
- if (content.startsWith('[') && content.includes('"tool_result"')) return 'tool-result'; - - // Auto-compact summary: Claude Code uses two known prefixes for LLM-initiated - // context compaction summaries. Both must be checked. - if (content.startsWith('Here is a summary of our conversation')) return 'system-artifact'; - if (content.startsWith('This session is being continued')) return 'system-artifact'; - - // Slash command or skill load: single-line starting with / followed by a lowercase letter. - // Requires content.trim() to be short (≤2 lines) to avoid false-positives on messages - // containing file paths like "/usr/bin/..." as part of a longer instruction. - const trimmed = content.trim(); - if (/^\/[a-z]/.test(trimmed) && trimmed.split('\n').length <= 2) return 'system-artifact'; - - return 'human'; -} - -/** - * Format SQLite message rows for LLM consumption. - * Handles snake_case fields and JSON-encoded tool_calls/tool_results. - * - * User#N indices only increment for genuine human messages. Tool-results and - * system artifacts (auto-compacts, slash commands) receive bracketed labels - * instead. This ensures User#N references in PQ takeaways and evidence fields - * align with actual human turns, not inflated by tool-result rows. - */ -export function formatMessagesForAnalysis(messages: SQLiteMessageRow[]): string { - let userIndex = 0; - let assistantIndex = 0; - - return messages - .map((m) => { - let roleLabel: string; - - if (m.type === 'user') { - const msgClass = classifyStoredUserMessage(m.content); - if (msgClass === 'tool-result') { - roleLabel = '[tool-result]'; - } else if (msgClass === 'system-artifact') { - // Auto-compact summaries use two known prefixes — everything else (slash commands, - // skill loads) is a generic system artifact, not a compaction event. - const isAutoCompact = m.content.startsWith('Here is a summary of our conversation') - || m.content.startsWith('This session is being continued'); - roleLabel = isAutoCompact ? 
'[auto-compact]' : '[system]'; - } else { - // Genuine human message — increment counter - roleLabel = `User#${userIndex++}`; - } - } else if (m.type === 'assistant') { - roleLabel = `Assistant#${assistantIndex++}`; - } else { - roleLabel = 'System'; - } - - // Parse JSON-encoded tool_calls and tool_results via safeParseJson - const toolCalls = safeParseJson(m.tool_calls, []); - const toolResults = safeParseJson(m.tool_results, []); - - const toolInfo = toolCalls.length > 0 - ? `\n[Tools used: ${toolCalls.map(t => t.name || 'unknown').join(', ')}]` - : ''; - - // Include thinking content — capped at 1000 chars to stay within token budget - const thinkingInfo = m.thinking - ? `\n[Thinking: ${m.thinking.slice(0, 1000)}]` - : ''; - - // Include tool results for context — 500 chars per result (error messages need ~300-400 chars) - const resultInfo = toolResults.length > 0 - ? `\n[Tool results: ${toolResults.map(r => (r.output || '').slice(0, 500)).join(' | ')}]` - : ''; - - return `### ${roleLabel}:\n${m.content}${thinkingInfo}${toolInfo}${resultInfo}`; - }) - .join('\n\n'); -} - -/** - * Format a one-line context signals header from V6 session metadata. - * Returns empty string when no signals are present (pre-V6 sessions with NULL columns). - * - * Example output: - * "Context signals: 3 context compactions (2 auto, 1 manual) — session exceeded context window; slash commands used: /review, /test\n" - */ -export function formatSessionMetaLine(meta?: SessionMetadata): string { - if (!meta) return ''; - const parts: string[] = []; - - const totalCompacts = (meta.compactCount ?? 0) + (meta.autoCompactCount ?? 0); - if (totalCompacts > 0) { - const breakdown: string[] = []; - if (meta.autoCompactCount) breakdown.push(`${meta.autoCompactCount} auto`); - if (meta.compactCount) breakdown.push(`${meta.compactCount} manual`); - parts.push(`${totalCompacts} context compaction${totalCompacts > 1 ? 
's' : ''} (${breakdown.join(', ')}) — session exceeded context window`); - } - - if (meta.slashCommands?.length) { - parts.push(`slash commands used: ${meta.slashCommands.join(', ')}`); - } - - if (parts.length === 0) return ''; - return `Context signals: ${parts.join('; ')}\n`; -} +// Re-exports from @code-insights/cli/analysis/message-format. +// Moved to CLI package so the CLI can use message formatting for native analysis (--native mode). +export { + classifyStoredUserMessage, + formatMessagesForAnalysis, + formatSessionMetaLine, +} from '@code-insights/cli/analysis/message-format'; diff --git a/server/src/llm/normalize-utils.ts b/server/src/llm/normalize-utils.ts index d86e598..86ba81f 100644 --- a/server/src/llm/normalize-utils.ts +++ b/server/src/llm/normalize-utils.ts @@ -1,87 +1,4 @@ -// Shared normalization infrastructure for friction, pattern, and prompt-quality categories. -// Each domain provides its own canonical list, alias map, and label map. - -/** Standard Levenshtein distance between two strings */ -export function levenshtein(a: string, b: string): number { - const m = a.length; - const n = b.length; - const dp: number[][] = Array.from({ length: m + 1 }, () => Array(n + 1).fill(0) as number[]); - - for (let i = 0; i <= m; i++) dp[i][0] = i; - for (let j = 0; j <= n; j++) dp[0][j] = j; - - for (let i = 1; i <= m; i++) { - for (let j = 1; j <= n; j++) { - const cost = a[i - 1] === b[j - 1] ? 0 : 1; - dp[i][j] = Math.min( - dp[i - 1][j] + 1, - dp[i][j - 1] + 1, - dp[i - 1][j - 1] + cost - ); - } - } - - return dp[m][n]; -} - -export interface NormalizerConfig { - /** Canonical category strings (lowercase kebab-case) */ - canonicalCategories: readonly string[]; - /** Maps known aliases to their target (may be non-canonical cluster targets) */ - aliases: Record; -} - -/** - * Generic category normalizer. Matching rules (in order): - * 1. Exact match against canonical list → return as-is - * 1.5. 
Explicit alias match → return alias target (may be non-canonical) - * 2. Levenshtein distance <= 2 → return canonical match - * 3. Substring match (shorter >= 5 chars, >= 50% of longer) → return canonical - * 4. No match → return original (novel category) - */ -export function normalizeCategory(category: string, config: NormalizerConfig): string { - const lower = category.toLowerCase(); - - // 1. Exact match - for (const canonical of config.canonicalCategories) { - if (lower === canonical) return canonical; - } - - // 1.5. Explicit alias match - if (config.aliases[lower]) return config.aliases[lower]; - - // 2. Levenshtein distance <= 2 - let bestMatch: string | null = null; - let bestDistance = Infinity; - for (const canonical of config.canonicalCategories) { - const dist = levenshtein(lower, canonical); - if (dist <= 2 && dist < bestDistance) { - bestDistance = dist; - bestMatch = canonical; - } - } - if (bestMatch) return bestMatch; - - // 3. Substring match — only if the shorter string is a significant portion of the longer - // to avoid false positives like "type" matching "type-error" - for (const canonical of config.canonicalCategories) { - const shorter = lower.length < canonical.length ? lower : canonical; - const longer = lower.length < canonical.length ? canonical : lower; - if (shorter.length >= 5 && shorter.length / longer.length >= 0.5 && longer.includes(shorter)) { - return canonical; - } - } - - // 4. No match — novel category - return category; -} - -/** - * Convert kebab-case to Title Case. Shared fallback for category label functions. - */ -export function kebabToTitleCase(kebab: string): string { - return kebab - .split('-') - .map(word => word.charAt(0).toUpperCase() + word.slice(1)) - .join(' '); -} +// Re-exports from @code-insights/cli/analysis/normalize-utils. +// Moved to CLI package so the CLI can use these utilities for native analysis (--native mode). 
+export type { NormalizerConfig } from '@code-insights/cli/analysis/normalize-utils'; +export { levenshtein, normalizeCategory, kebabToTitleCase } from '@code-insights/cli/analysis/normalize-utils'; diff --git a/server/src/llm/pattern-normalize.ts b/server/src/llm/pattern-normalize.ts index ec32d81..1403f2c 100644 --- a/server/src/llm/pattern-normalize.ts +++ b/server/src/llm/pattern-normalize.ts @@ -1,101 +1,3 @@ -// Effective pattern category normalization. -// Clusters similar free-form pattern categories to canonical ones during aggregation. -// Delegates to normalize-utils.ts for the shared levenshtein/normalizeCategory algorithm. - -import { CANONICAL_PATTERN_CATEGORIES } from './prompt-constants.js'; -import { normalizeCategory, kebabToTitleCase } from './normalize-utils.js'; - -// Human-readable labels for each canonical category. -// Used in dashboard display (e.g., "structured-planning" → "Structured Planning"). -export const PATTERN_CATEGORY_LABELS: Record = { - 'structured-planning': 'Structured Planning', - 'incremental-implementation': 'Incremental Implementation', - 'verification-workflow': 'Verification Workflow', - 'systematic-debugging': 'Systematic Debugging', - 'self-correction': 'Self-Correction', - 'context-gathering': 'Context Gathering', - 'domain-expertise': 'Domain Expertise', - 'effective-tooling': 'Effective Tooling', -}; - -// Explicit alias map for clustering emergent category variants. -// Targets don't need to be in CANONICAL_PATTERN_CATEGORIES — -// this clusters semantically-equivalent novel categories together. -// Insert alias lookup runs AFTER exact canonical match but BEFORE Levenshtein, -// so well-known emergent variants are clustered deterministically. 
-const PATTERN_ALIASES: Record = { - // structured-planning variants - 'task-decomposition': 'structured-planning', - 'plan-first': 'structured-planning', - 'upfront-planning': 'structured-planning', - 'phased-approach': 'structured-planning', - 'task-breakdown': 'structured-planning', - 'planning-before-implementation': 'structured-planning', - - // effective-tooling variants - 'agent-delegation': 'effective-tooling', - 'agent-orchestration': 'effective-tooling', - 'specialized-agents': 'effective-tooling', - 'multi-agent': 'effective-tooling', - 'tool-leverage': 'effective-tooling', - - // verification-workflow variants - 'build-test-verify': 'verification-workflow', - 'test-driven-development': 'verification-workflow', - 'tdd': 'verification-workflow', - 'test-first': 'verification-workflow', - 'pre-commit-checks': 'verification-workflow', - - // systematic-debugging variants - 'binary-search-debugging': 'systematic-debugging', - 'methodical-debugging': 'systematic-debugging', - 'log-based-debugging': 'systematic-debugging', - 'debugging-methodology': 'systematic-debugging', - - // self-correction variants - 'course-correction': 'self-correction', - 'pivot-on-failure': 'self-correction', - 'backtracking': 'self-correction', - - // context-gathering variants - 'code-reading-first': 'context-gathering', - 'codebase-exploration': 'context-gathering', - 'understanding-before-changing': 'context-gathering', - - // domain-expertise variants - 'framework-knowledge': 'domain-expertise', - 'types-first': 'domain-expertise', - 'type-driven-development': 'domain-expertise', - 'schema-first': 'domain-expertise', - - // incremental-implementation variants - 'small-steps': 'incremental-implementation', - 'iterative-building': 'incremental-implementation', - 'iterative-development': 'incremental-implementation', -}; - -/** - * Normalize a pattern category to the closest canonical category. - * Returns the original category if no close match is found. 
- * - * Matching rules (in order): - * 1. Exact match against canonical list → return as-is - * 1.5. Explicit alias match → return alias target (may be non-canonical) - * 2. Levenshtein distance <= 2 → return canonical match - * 3. Substring match (category contains canonical or vice versa) → return canonical - * 4. No match → return original (novel category) - */ -export function normalizePatternCategory(category: string): string { - return normalizeCategory(category, { - canonicalCategories: CANONICAL_PATTERN_CATEGORIES, - aliases: PATTERN_ALIASES, - }); -} - -/** - * Get a human-readable label for a pattern category. - * Falls back to Title Case conversion for novel categories. - */ -export function getPatternCategoryLabel(category: string): string { - return PATTERN_CATEGORY_LABELS[category] ?? kebabToTitleCase(category); -} +// Re-exports from @code-insights/cli/analysis/pattern-normalize. +// Moved to CLI package so the CLI can use pattern normalization for native analysis (--native mode). +export { PATTERN_CATEGORY_LABELS, normalizePatternCategory, getPatternCategoryLabel } from '@code-insights/cli/analysis/pattern-normalize'; diff --git a/server/src/llm/prompt-constants.ts b/server/src/llm/prompt-constants.ts index 40d4f8c..8befdf4 100644 --- a/server/src/llm/prompt-constants.ts +++ b/server/src/llm/prompt-constants.ts @@ -1,189 +1,12 @@ -// Canonical category arrays and classification guidance strings for LLM analysis. -// Extracted from prompts.ts — imported by normalizers and prompt generators. - -// Shared guidance for friction category and attribution classification. -// Actor-neutral category definitions describe the gap, not the actor. -// Attribution field captures who contributed to the friction for actionability. 
-export const FRICTION_CLASSIFICATION_GUIDANCE = ` -FRICTION CLASSIFICATION GUIDANCE: - -Each friction point captures WHAT went wrong (category + description), WHO contributed (attribution), and WHY you classified it that way (_reasoning). - -CATEGORIES — classify the TYPE of gap or obstacle: -- "wrong-approach": A strategy was pursued that didn't fit the task — wrong architecture, wrong tool, wrong pattern. Includes choosing a suboptimal tool when a better one was available. -- "knowledge-gap": Incorrect knowledge was applied about a library, API, framework, or language feature. The capability existed but was used wrong. -- "stale-assumptions": Work proceeded from assumptions about current state that were incorrect (stale files, changed config, different environment, tool behavior changed between versions). -- "incomplete-requirements": Instructions were missing critical context, constraints, or acceptance criteria needed to proceed correctly. -- "context-loss": Prior decisions or constraints established earlier in the session were lost or forgotten. -- "scope-creep": Work expanded beyond the boundaries of the stated task. -- "repeated-mistakes": The same or similar error occurred multiple times despite earlier correction. -- "documentation-gap": Relevant docs existed but were inaccessible or unfindable during the session. -- "tooling-limitation": The AI coding tool or its underlying model genuinely could not perform a needed action — missing file system access, unsupported language feature, context window overflow, inability to run a specific command type. Diagnostic: Could a reasonable user prompt or approach have achieved the same result? If the only workaround is unreasonably complex or loses significant fidelity, this IS a tooling-limitation. If a straightforward alternative existed → it is NOT tooling-limitation. 
- RECLASSIFY if any of these apply: - - Rate-limited or throttled → create "rate-limit-hit" instead - - Agent crashed or lost state → use "wrong-approach" or create "agent-orchestration-failure" - - Wrong tool chosen when a better one existed → "wrong-approach" - - User didn't know the tool could do something → "knowledge-gap" - - Tool worked differently than expected → "stale-assumptions" - -DISAMBIGUATION — use these to break ties when two categories seem to fit: -- tooling-limitation vs wrong-approach: Limitation = the tool CANNOT do it (no workaround exists). Wrong-approach = the tool CAN do it but a suboptimal method was chosen. -- tooling-limitation vs knowledge-gap: Limitation = the capability genuinely does not exist. Knowledge-gap = the capability exists but was applied incorrectly. -- tooling-limitation vs stale-assumptions: Limitation = permanent gap in the tool. Stale-assumptions = the tool USED TO work differently or the assumption about current behavior was wrong. -- wrong-approach vs knowledge-gap: Wrong-approach = strategic choice (chose library X over Y). Knowledge-gap = factual error (used library X's API incorrectly). -- incomplete-requirements vs context-loss: Incomplete = the information was NEVER provided. Context-loss = it WAS provided earlier but was forgotten or dropped. - -When no category fits, create a specific kebab-case category. A precise novel category is better than a vague canonical one. - -ATTRIBUTION — 3-step decision tree (follow IN ORDER): -Step 1: Is the cause external to the user-AI interaction? (missing docs, broken tooling, infra outage) → "environmental" -Step 2: Could the USER have prevented this with better input? Evidence: vague prompt, missing context, no constraints, late requirements, ambiguous correction → "user-actionable" -Step 3: User input was clear and the AI still failed → "ai-capability" -When genuinely mixed between user-actionable and ai-capability, lean "user-actionable" — this tool helps users improve. 
- -DESCRIPTION RULES: -- One neutral sentence describing the GAP, not the actor -- Include specific details (file names, APIs, error messages) -- Frame as "Missing X caused Y" NOT "The AI failed to X" or "The user forgot to X" -- Let the attribution field carry the who`; - -export const CANONICAL_FRICTION_CATEGORIES = [ - 'wrong-approach', - 'knowledge-gap', - 'stale-assumptions', - 'incomplete-requirements', - 'context-loss', - 'scope-creep', - 'repeated-mistakes', - 'documentation-gap', - 'tooling-limitation', -] as const; - -export const CANONICAL_PATTERN_CATEGORIES = [ - 'structured-planning', - 'incremental-implementation', - 'verification-workflow', - 'systematic-debugging', - 'self-correction', - 'context-gathering', - 'domain-expertise', - 'effective-tooling', -] as const; - -export const CANONICAL_PQ_DEFICIT_CATEGORIES = [ - 'vague-request', - 'missing-context', - 'late-constraint', - 'unclear-correction', - 'scope-drift', - 'missing-acceptance-criteria', - 'assumption-not-surfaced', -] as const; - -export const CANONICAL_PQ_STRENGTH_CATEGORIES = [ - 'precise-request', - 'effective-context', - 'productive-correction', -] as const; - -export const CANONICAL_PQ_CATEGORIES = [ - ...CANONICAL_PQ_DEFICIT_CATEGORIES, - ...CANONICAL_PQ_STRENGTH_CATEGORIES, -] as const; - -export const PROMPT_QUALITY_CLASSIFICATION_GUIDANCE = ` -PROMPT QUALITY CLASSIFICATION GUIDANCE: - -Each finding captures a specific moment where the user's prompting either caused friction (deficit) or enabled productivity (strength). - -DEFICIT CATEGORIES — classify prompting problems: -- "vague-request": Request lacked specificity needed for the AI to act without guessing. Missing file paths, function names, expected behavior, or concrete details. - NOT this category if the AI had enough context to succeed but failed anyway — that is an AI capability issue, not a prompting issue. 
- -- "missing-context": Critical background knowledge about architecture, conventions, dependencies, or current state was not provided. - NOT this category if the information was available in the codebase and the AI could have found it by reading files — that is an AI context-gathering failure. - -- "late-constraint": A requirement or constraint was provided AFTER the AI had already started implementing a different approach, causing rework. - NOT this category if the constraint was genuinely discovered during implementation (requirements changed). Only classify if the user KNEW the constraint before the session started. - -- "unclear-correction": The user told the AI its output was wrong without explaining what was wrong or why. "That's not right", "try again", "no" without context. - NOT this category if the user gave a brief but sufficient correction ("use map instead of forEach" is clear enough). - -- "scope-drift": The session objective shifted mid-conversation, or multiple unrelated objectives were addressed in one session. - NOT this category if the user is working through logically connected subtasks of one objective. - -- "missing-acceptance-criteria": The user did not define what successful completion looks like, leading to back-and-forth about whether the output meets expectations. - NOT this category for exploratory sessions where the user is discovering what they want. - -- "assumption-not-surfaced": The user held an unstated assumption that the AI could not reasonably infer from code or conversation. - NOT this category if the assumption was reasonable for the AI to make (e.g., standard coding conventions). - -STRENGTH CATEGORIES — classify prompting successes (only when notably above average): -- "precise-request": Request included enough specificity (file paths, function names, expected behavior, error messages) that the AI could act correctly on the first attempt. 
- -- "effective-context": User proactively shared architecture, conventions, prior decisions, or current state that the AI demonstrably used to make better decisions. - -- "productive-correction": When the AI went off track, the user provided a correction that included WHAT was wrong, WHY, and enough context for the AI to redirect effectively on the next response. - -CONTRASTIVE PAIRS: -- vague-request vs missing-context: Was the problem in HOW THE TASK WAS DESCRIBED (vague-request) or WHAT BACKGROUND KNOWLEDGE WAS ABSENT (missing-context)? -- late-constraint vs missing-context: Did the user EVENTUALLY provide it in the same session? Yes → late-constraint. Never → missing-context. -- missing-context vs assumption-not-surfaced: Is this a FACT the user could have copy-pasted (missing-context), or a BELIEF/PREFERENCE they held (assumption-not-surfaced)? -- scope-drift vs missing-acceptance-criteria: Did the user try to do TOO MANY THINGS (scope-drift) or ONE THING WITHOUT DEFINING SUCCESS (missing-acceptance-criteria)? -- unclear-correction vs vague-request: Was this the user's FIRST MESSAGE about this task (vague-request) or a RESPONSE TO AI OUTPUT (unclear-correction)? - -DIMENSION SCORING (0-100): -- context_provision: How well did the user provide relevant background upfront? - 90+: Proactively shared architecture, constraints, conventions. 50-69: Notable gaps causing detours. <30: No context, AI working blind. -- request_specificity: How precise were task requests? - 90+: File paths, expected behavior, scope boundaries. 50-69: Mix of specific and vague. <30: Nearly all requests lacked detail. -- scope_management: How focused was the session? - 90+: Single clear objective, logical progression. 50-69: Some drift but primary goal met. <30: Unfocused, no clear objective. -- information_timing: Were requirements provided when needed? - 90+: All constraints front-loaded before implementation. 50-69: Some important requirements late. 
<30: Requirements drip-fed, constant corrections. -- correction_quality: How well did the user redirect the AI? - 90+: Corrections included what, why, and context. 50-69: Mix of clear and unclear. <30: Corrections gave almost no signal. - Score 75 if no corrections were needed (absence of corrections in a successful session = good prompting). - -EDGE CASES: -- Short sessions (<5 user messages): Score conservatively. Do not penalize for missing elements unnecessary in quick tasks. -- Exploration sessions: Do not penalize for missing acceptance criteria or scope drift. -- Sessions where AI performed well despite vague prompts: Still classify deficits. Impact should be "low" since no visible cost. -- Agentic/delegation sessions: If the user gave a clear high-level directive and the AI autonomously planned and executed successfully, do not penalize for low message count or lack of micro-level specificity. Effective delegation IS good prompting. Focus on the quality of the initial delegation prompt.`; - -export const EFFECTIVE_PATTERN_CLASSIFICATION_GUIDANCE = ` -EFFECTIVE PATTERN CLASSIFICATION GUIDANCE: - -Each effective pattern captures a technique or approach that contributed to a productive session outcome. - -BASELINE EXCLUSION — do NOT classify these as patterns: -- Routine file reads at session start (Read/Glob/Grep on <5 files before editing) -- Following explicit user instructions (user said "run tests" → running tests is not a pattern) -- Basic tool usage (single file edits, standard CLI commands) -- Trivial self-corrections (typo fixes, minor syntax errors caught immediately) -Only classify behavior that is NOTABLY thorough, strategic, or beyond baseline expectations. - -CATEGORIES — classify the TYPE of effective pattern: -- "structured-planning": Decomposed the task into explicit steps, defined scope boundaries, or established a plan BEFORE writing code. Signal: plan/task-list/scope-definition appears before implementation. 
-- "incremental-implementation": Work progressed in small, verifiable steps with validation between them. Signal: multiple small edits with checks between, not one large batch. -- "verification-workflow": Proactive correctness checks (builds, tests, linters, types) BEFORE considering work complete. Signal: test/build/lint commands when nothing was known broken. -- "systematic-debugging": Methodical investigation using structured techniques (binary search, log insertion, reproduction isolation). Signal: multiple targeted diagnostic steps, not random guessing. -- "self-correction": Recognized a wrong path and pivoted WITHOUT user correction. Signal: explicit acknowledgment of mistake + approach change. NOT this if the user pointed out the error. -- "context-gathering": NOTABLY thorough investigation before changes — reading 5+ files, cross-module exploration, schema/type/config review. Signal: substantial Read/Grep/Glob usage spanning multiple directories before any Edit/Write. -- "domain-expertise": Applied specific framework/API/language knowledge correctly on first attempt without searching. Signal: correct non-obvious API usage with no preceding search and no subsequent error. NOT this if files were read first — that is context-gathering. -- "effective-tooling": Leveraged advanced tool capabilities that multiplied productivity — agent delegation, parallel work, multi-file coordination, strategic mode selection. Signal: use of tool features beyond basic read/write/edit. - -CONTRASTIVE PAIRS: -- structured-planning vs incremental-implementation: Planning = DECIDING what to do (before). Incremental = HOW you execute (during). Can have one without the other. -- context-gathering vs domain-expertise: Gathering = ACTIVE INVESTIGATION (reading files). Expertise = APPLYING EXISTING KNOWLEDGE without investigation. If files were read first → context-gathering. -- verification-workflow vs systematic-debugging: Verification = PROACTIVE (checking working code). 
Debugging = REACTIVE (investigating a failure). -- self-correction vs user-directed: Self-correction = AI caught own mistake unprompted. User said "that's wrong" → NOT self-correction. - -DRIVER — 4-step decision tree (follow IN ORDER): -Step 1: Did user infrastructure enable this? (CLAUDE.md rules, agent configs, hookify hooks, custom commands, system prompts) → "user-driven" -Step 2: Did the user explicitly request this behavior? (asked for plan, requested tests, directed investigation) → "user-driven" -Step 3: Did the AI exhibit this without any user prompting or infrastructure? → "ai-driven" -Step 4: Both made distinct, identifiable contributions → "collaborative" -Use "collaborative" ONLY when you can name what EACH party contributed. If uncertain, prefer the more specific label. - -When no canonical category fits, create a specific kebab-case category (a precise novel category is better than forcing a poor fit).`; +// Re-exports from @code-insights/cli/analysis/prompt-constants. +// Moved to CLI package so the CLI can use these constants for native analysis (--native mode). +export { + FRICTION_CLASSIFICATION_GUIDANCE, + CANONICAL_FRICTION_CATEGORIES, + CANONICAL_PATTERN_CATEGORIES, + CANONICAL_PQ_DEFICIT_CATEGORIES, + CANONICAL_PQ_STRENGTH_CATEGORIES, + CANONICAL_PQ_CATEGORIES, + PROMPT_QUALITY_CLASSIFICATION_GUIDANCE, + EFFECTIVE_PATTERN_CLASSIFICATION_GUIDANCE, +} from '@code-insights/cli/analysis/prompt-constants'; diff --git a/server/src/llm/prompt-quality-normalize.ts b/server/src/llm/prompt-quality-normalize.ts index 9dfa0b2..8112b9e 100644 --- a/server/src/llm/prompt-quality-normalize.ts +++ b/server/src/llm/prompt-quality-normalize.ts @@ -1,131 +1,8 @@ -// Prompt quality category normalization. -// Clusters similar free-form categories to canonical ones during aggregation. -// Delegates to normalize-utils.ts for the shared levenshtein/normalizeCategory algorithm. 
- -import { CANONICAL_PQ_CATEGORIES, CANONICAL_PQ_STRENGTH_CATEGORIES } from './prompt-constants.js'; -import { normalizeCategory, kebabToTitleCase } from './normalize-utils.js'; - -// Human-readable labels for each canonical category. -export const PQ_CATEGORY_LABELS: Record = { - 'vague-request': 'Vague Request', - 'missing-context': 'Missing Context', - 'late-constraint': 'Late Constraint', - 'unclear-correction': 'Unclear Correction', - 'scope-drift': 'Scope Drift', - 'missing-acceptance-criteria': 'Missing Acceptance Criteria', - 'assumption-not-surfaced': 'Assumption Not Surfaced', - 'precise-request': 'Precise Request', - 'effective-context': 'Effective Context', - 'productive-correction': 'Productive Correction', -}; - -const STRENGTH_SET = new Set(CANONICAL_PQ_STRENGTH_CATEGORIES); - -// Explicit alias map for clustering emergent category variants. -// Targets don't need to be in CANONICAL_PQ_CATEGORIES — -// this clusters semantically-equivalent novel categories together. -// Alias lookup runs AFTER exact canonical match but BEFORE Levenshtein, -// so well-known emergent variants are clustered deterministically. 
-const PQ_ALIASES: Record = { - // vague-request variants - 'vague-instructions': 'vague-request', - 'unclear-request': 'vague-request', - 'imprecise-prompting': 'vague-request', - 'ambiguous-request': 'vague-request', - 'incomplete-request': 'vague-request', - 'generic-request': 'vague-request', - - // missing-context variants - 'missing-information': 'missing-context', - 'insufficient-context': 'missing-context', - 'no-context': 'missing-context', - 'lack-of-context': 'missing-context', - 'missing-background': 'missing-context', - - // late-constraint variants - 'late-context': 'late-constraint', - 'late-requirements': 'late-constraint', - 'piecemeal-requirements': 'late-constraint', - 'drip-fed-requirements': 'late-constraint', - 'incremental-requirements': 'late-constraint', - 'late-specification': 'late-constraint', - - // unclear-correction variants - 'unclear-feedback': 'unclear-correction', - 'vague-correction': 'unclear-correction', - 'unhelpful-correction': 'unclear-correction', - 'vague-feedback': 'unclear-correction', - - // scope-drift variants - 'context-drift': 'scope-drift', - 'objective-bloat': 'scope-drift', - 'session-bloat': 'scope-drift', - 'topic-switching': 'scope-drift', - 'scope-creep': 'scope-drift', - - // missing-acceptance-criteria variants - 'no-acceptance-criteria': 'missing-acceptance-criteria', - 'undefined-done': 'missing-acceptance-criteria', - 'no-definition-of-done': 'missing-acceptance-criteria', - 'unclear-success-criteria': 'missing-acceptance-criteria', - - // assumption-not-surfaced variants - 'hidden-assumption': 'assumption-not-surfaced', - 'unstated-assumption': 'assumption-not-surfaced', - 'implicit-assumption': 'assumption-not-surfaced', - 'unspoken-expectation': 'assumption-not-surfaced', - - // precise-request variants (strengths) - 'clear-request': 'precise-request', - 'specific-request': 'precise-request', - 'well-specified-request': 'precise-request', - 'detailed-request': 'precise-request', - - // 
effective-context variants (strengths) - 'good-context': 'effective-context', - 'upfront-context': 'effective-context', - 'proactive-context': 'effective-context', - 'rich-context': 'effective-context', - - // productive-correction variants (strengths) - 'clear-correction': 'productive-correction', - 'effective-feedback': 'productive-correction', - 'helpful-correction': 'productive-correction', - 'constructive-feedback': 'productive-correction', -}; - -/** - * Normalize a prompt quality category to the closest canonical category. - * Returns the original category if no close match is found. - * - * Matching rules (in order): - * 1. Exact match against canonical list → return as-is - * 1.5. Explicit alias match → return alias target (may be non-canonical) - * 2. Levenshtein distance <= 2 → return canonical match - * 3. Substring match (category contains canonical or vice versa) → return canonical - * 4. No match → return original (novel category) - * - * Note: alias targets in PQ_ALIASES bypass the canonical check intentionally. - */ -export function normalizePromptQualityCategory(category: string): string { - return normalizeCategory(category, { - canonicalCategories: CANONICAL_PQ_CATEGORIES, - aliases: PQ_ALIASES, - }); -} - -/** - * Get a human-readable label for a prompt quality category. - * Falls back to Title Case conversion for novel categories. - */ -export function getPQCategoryLabel(category: string): string { - return PQ_CATEGORY_LABELS[category] ?? kebabToTitleCase(category); -} - -/** - * Get the type (deficit or strength) for a prompt quality category. - * Novel categories default to deficit. - */ -export function getPQCategoryType(category: string): 'deficit' | 'strength' { - return STRENGTH_SET.has(category) ? 'strength' : 'deficit'; -} +// Re-exports from @code-insights/cli/analysis/prompt-quality-normalize. +// Moved to CLI package so the CLI can use PQ normalization for native analysis (--native mode). 
+export { + PQ_CATEGORY_LABELS, + normalizePromptQualityCategory, + getPQCategoryLabel, + getPQCategoryType, +} from '@code-insights/cli/analysis/prompt-quality-normalize'; diff --git a/server/src/llm/prompt-types.ts b/server/src/llm/prompt-types.ts index 7b550a2..38be34f 100644 --- a/server/src/llm/prompt-types.ts +++ b/server/src/llm/prompt-types.ts @@ -1,131 +1,14 @@ -// Type definitions for LLM prompt analysis. -// Extracted from prompts.ts — shared by message-format.ts, response-parsers.ts, and analysis.ts. - -// SQLite row format for messages — snake_case with JSON-encoded arrays. -// This matches the shape returned by server/src/routes/messages.ts. -export interface SQLiteMessageRow { - id: string; - session_id: string; - type: 'user' | 'assistant' | 'system'; - content: string; - thinking: string | null; - tool_calls: string; // JSON-encoded ToolCall[] - tool_results: string; // JSON-encoded ToolResult[] - usage: string | null; - timestamp: string; - parent_id: string | null; -} - -/** - * Optional session metadata from V6 columns. - * Passed to prompt generators to add context signals about context compaction - * and slash command usage. Only present when at least one V6 field is non-empty. 
- */ -export interface SessionMetadata { - compactCount?: number; // from sessions.compact_count (user-initiated /compact) - autoCompactCount?: number; // from sessions.auto_compact_count (LLM-initiated compaction) - slashCommands?: string[]; // from sessions.slash_commands (JSON array of command names) -} - -export interface AnalysisResponse { - facets?: { - outcome_satisfaction: string; - workflow_pattern: string | null; - had_course_correction: boolean; - course_correction_reason: string | null; - iteration_count: number; - friction_points: Array<{ - _reasoning?: string; - category: string; - attribution?: string; - description: string; - severity: string; - resolution: string; - }>; - effective_patterns: Array<{ - _reasoning?: string; - category: string; - description: string; - confidence: number; - driver?: 'user-driven' | 'ai-driven' | 'collaborative'; - }>; - }; - summary: { - title: string; - content: string; - outcome?: 'success' | 'partial' | 'abandoned' | 'blocked'; - bullets: string[]; - }; - decisions: Array<{ - title: string; - situation?: string; - choice?: string; - reasoning: string; - alternatives?: Array<{ option: string; rejected_because: string }>; - trade_offs?: string; - revisit_when?: string; - confidence?: number; - evidence?: string[]; - }>; - learnings: Array<{ - title: string; - symptom?: string; - root_cause?: string; - takeaway?: string; - applies_when?: string; - confidence?: number; - evidence?: string[]; - }>; -} - -export interface ParseError { - error_type: 'json_parse_error' | 'no_json_found' | 'invalid_structure'; - error_message: string; - response_length: number; - response_preview: string; -} - -export type ParseResult = - | { success: true; data: T } - | { success: false; error: ParseError }; - -export interface PromptQualityFinding { - category: string; - type: 'deficit' | 'strength'; - description: string; - message_ref: string; - impact: 'high' | 'medium' | 'low'; - confidence: number; - suggested_improvement?: string; 
-} - -export interface PromptQualityTakeaway { - type: 'improve' | 'reinforce'; - category: string; - label: string; - message_ref: string; - // improve fields - original?: string; - better_prompt?: string; - why?: string; - // reinforce fields - what_worked?: string; - why_effective?: string; -} - -export interface PromptQualityDimensionScores { - context_provision: number; - request_specificity: number; - scope_management: number; - information_timing: number; - correction_quality: number; -} - -export interface PromptQualityResponse { - efficiency_score: number; - message_overhead: number; - assessment: string; - takeaways: PromptQualityTakeaway[]; - findings: PromptQualityFinding[]; - dimension_scores: PromptQualityDimensionScores; -} +// Re-exports from @code-insights/cli/analysis/prompt-types. +// Moved to CLI package so the CLI can use these types for native analysis (--native mode). +export type { + SQLiteMessageRow, + SessionMetadata, + ContentBlock, + AnalysisResponse, + ParseError, + ParseResult, + PromptQualityFinding, + PromptQualityTakeaway, + PromptQualityDimensionScores, + PromptQualityResponse, +} from '@code-insights/cli/analysis/prompt-types'; diff --git a/server/src/llm/prompts.ts b/server/src/llm/prompts.ts index 6d0f228..f9a6968 100644 --- a/server/src/llm/prompts.ts +++ b/server/src/llm/prompts.ts @@ -1,425 +1,9 @@ -// Prompt template strings and generator functions for LLM session analysis. -// Types → prompt-types.ts, constants → prompt-constants.ts, -// formatting → message-format.ts, parsers → response-parsers.ts. 
- -import type { SessionMetadata } from './prompt-types.js'; -import type { ContentBlock } from './types.js'; -import { - FRICTION_CLASSIFICATION_GUIDANCE, - CANONICAL_FRICTION_CATEGORIES, - CANONICAL_PATTERN_CATEGORIES, - CANONICAL_PQ_DEFICIT_CATEGORIES, - CANONICAL_PQ_STRENGTH_CATEGORIES, - PROMPT_QUALITY_CLASSIFICATION_GUIDANCE, - EFFECTIVE_PATTERN_CLASSIFICATION_GUIDANCE, -} from './prompt-constants.js'; -import { formatSessionMetaLine } from './message-format.js'; - -// ============================================================================= -// SHARED SYSTEM PROMPT -// A minimal (~100 token) system prompt shared by all analysis calls. -// The full classification guidance and schema examples live in the instruction -// suffix (user[1]), keeping the system prompt cacheable across calls. -// ============================================================================= - -/** - * Shared system prompt for all LLM analysis calls. - * Paired with buildCacheableConversationBlock() + an analysis-specific instruction block. - */ -export const SHARED_ANALYST_SYSTEM_PROMPT = `You are a senior staff engineer analyzing an AI coding session. You will receive the conversation transcript followed by specific extraction instructions. Respond with valid JSON only, wrapped in ... tags.`; - -// ============================================================================= -// CACHEABLE CONVERSATION BLOCK -// Wraps the formatted conversation in an Anthropic ephemeral cache block. -// CRITICAL: Must contain ONLY the formatted messages — no project name, no session -// metadata, no per-session variables. This ensures cache hits across sessions. -// ============================================================================= - -/** - * Wrap formatted conversation messages in a cacheable content block. - * The cache_control field instructs Anthropic to cache everything up to - * and including this block (ephemeral, 5-minute TTL). 
- * - * Non-Anthropic providers receive this as a ContentBlock[] and use - * flattenContent() to convert it to a plain string. - * - * @param formattedMessages - Output of formatMessagesForAnalysis() - */ -export function buildCacheableConversationBlock(formattedMessages: string): ContentBlock { - return { - type: 'text', - // Trailing double newline ensures the instruction block (user[1]) reads as a - // distinct section when providers flatten content blocks to a single string. - text: `--- CONVERSATION ---\n${formattedMessages}\n--- END CONVERSATION ---\n\n`, - cache_control: { type: 'ephemeral' }, - }; -} - -// ============================================================================= -// SESSION ANALYSIS INSTRUCTIONS -// The instruction suffix for session analysis calls (user[1]). -// Contains the full analyst persona, schema, and quality guidance. -// Per-session variables (project name, summary, meta) go here — NOT in the -// cached conversation block. -// ============================================================================= - -/** - * Build the instruction suffix for session analysis. - * Used as the second content block in the user message, after the cached conversation. - */ -export function buildSessionAnalysisInstructions( - projectName: string, - sessionSummary: string | null, - meta?: SessionMetadata -): string { - return `You are a senior staff engineer writing entries for a team's engineering knowledge base. You've just observed an AI-assisted coding session and your job is to extract the insights that would save another engineer time if they encountered a similar situation 6 months from now. - -Your audience is a developer who has never seen this session but works on the same codebase. They need enough context to understand WHY a decision was made, WHAT specific gotcha was discovered, and WHEN this knowledge applies. - -Project: ${projectName} -${sessionSummary ? 
`Session Summary: ${sessionSummary}\n` : ''}${formatSessionMetaLine(meta)} -=== PART 1: SESSION FACETS === -Extract these FIRST as a holistic session assessment: - -1. outcome_satisfaction: Rate the session outcome. - - "high": Task completed successfully, user satisfied - - "medium": Partial completion or minor issues - - "low": Significant problems, user frustrated - - "abandoned": Session ended without achieving the goal - -2. workflow_pattern: Identify the dominant workflow pattern (or null if unclear). - Recommended values: "plan-then-implement", "iterative-refinement", "debug-fix-verify", "explore-then-build", "direct-execution" - -3. friction_points: Identify up to 5 moments where progress was blocked or slowed (array, max 5). - Each friction point has: - - _reasoning: (REQUIRED) Your reasoning chain for category + attribution. 2-3 sentences max. Walk through the decision tree steps. This field is saved but not shown to users — use it to think before classifying. - - category: Use one of these PREFERRED categories when applicable: ${CANONICAL_FRICTION_CATEGORIES.join(', ')}. Create a new kebab-case category only when none of these fit. - - attribution: "user-actionable" (better user input would have prevented this), "ai-capability" (AI failed despite adequate input), or "environmental" (external constraint) - - description: One neutral sentence describing what happened, with specific details (file names, APIs, errors) - - severity: "high" (blocked progress for multiple turns), "medium" (caused a detour), "low" (minor hiccup) - - resolution: "resolved" (fixed in session), "workaround" (bypassed), "unresolved" (still broken) -${FRICTION_CLASSIFICATION_GUIDANCE} - -4. effective_patterns: Up to 3 techniques or approaches that worked particularly well (array, max 3). - Each has: - - _reasoning: (REQUIRED) Your reasoning chain for category + driver. 2-3 sentences max. Walk through the decision tree steps and baseline exclusion check. 
This field is saved but not shown to users — use it to think before classifying. - - category: Use one of these PREFERRED categories when applicable: structured-planning, incremental-implementation, verification-workflow, systematic-debugging, self-correction, context-gathering, domain-expertise, effective-tooling. Create a new kebab-case category only when none fit. - - description: Specific technique worth repeating (1-2 sentences with concrete detail) - - confidence: 0-100 how confident you are this is genuinely effective - - driver: Who drove this pattern — "user-driven" (user explicitly requested it), "ai-driven" (AI exhibited it without prompting), or "collaborative" (both contributed or emerged from interaction) -${EFFECTIVE_PATTERN_CLASSIFICATION_GUIDANCE} - -5. had_course_correction: true if the user redirected the AI from a wrong approach, false otherwise -6. course_correction_reason: If had_course_correction is true, briefly explain what was corrected (or null) -7. iteration_count: Number of times the user had to clarify, correct, or re-explain something - -If the session has minimal friction and straightforward execution, use empty arrays for friction_points, set outcome_satisfaction to "high", and iteration_count to 0. - -=== PART 2: INSIGHTS === -Then extract these: - -You will extract: -1. **Summary**: A narrative of what was accomplished and the outcome -2. **Decisions**: Technical choices made — with full situation context, reasoning, rejected alternatives, trade-offs, and conditions for revisiting (max 3) -3. 
**Learnings**: Technical discoveries, gotchas, debugging breakthroughs — with the observable symptom, root cause, and a transferable takeaway (max 5) - -Quality Standards: -- Only include insights you would write in a team knowledge base for future reference -- Each insight MUST reference concrete details: specific file names, library names, error messages, API endpoints, or code patterns -- Do not invent file names, APIs, errors, or details not present in the conversation -- Rate your confidence in each insight's value (0-100). Only include insights you rate 70+. -- It is better to return 0 insights in a category than to include generic or trivial ones -- If a session is straightforward with no notable decisions or learnings, say so in the summary and leave other categories empty - -Length Guidance: -- Fill every field in the schema. An empty "trade_offs" or "revisit_when" is worse than a longer response. -- Total response: stay under 2000 tokens. If you must cut, drop lower-confidence insights rather than compressing high-confidence ones. -- Evidence: 1-3 short quotes per insight, referencing turn labels. -- Prefer precision over brevity — a specific 3-sentence insight beats a vague 1-sentence insight. - -DO NOT include insights like these (too generic/trivial): -- "Used debugging techniques to fix an issue" -- "Made architectural decisions about the codebase" -- "Implemented a new feature" (the summary already covers this) -- "Used React hooks for state management" (too generic without specifics) -- "Fixed a bug in the code" (what bug? what was the root cause?) -- Anything that restates the task without adding transferable knowledge - -Here is an example of an EXCELLENT insight — this is the quality bar: - -EXCELLENT learning: -{ - "title": "Tailwind v4 requires @theme inline{} for CSS variable utilities", - "symptom": "After Tailwind v3→v4 upgrade, custom utilities like bg-primary stopped working. 
Classes present in HTML but no styles applied.", - "root_cause": "Tailwind v4 removed tailwind.config.js theme extension. CSS variables in :root are not automatically available as utilities — must be registered via @theme inline {} in the CSS file.", - "takeaway": "When migrating Tailwind v3→v4 with shadcn/ui: add @theme inline {} mapping CSS variables, add @custom-variant dark for class-based dark mode, replace tailwindcss-animate with tw-animate-css.", - "applies_when": "Any Tailwind v3→v4 migration using CSS variables for theming, especially with shadcn/ui.", - "confidence": 95, - "evidence": ["User#12: 'The colors are all gone after the upgrade'", "Assistant#13: 'Tailwind v4 requires explicit @theme inline registration...'"] -} - -Extract insights in this JSON format: -{ - "facets": { - "outcome_satisfaction": "high | medium | low | abandoned", - "workflow_pattern": "plan-then-implement | iterative-refinement | debug-fix-verify | explore-then-build | direct-execution | null", - "had_course_correction": false, - "course_correction_reason": null, - "iteration_count": 0, - "friction_points": [ - { - "_reasoning": "User said 'fix the auth' without specifying OAuth vs session-based or which file. Step 1: not external — this is about the prompt, not infrastructure. Step 2: user could have specified which auth flow → user-actionable. Category: incomplete-requirements fits better than vague-request because specific constraints (which flow, which file) were missing, not the overall task description.", - "category": "incomplete-requirements", - "attribution": "user-actionable", - "description": "Missing specification of which auth flow (OAuth vs session) caused implementation of wrong provider in auth.ts", - "severity": "medium", - "resolution": "resolved" - }, - { - "_reasoning": "AI applied Express middleware pattern to a Hono route despite conversation showing Hono imports. Step 1: not external. Step 2: user provided clear Hono context in prior messages. 
Step 3: AI failed despite adequate input → ai-capability. Category: knowledge-gap — incorrect framework API knowledge was applied.", - "category": "knowledge-gap", - "attribution": "ai-capability", - "description": "Express-style middleware pattern applied to Hono route despite Hono imports visible in conversation context", - "severity": "high", - "resolution": "resolved" - } - ], - "effective_patterns": [ - { - "_reasoning": "Before editing, AI read 8 files across server/src/routes/ and server/src/llm/ to understand the data flow. Baseline check: 8 files across 2 directories = beyond routine (<5 file) reads. Step 1: no CLAUDE.md rule requiring this. Step 2: user didn't ask for investigation. Step 3: AI explored autonomously → ai-driven. Category: context-gathering (active investigation, not pre-existing knowledge).", - "category": "context-gathering", - "description": "Read 8 files across routes/ and llm/ directories to map the data flow before modifying the aggregation query, preventing a type mismatch that would have required rework", - "confidence": 88, - "driver": "ai-driven" - } - ] - }, - "summary": { - "title": "Brief title describing main accomplishment (max 80 chars)", - "content": "2-4 sentence narrative: what was the goal, what was done, what was the outcome. 
Mention the primary file or component changed.", - "outcome": "success | partial | abandoned | blocked", - "bullets": ["Each bullet names a specific artifact (file, function, endpoint) and what changed"] - }, - "decisions": [ - { - "title": "The specific technical choice made (max 80 chars)", - "situation": "What problem or requirement led to this decision point", - "choice": "What was chosen and how it was implemented", - "reasoning": "Why this choice was made — the key factors that tipped the decision", - "alternatives": [ - {"option": "Name of alternative", "rejected_because": "Why it was not chosen"} - ], - "trade_offs": "What downsides were accepted, what was given up", - "revisit_when": "Under what conditions this decision should be reconsidered (or 'N/A' if permanent)", - "confidence": 85, - "evidence": ["User#4: quoted text...", "Assistant#5: quoted text..."] - } - ], - "learnings": [ - { - "title": "Specific technical discovery or gotcha (max 80 chars)", - "symptom": "What went wrong or was confusing — the observable behavior that triggered investigation", - "root_cause": "The underlying technical reason — why it happened", - "takeaway": "The transferable lesson — what to do or avoid in similar situations, useful outside this project", - "applies_when": "Conditions under which this knowledge is relevant (framework version, configuration, etc.)", - "confidence": 80, - "evidence": ["User#7: quoted text...", "Assistant#8: quoted text..."] - } - ] -} - -Only include insights rated 70+ confidence. If you cannot cite evidence, drop the insight. Return empty arrays for categories with no strong insights. Max 3 decisions, 5 learnings. -Evidence should reference the labeled turns in the conversation (e.g., "User#2", "Assistant#5"). - -Respond with valid JSON only, wrapped in ... tags. 
Do not include any other text.`; -} - -// ============================================================================= -// PROMPT QUALITY INSTRUCTIONS -// The instruction suffix for prompt quality analysis calls (user[1]). -// ============================================================================= - -/** - * Build the instruction suffix for prompt quality analysis. - * Used as the second content block in the user message, after the cached conversation. - */ -export function buildPromptQualityInstructions( - projectName: string, - sessionMeta: { - humanMessageCount: number; - assistantMessageCount: number; - toolExchangeCount: number; - }, - meta?: SessionMetadata -): string { - return `You are a prompt engineering coach helping developers communicate more effectively with AI coding assistants. You review conversations and identify specific moments where better prompting would have saved time — AND moments where the user prompted particularly well. - -You will produce: -1. **Takeaways**: Concrete before/after examples the user can learn from (max 4) -2. **Findings**: Categorized findings for cross-session aggregation (max 8) -3. **Dimension scores**: 5 numeric dimensions for progress tracking -4. **Efficiency score**: 0-100 overall rating -5. **Assessment**: 2-3 sentence summary - -Project: ${projectName} -Session shape: ${sessionMeta.humanMessageCount} user messages, ${sessionMeta.assistantMessageCount} assistant messages, ${sessionMeta.toolExchangeCount} tool exchanges -${formatSessionMetaLine(meta)} -Before evaluating, mentally walk through the conversation and identify: -1. Each time the assistant asked for clarification that could have been avoided -2. Each time the user corrected the assistant's interpretation -3. Each time the user repeated an instruction they gave earlier -4. Whether critical context or requirements were provided late -5. Whether the user discussed the plan/approach before implementation -6. 
Moments where the user's prompt was notably well-crafted -7. If context compactions occurred, note that the AI may have lost context — repeated instructions IMMEDIATELY after a compaction are NOT a user prompting deficit -These are your candidate findings. Only include them if they are genuinely actionable. - -${PROMPT_QUALITY_CLASSIFICATION_GUIDANCE} - -Guidelines: -- Focus on USER messages only — don't critique the assistant's responses -- Be constructive, not judgmental — the goal is to help users improve -- A score of 100 means every user message was perfectly clear and complete -- A score of 50 means about half the messages could have been more efficient -- Include BOTH deficits and strengths — what went right matters as much as what went wrong -- If the user prompted well, say so — don't manufacture issues -- If the session had context compactions, do NOT penalize the user for repeating instructions immediately after a compaction — the AI lost context, not the user. Repetition unrelated to compaction events should still be flagged. 
- -Length Guidance: -- Max 4 takeaways (ordered: improve first, then reinforce), max 8 findings -- better_prompt must be a complete, usable prompt — not vague meta-advice -- assessment: 2-3 sentences -- Total response: stay under 2500 tokens - -Evaluate the user's prompting quality and respond with this JSON format: -{ - "efficiency_score": 75, - "message_overhead": 3, - "assessment": "2-3 sentence summary of prompting style and efficiency", - "takeaways": [ - { - "type": "improve", - "category": "late-constraint", - "label": "Short human-readable heading", - "message_ref": "User#5", - "original": "The user's original message (abbreviated)", - "better_prompt": "A concrete rewrite with the missing context included", - "why": "One sentence: why the original caused friction" - }, - { - "type": "reinforce", - "category": "precise-request", - "label": "Short human-readable heading", - "message_ref": "User#0", - "what_worked": "What the user did well", - "why_effective": "Why it led to a good outcome" - } - ], - "findings": [ - { - "category": "late-constraint", - "type": "deficit", - "description": "One neutral sentence with specific details", - "message_ref": "User#5", - "impact": "high", - "confidence": 90, - "suggested_improvement": "Concrete rewrite or behavioral change" - }, - { - "category": "precise-request", - "type": "strength", - "description": "One sentence describing what the user did well", - "message_ref": "User#0", - "impact": "medium", - "confidence": 85 - } - ], - "dimension_scores": { - "context_provision": 70, - "request_specificity": 65, - "scope_management": 80, - "information_timing": 55, - "correction_quality": 75 - } -} - -Category values — use these PREFERRED categories: -Deficits: ${CANONICAL_PQ_DEFICIT_CATEGORIES.join(', ')} -Strengths: ${CANONICAL_PQ_STRENGTH_CATEGORIES.join(', ')} -Create a new kebab-case category only when none of these fit. 
- -Rules: -- message_ref uses the labeled turns in the conversation (e.g., "User#0", "User#5") -- Only include genuinely notable findings, not normal back-and-forth -- Takeaways are the user-facing highlights — max 4, ordered: improve first, then reinforce -- Findings are the full categorized set for aggregation — max 8 -- If the user prompted well, include strength findings and reinforce takeaways — don't manufacture issues -- message_overhead is how many fewer messages the session could have taken with better prompts -- dimension_scores: each 0-100. Score correction_quality as 75 if no corrections were needed. - -Respond with valid JSON only, wrapped in ... tags. Do not include any other text.`; -} - -// ============================================================================= -// FACET-ONLY INSTRUCTIONS -// The instruction suffix for facet-only extraction calls (user[1]). -// ============================================================================= - -/** - * Build the instruction suffix for facet-only extraction (backfill path). - * Used as the second content block in the user message, after the cached conversation. - */ -export function buildFacetOnlyInstructions( - projectName: string, - sessionSummary: string | null, - meta?: SessionMetadata -): string { - return `You are assessing an AI coding session to extract structured metadata for cross-session pattern analysis. - -Project: ${projectName} -${sessionSummary ? `Session Summary: ${sessionSummary}\n` : ''}${formatSessionMetaLine(meta)} -Extract session facets — a holistic assessment of how the session went: - -1. outcome_satisfaction: "high" (completed successfully), "medium" (partial), "low" (problems), "abandoned" (gave up) -2. workflow_pattern: The dominant pattern, or null. Values: "plan-then-implement", "iterative-refinement", "debug-fix-verify", "explore-then-build", "direct-execution" -3. friction_points: Up to 5 moments where progress stalled (array). 
- Each: { _reasoning (3-step attribution decision tree reasoning), category (kebab-case, prefer: ${CANONICAL_FRICTION_CATEGORIES.join(', ')}), attribution ("user-actionable"|"ai-capability"|"environmental"), description (one neutral sentence with specific details), severity ("high"|"medium"|"low"), resolution ("resolved"|"workaround"|"unresolved") } -${FRICTION_CLASSIFICATION_GUIDANCE} -4. effective_patterns: Up to 3 things that worked well (array). - Each: { _reasoning (driver decision tree reasoning — check user infrastructure first), category (kebab-case, prefer: ${CANONICAL_PATTERN_CATEGORIES.join(', ')}), description (specific technique, 1-2 sentences), confidence (0-100), driver ("user-driven"|"ai-driven"|"collaborative") } -${EFFECTIVE_PATTERN_CLASSIFICATION_GUIDANCE} -5. had_course_correction: true/false — did the user redirect the AI? -6. course_correction_reason: Brief explanation if true, null otherwise -7. iteration_count: How many user clarification/correction cycles occurred - -Extract facets in this JSON format: -{ - "outcome_satisfaction": "high | medium | low | abandoned", - "workflow_pattern": "string or null", - "had_course_correction": false, - "course_correction_reason": null, - "iteration_count": 0, - "friction_points": [ - { - "_reasoning": "Reasoning for category + attribution classification", - "category": "kebab-case-category", - "attribution": "user-actionable | ai-capability | environmental", - "description": "One neutral sentence about the gap, with specific details", - "severity": "high | medium | low", - "resolution": "resolved | workaround | unresolved" - } - ], - "effective_patterns": [ - { - "_reasoning": "Reasoning for category + driver classification, including baseline check", - "category": "kebab-case-category", - "description": "technique", - "confidence": 85, - "driver": "user-driven | ai-driven | collaborative" - } - ] -} - -Respond with valid JSON only, wrapped in ... 
tags.`; -} - +// Re-exports from @code-insights/cli/analysis/prompts. +// Moved to CLI package so the CLI can use prompt builders for native analysis (--native mode). +export { + SHARED_ANALYST_SYSTEM_PROMPT, + buildCacheableConversationBlock, + buildSessionAnalysisInstructions, + buildPromptQualityInstructions, + buildFacetOnlyInstructions, +} from '@code-insights/cli/analysis/prompts'; diff --git a/server/src/llm/response-parsers.ts b/server/src/llm/response-parsers.ts index 542b5d6..6af7d40 100644 --- a/server/src/llm/response-parsers.ts +++ b/server/src/llm/response-parsers.ts @@ -1,200 +1,7 @@ -// LLM response parsing utilities. -// Extracted from prompts.ts — handles JSON extraction, repair, and validation. - -import { jsonrepair } from 'jsonrepair'; -import type { AnalysisResponse, ParseError, ParseResult, PromptQualityResponse, PromptQualityDimensionScores } from './prompt-types.js'; - -function buildResponsePreview(text: string, head = 200, tail = 200): string { - if (text.length <= head + tail + 20) return text; - return `${text.slice(0, head)}\n...[${text.length - head - tail} chars omitted]...\n${text.slice(-tail)}`; -} - -export function extractJsonPayload(response: string): string | null { - const tagged = response.match(/\s*([\s\S]*?)\s*<\/json>/i); - if (tagged?.[1]) return tagged[1].trim(); - const jsonMatch = response.match(/\{[\s\S]*\}/); - return jsonMatch ? jsonMatch[0] : null; -} - -/** - * Parse the LLM response into structured insights. 
- */ -export function parseAnalysisResponse(response: string): ParseResult { - const response_length = response.length; - - const preview = buildResponsePreview(response); - - const jsonPayload = extractJsonPayload(response); - if (!jsonPayload) { - console.error('No JSON found in analysis response'); - return { - success: false, - error: { error_type: 'no_json_found', error_message: 'No JSON found in analysis response', response_length, response_preview: preview }, - }; - } - - let parsed: AnalysisResponse; - try { - parsed = JSON.parse(jsonPayload) as AnalysisResponse; - } catch { - // Attempt repair — handles trailing commas, unclosed braces, truncated output - try { - parsed = JSON.parse(jsonrepair(jsonPayload)) as AnalysisResponse; - } catch (err) { - const msg = err instanceof Error ? err.message : String(err); - console.error('Failed to parse analysis response (after jsonrepair):', err); - return { - success: false, - error: { error_type: 'json_parse_error', error_message: msg, response_length, response_preview: preview }, - }; - } - } - - if (!parsed.summary || typeof parsed.summary.title !== 'string') { - console.error('Invalid analysis response structure'); - return { - success: false, - error: { error_type: 'invalid_structure', error_message: 'Missing or invalid summary field', response_length, response_preview: preview }, - }; - } - - // Guard against LLM returning non-array values (e.g. "decisions": "none"). - // || [] alone won't catch truthy non-arrays — Array.isArray is required. - parsed.decisions = Array.isArray(parsed.decisions) ? parsed.decisions : []; - parsed.learnings = Array.isArray(parsed.learnings) ? parsed.learnings : []; - - // Normalize facet arrays before monitors access .some() — a non-array truthy value - // (e.g. LLM returns "friction_points": "none") would throw a TypeError on .some(). 
- if (parsed.facets) { - if (!Array.isArray(parsed.facets.friction_points)) parsed.facets.friction_points = []; - if (!Array.isArray(parsed.facets.effective_patterns)) parsed.facets.effective_patterns = []; - } - - // Observability: two-tier tooling-limitation monitor. - // Tier 1: _reasoning contains misclassification signals NOT in a negation context → likely wrong category. - // Tier 2: no conflicting signals (or signal was negated) → generic reminder to verify. - // Re-evaluate after ~30 sessions with improved FRICTION_CLASSIFICATION_GUIDANCE. - if (parsed.facets?.friction_points?.some(fp => fp.category === 'tooling-limitation')) { - // Expanded regex covers both literal terms and GPT-4o paraphrasing patterns - const MISCLASS_SIGNALS = /rate.?limit|throttl|quota.?exceed|crash|fail.{0,10}unexpect|lost.?state|context.{0,10}(?:drop|lost|unavail)|wrong.?tool|different.?(?:approach|method)|(?:didn.t|did not|unaware).{0,10}(?:know|capabil)|(?:older|previous).?version|used to (?:work|be)|behavio.?r.?change/i; - const NEGATION_CONTEXT = /\bnot\b|\bnor\b|\bisn.t\b|\bwasn.t\b|\brule[d]? 
out\b|\brejected?\b|\beliminated?\b|\breclassif/i; - const toolingFps = parsed.facets.friction_points.filter(fp => fp.category === 'tooling-limitation'); - for (const fp of toolingFps) { - if (!fp._reasoning) { - console.warn('[friction-monitor] LLM classified friction as "tooling-limitation" without _reasoning — cannot verify'); - continue; - } - const matchResult = fp._reasoning.match(MISCLASS_SIGNALS); - if (matchResult) { - // Check if the signal appears in a negation context (model correctly eliminating the alternative) - const matchIdx = fp._reasoning.search(MISCLASS_SIGNALS); - const preceding = fp._reasoning.slice(Math.max(0, matchIdx - 40), matchIdx); - if (!NEGATION_CONTEXT.test(preceding)) { - console.warn(`[friction-monitor] Likely misclassification: "tooling-limitation" with reasoning mentioning "${matchResult[0]}" — review category`); - } - // If negated, the model correctly considered and rejected the alternative — no warning - } else { - console.warn('[friction-monitor] LLM classified friction as "tooling-limitation" — verify genuine tool limitation'); - } - } - } - - // Observability: warn when LLM returns effective_pattern without category or driver field, - // or with an unrecognized driver value. - // Catches models that ignore the classification instructions (especially smaller Ollama models). - // Remove after confirming classification quality over ~20 new sessions. 
- if (parsed.facets?.effective_patterns?.some(ep => !ep.category)) { - console.warn('[pattern-monitor] LLM returned effective_pattern without category field'); - } - if (parsed.facets?.effective_patterns?.some(ep => !ep.driver)) { - console.warn('[pattern-monitor] LLM returned effective_pattern without driver field — driver classification may be incomplete'); - } - const VALID_DRIVERS = new Set(['user-driven', 'ai-driven', 'collaborative']); - if (parsed.facets?.effective_patterns?.some(ep => ep.driver && !VALID_DRIVERS.has(ep.driver))) { - console.warn('[pattern-monitor] LLM returned unexpected driver value — check classification quality'); - } - - // Validation: check for missing _reasoning CoT scratchpad fields. - // These fields ensure the model walks through the attribution/driver decision trees - // before committing to classification values. - // (Monitoring period complete — warn calls removed after confirming CoT compliance) - if (parsed.facets?.friction_points?.some(fp => !fp._reasoning)) { - // Missing _reasoning: classification may lack decision-tree rigor - } - if (parsed.facets?.effective_patterns?.some(ep => !ep._reasoning)) { - // Missing _reasoning: classification may lack decision-tree rigor - } - - return { success: true, data: parsed }; -} - -export function parsePromptQualityResponse(response: string): ParseResult { - const response_length = response.length; - const preview = buildResponsePreview(response); - - const jsonPayload = extractJsonPayload(response); - if (!jsonPayload) { - console.error('No JSON found in prompt quality response'); - return { - success: false, - error: { error_type: 'no_json_found', error_message: 'No JSON found in prompt quality response', response_length, response_preview: preview }, - }; - } - - let parsed: PromptQualityResponse; - try { - parsed = JSON.parse(jsonPayload) as PromptQualityResponse; - } catch { - try { - parsed = JSON.parse(jsonrepair(jsonPayload)) as PromptQualityResponse; - } catch (err) { - const 
msg = err instanceof Error ? err.message : String(err); - console.error('Failed to parse prompt quality response (after jsonrepair):', msg); - return { - success: false, - error: { error_type: 'json_parse_error', error_message: msg, response_length, response_preview: preview }, - }; - } - } - - if (typeof parsed.efficiency_score !== 'number') { - console.error('Invalid prompt quality response: missing efficiency_score'); - return { - success: false, - error: { error_type: 'invalid_structure', error_message: 'Missing or invalid efficiency_score field', response_length, response_preview: preview }, - }; - } - - // Clamp and default - parsed.efficiency_score = Math.max(0, Math.min(100, Math.round(parsed.efficiency_score))); - parsed.message_overhead = parsed.message_overhead ?? 0; - parsed.assessment = parsed.assessment || ''; - // Guard against LLM returning non-array values (e.g. "findings": "none") — - // || [] alone won't catch truthy non-arrays, and .some() on line 166 would throw. - parsed.takeaways = Array.isArray(parsed.takeaways) ? parsed.takeaways : []; - parsed.findings = Array.isArray(parsed.findings) ? parsed.findings : []; - parsed.dimension_scores = parsed.dimension_scores || { - context_provision: 50, - request_specificity: 50, - scope_management: 50, - information_timing: 50, - correction_quality: 50, - }; - - // Clamp dimension scores - for (const key of Object.keys(parsed.dimension_scores) as Array) { - parsed.dimension_scores[key] = Math.max(0, Math.min(100, Math.round(parsed.dimension_scores[key] ?? 50))); - } - - // Validation: check for missing category or unexpected type values in findings. 
- // (Monitoring period complete — warn calls removed after confirming classification quality) - if (parsed.findings.some(f => !f.category)) { - // Finding missing category field - } - - if (parsed.findings.some(f => f.type && f.type !== 'deficit' && f.type !== 'strength')) { - // Finding has unexpected type value — expected deficit or strength - } - - return { success: true, data: parsed }; -} +// Re-exports from @code-insights/cli/analysis/response-parsers. +// Moved to CLI package so the CLI can use response parsers for native analysis (--native mode). +export { + extractJsonPayload, + parseAnalysisResponse, + parsePromptQualityResponse, +} from '@code-insights/cli/analysis/response-parsers';