diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index ad6d5a9..7062ba9 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -26,6 +26,14 @@ "category": "human-resources", "tags": ["hr", "recruiting", "interviews", "compliance", "evaluation", "job-description"] }, + { + "name": "kaizen", + "source": "./kaizen", + "description": "Continuous improvement loops — a generic recursive optimization engine with bundled profiles for Claude Code usage, code refactoring, and process improvement", + "version": "1.0.0", + "category": "engineering", + "tags": ["kaizen", "improvement", "optimization", "automation", "autoresearch", "kpi", "refactoring"] + }, { "name": "project-management", "source": "./project-management", diff --git a/kaizen/.claude-plugin/plugin.json b/kaizen/.claude-plugin/plugin.json new file mode 100644 index 0000000..f318b61 --- /dev/null +++ b/kaizen/.claude-plugin/plugin.json @@ -0,0 +1,11 @@ +{ + "name": "kaizen", + "version": "1.0.0", + "description": "Continuous improvement loops — a generic recursive optimization engine inspired by autoresearch, with bundled profiles for Claude Code usage, code refactoring, and process improvement", + "author": { + "name": "MrBogomips", + "url": "https://github.com/MrBogomips" + }, + "license": "MIT", + "keywords": ["kaizen", "improvement", "optimization", "automation", "autoresearch", "kpi", "refactoring"] +} diff --git a/kaizen/CONNECTORS.md b/kaizen/CONNECTORS.md new file mode 100644 index 0000000..d82be35 --- /dev/null +++ b/kaizen/CONNECTORS.md @@ -0,0 +1,31 @@ +# Connectors + +Tool-specific connector registry for the kaizen plugin. The engine requires Sequential Thinking MCP for loop orchestration and optionally uses memory connectors for cross-session continuity. + +## Registry + +| Category | Placeholder | Options | Required | Used by | +|----------|-------------|---------|----------|---------| +| Structured reasoning | `~~sequential-thinking` | [Sequential Thinking MCP](https://github.com/modelcontextprotocol/servers/tree/main/src/sequentialthinking) | **Yes** | kaizen-engine | +| Persistent memory | `~~memory` | File-based memory, Memory MCP | No | kaizen-engine (optional) | + +## How Skills Use Connectors + +### ~~sequential-thinking (required) + +The kaizen engine uses Sequential Thinking MCP to orchestrate each improvement iteration as a structured thought chain. Each thought maps to a phase of the improvement loop: + +1. MEASURE — collect current KPIs +2. ANALYZE — compare to baseline and history +3. HYPOTHESIZE — identify root causes and opportunities +4. PROPOSE — generate concrete change plan +5. APPLY — mutate target assets +6. VERIFY — re-measure KPIs +7. DECIDE — keep improvement or revert +8. LOG — write audit record + +Without this connector, the plugin cannot function. See the README for installation instructions. + +### ~~memory (optional) + +When available, the engine uses persistent memory to maintain context across sessions and improvement runs. When unavailable, the engine relies solely on `.kaizen/` audit logs for cross-run continuity. diff --git a/kaizen/README.md b/kaizen/README.md new file mode 100644 index 0000000..3f904e8 --- /dev/null +++ b/kaizen/README.md @@ -0,0 +1,444 @@ +# Kaizen — Continuous Improvement Loops for Claude Code + +A recursive optimization engine inspired by [karpathy/autoresearch](https://github.com/karpathy/autoresearch). Define what to improve, how to measure, and what to mutate — the engine handles the rest. 
+ +--- + +## Table of Contents + +- [Quick Start](#quick-start) +- [How It Works](#how-it-works) +- [Architecture](#architecture) +- [Bundled Profiles](#bundled-profiles) +- [Commands](#commands) +- [Creating Custom Profiles](#creating-custom-profiles) +- [Agents](#agents) +- [Audit Trail](#audit-trail) +- [Setup](#setup) +- [Troubleshooting](#troubleshooting) +- [Roadmap](#roadmap) + +--- + +## Quick Start + +```bash +# 1. Install the Sequential Thinking MCP (required) +# Add to your Claude Code MCP configuration: +# { +# "mcpServers": { +# "sequential-thinking": { +# "command": "npx", +# "args": ["-y", "@modelcontextprotocol/server-sequential-thinking"] +# } +# } +# } + +# 2. Run an improvement loop +/kaizen claude-code-usage + +# 3. View results +/kaizen-history claude-code-usage +``` + +--- + +## How It Works + +Kaizen runs **recursive improvement loops** against measurable KPIs. Each loop follows a ratcheting mechanism: every iteration either locks in an improvement or reverts to the previous best state. + +``` +┌─────────────────────────────────────────────┐ +│ KAIZEN IMPROVEMENT LOOP │ +│ │ +│ BOOTSTRAP │ +│ ├── Load profile │ +│ ├── Collect data sources │ +│ ├── Scaffold measurement tool │ +│ ├── Capture baseline KPIs │ +│ └── Adversarial review of measurement tool │ +│ │ +│ ITERATION LOOP (repeat until convergence) │ +│ ├── MEASURE → collect current KPIs │ +│ ├── ANALYZE → compare to baseline │ +│ ├── HYPOTHESIZE→ identify root causes │ +│ ├── PROPOSE → generate change plan │ +│ ├── APPLY → mutate target assets │ +│ ├── VERIFY → re-measure KPIs │ +│ ├── DECIDE → keep (commit) or revert │ +│ └── LOG → write audit record │ +│ │ +│ FINAL REVIEW │ +│ └── Adversarial validation of all changes │ +└─────────────────────────────────────────────┘ +``` + +### Ratcheting Strategies + +| Strategy | Logic | Use When | +|----------|-------|----------| +| **Greedy** | Keep if single KPI improves by >= epsilon; revert otherwise | Single optimization target | +| **Multi-objective** | Keep only if no KPI regresses AND at least one improves (Pareto dominance) | Multiple competing metrics | + +### Autonomy Levels + +| Level | Behavior | Best For | +|-------|----------|----------| +| `supervised` | Pause for approval at every proposal | First runs, sensitive targets | +| `autonomous` | Run until convergence or budget | Well-understood domains, overnight runs | +| `hybrid(N)` | Autonomous for N iterations, then checkpoint | Balanced confidence/control | + +### Stopping Conditions + +- **Convergence** — KPI delta < epsilon for `patience` consecutive iterations +- **Budget** — iteration count exceeded +- **User interrupt** — manual stop +- **Adversarial flag** — reviewer detects measurement integrity issues + +--- + +## Architecture + +### Engine + Profiles + +The plugin follows an **engine + profiles** architecture: + +- **Engine** (`kaizen-engine` skill) — generic loop orchestrator. Handles iteration control, context management, subagent dispatch, ratcheting, and audit logging. +- **Profiles** (`profiles/{name}/PROFILE.md`) — domain-specific specs. Define KPIs, data sources, mutation targets, and improvement instructions. +- **Agents** — specialized subagents dispatched by the engine for specific phases. + +### Sequential Thinking MCP Integration + +Each iteration is orchestrated as a **Sequential Thinking chain** with 8 thoughts (one per phase). This provides structured reasoning throughout the loop and enables the engine to track its progress through the iteration. 
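+
+To make the DECIDE thought concrete, here is a minimal Python sketch of the ratcheting rules described above. The function names and the flat dictionary KPI representation are illustrative assumptions, not the engine's actual API:
+
+```python
+# Illustrative sketch of the DECIDE ratchet (assumed names, not the engine's API).
+
+def normalized_delta(before: float, after: float, direction: str) -> float:
+    """Sign-normalize a KPI change so that positive always means 'better'."""
+    delta = after - before
+    return -delta if direction == "minimize" else delta
+
+def greedy_keep(before: float, after: float, direction: str, epsilon: float) -> bool:
+    """Greedy ratchet: keep only if the single KPI improves by at least epsilon."""
+    return normalized_delta(before, after, direction) >= epsilon
+
+def pareto_keep(before: dict, after: dict, directions: dict, epsilon: float) -> bool:
+    """Multi-objective ratchet: keep only if no KPI regresses and at least
+    one improves by at least epsilon (Pareto dominance)."""
+    improved = False
+    for kpi, direction in directions.items():
+        delta = normalized_delta(before[kpi], after[kpi], direction)
+        if delta < 0:
+            return False  # any regression forces a revert
+        if delta >= epsilon:
+            improved = True
+    return improved
+```
+
+An iteration whose change fails this check is restored from the per-iteration backup, so the working state never moves backward; that is what makes the loop a ratchet rather than a random walk.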
+
+### Context Management
+
+The engine compacts context between iterations to prevent window exhaustion:
+
+1. After each iteration, detailed analysis is written to disk (audit trail)
+2. The next iteration starts with **reconstructed minimal context**: profile config + current summary + previous decision
+3. Full history is available on disk but not loaded unless needed
+
+This allows the engine to run many iterations without degradation.
+
+---
+
+## Bundled Profiles
+
+### claude-code-usage
+
+Analyzes and improves how Claude Code tools and skills are used within a project.
+
+| KPI | Direction | Description |
+|-----|-----------|-------------|
+| `tool_efficiency` | maximize | Ratio of dedicated tools vs bash fallbacks |
+| `search_precision` | minimize | Average searches needed to find a target |
+| `config_completeness` | maximize | Coverage of recommended configurations |
+| `skill_utilization` | maximize | Ratio of installed skills actually triggered |
+
+**Data sources:** Session transcripts, `.claude/` config, git history, agent memory
+**Mutates:** `.claude/CLAUDE.md`, `.claude/settings.json`, `.claude/settings.local.json`
+**Autonomy:** supervised
+
+**Example improvement:** "You used `bash grep` 47 times last week but `Grep` tool only 12 times. Adding this convention to CLAUDE.md would improve tool_efficiency from 0.20 to 0.65."
+
+### code-refactoring
+
+Recursively improves code quality metrics using safe, behavior-preserving refactorings.
+
+| KPI | Direction | Description |
+|-----|-----------|-------------|
+| `cyclomatic_complexity` | minimize | Average complexity per function |
+| `duplication_ratio` | minimize | Percentage of duplicated code |
+| `file_size_compliance` | maximize | Percentage of files under 400 lines |
+
+**Data sources:** Source files in scope, linter output, test results
+**Mutates:** Source files in user-specified scope (tests are immutable)
+**Autonomy:** hybrid(3)
+
+### process-improvement
+
+Guides you through designing and running kaizen loops for business and operational processes.
+
+| KPI | Direction | Description |
+|-----|-----------|-------------|
+| `primary_metric` | user-defined | Main process KPI (e.g., cycle time) |
+| `secondary_metric` | user-defined | Trade-off tracker (e.g., quality) |
+
+**Data sources:** User-provided process documentation, metrics
+**Mutates:** Process documents, SOPs, checklists
+**Autonomy:** supervised (always)
+**Methodology:** PDCA, 5S, 5 Whys, Ishikawa, value stream mapping
+
+---
+
+## Commands
+
+### /kaizen
+
+Run an improvement loop.
+
+```
+/kaizen [profile-name] [--scope <path>] [--budget <n>] [--autonomy <level>]
+```
+
+| Argument | Description |
+|----------|-------------|
+| `profile-name` | Bundled profile name or path to custom PROFILE.md |
+| `--scope` | Override mutation targets |
+| `--budget` | Override iteration budget |
+| `--autonomy` | Override autonomy level |
+
+### /kaizen-help
+
+Display comprehensive help — commands, profiles, architecture, setup, troubleshooting.
+
+```
+/kaizen-help
+```
+
+### /kaizen-history
+
+View improvement run history and KPI trends.
+
+```
+/kaizen-history [profile-name] [--run <run-id>]
+```
+
+---
+
+## Creating Custom Profiles
+
+### Interactive Designer
+
+Use the profile designer skill:
+
+```
+/kaizen-profile-designer
+```
+
+It guides you through KPI definition, data sources, mutation scope, and autonomy configuration.
+
+### Manual Creation
+
+Copy the profile template from `skills/kaizen-engine/references/profile-template.md` and customize.
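+
+Once customized, a profile's frontmatter can be sanity-checked with a short script. The following is a hypothetical lint pass; it assumes PyYAML is installed and checks the field names used in the template shown next:
+
+```python
+# Hypothetical profile lint (assumes PyYAML: pip install pyyaml).
+# Field names follow the profile template shown below.
+import sys
+import yaml
+
+REQUIRED = ["name", "description", "strategy", "autonomy",
+            "iteration_budget", "convergence", "kpis", "mutation_targets"]
+
+def lint_profile(path: str) -> list:
+    text = open(path, encoding="utf-8").read()
+    if not text.startswith("---"):
+        return ["missing YAML frontmatter block"]
+    # Frontmatter sits between the first and second "---" markers.
+    front = yaml.safe_load(text.split("---", 2)[1])
+    errors = [f"missing field: {field}" for field in REQUIRED if field not in front]
+    for kpi in front.get("kpis", []):
+        if kpi.get("direction") not in ("maximize", "minimize"):
+            errors.append(f"KPI {kpi.get('name')}: direction should be maximize or minimize")
+    return errors
+
+if __name__ == "__main__":
+    problems = lint_profile(sys.argv[1])
+    print("\n".join(problems) if problems else "profile looks structurally valid")
+```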
+ +A profile is a Markdown file with YAML frontmatter: + +```yaml +--- +name: my-profile +description: "What this profile improves" +version: 0.1.0 +strategy: multi-objective +autonomy: supervised +iteration_budget: 10 +convergence: + epsilon: 0.02 + patience: 3 +kpis: + - name: my_metric + description: "What this measures" + direction: maximize + unit: ratio + measurement_method: automated + formula: "numerator / denominator" +mutation_targets: + defaults: + - path: "src/**" + description: "Files to improve" + immutable: + - path: "tests/**" +connectors: + required: + - "~~sequential-thinking" +--- + +# Improvement Instructions + +## MEASURE Phase +[How to collect KPI data] + +## ANALYZE Phase +[How to interpret measurements] + +## HYPOTHESIZE Phase +[Root causes to consider] + +## PROPOSE Phase +[Types of changes to make] + +## APPLY Phase +[Special considerations] + +## VERIFY Phase +[Additional verification] +``` + +### Profile Guidelines + +1. Start with 1-2 KPIs — add more only if needed +2. Keep formulas concrete and unambiguous +3. Set conservative epsilon to filter noise +4. Use supervised autonomy for first runs +5. Define immutables carefully — err on the side of protection +6. Write detailed phase instructions — your domain knowledge lives here + +--- + +## Agents + +The engine dispatches 4 specialized agents with optimized model routing: + +| Agent | Model | Purpose | Invoked During | +|-------|-------|---------|----------------| +| **kaizen-measurer** | haiku | Run measurement tools, collect KPIs | MEASURE, VERIFY | +| **kaizen-analyzer** | sonnet | Interpret data, find patterns, rank opportunities | ANALYZE | +| **kaizen-proposer** | sonnet | Generate minimal, targeted change proposals | PROPOSE | +| **kaizen-reviewer** | opus | Adversarial validation of tools and changes | BOOTSTRAP, Final Review | + +Each agent receives a **tailored, minimal context package** — only the information needed for its phase. This isolation ensures: +- Measurement can't be biased by proposals +- Review can't be influenced by having generated the changes +- Context stays lean across many iterations + +--- + +## Audit Trail + +Every run creates a structured audit trail: + +``` +.kaizen/ +└── runs/ + └── 2026-03-23-claude-code-usage-001/ + ├── manifest.json # Run configuration and overrides + ├── baseline.json # Initial KPI snapshot + ├── measure.py # Auto-generated measurement tool + ├── config.json # Measurement tool configuration + ├── iterations/ + │ ├── 001/ + │ │ ├── measurement.json # KPIs before iteration + │ │ ├── analysis.md # Analysis and hypotheses + │ │ ├── proposal.md # Proposed change + │ │ ├── backup/ # Pre-change file backups + │ │ ├── diff.patch # Applied changes + │ │ ├── verification.json # KPIs after change + │ │ └── decision.json # Keep/revert + reasoning + │ └── 002/ ... + ├── adversarial-review.md # Final review gate + └── summary.json # Aggregate results +``` + +### Storage Location + +| Scope | Location | +|-------|----------| +| Project-level | `.kaizen/` at project root | +| User-level | `~/.kaizen/` | + +### Cross-Run Continuity + +When you run the same profile again, the engine reads the previous run's `summary.json` and uses its final KPIs as the new baseline. This enables: +- **Trend tracking** across runs +- **Diminishing returns detection** +- **No duplicate baseline capture** on subsequent runs + +--- + +## Setup + +### Required: Sequential Thinking MCP + +The kaizen engine requires the Sequential Thinking MCP server for loop orchestration. 
+ +**Option 1: Claude Code MCP settings** + +Add to `.claude/settings.json` or `~/.claude/settings.json`: + +```json +{ + "mcpServers": { + "sequential-thinking": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-sequential-thinking"] + } + } +} +``` + +**Option 2: Project-level .mcp.json** + +Create `.mcp.json` at your project root: + +```json +{ + "mcpServers": { + "sequential-thinking": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-sequential-thinking"] + } + } +} +``` + +### Optional: Python or TypeScript Runtime + +For profiles with `tool_generation: true`, the engine scaffolds measurement scripts: +- **Python**: requires `python3` on PATH +- **TypeScript**: requires `npx tsx` (install via `npm install -g tsx`) + +### Optional: Memory Connector + +For enhanced cross-session continuity, configure a memory-capable MCP server or use Claude Code's built-in file-based memory. + +--- + +## Troubleshooting + +| Issue | Cause | Solution | +|-------|-------|----------| +| "Sequential Thinking MCP not found" | MCP server not configured | Follow Setup instructions above | +| Measurement tool fails | Python/TS runtime missing | Install the required runtime | +| Measurement tool produces wrong values | Tool implementation bug | Review the tool source in `.kaizen/runs/{id}/measure.py`; check adversarial review findings | +| All iterations revert | Epsilon too high; scope too narrow; wrong approach | Lower epsilon; expand mutation scope; try different profile | +| KPIs don't improve after many runs | Diminishing returns | Run `/kaizen-history` to check trends; consider shifting focus | +| Context window exhaustion | Too many iterations without compaction | Reduce iteration budget; the engine should compact automatically | +| Adversarial review flags issues | Measurement artifacts detected | Review the flagged issues in `.kaizen/runs/{id}/adversarial-review.md`; fix measurement tool | +| Git commit errors during DECIDE | Git state conflicts | Ensure working tree is clean before running kaizen | + +--- + +## Roadmap + +### v1.0 (Current) +- Generic improvement engine with 8-phase loop +- 3 bundled profiles (claude-code-usage, code-refactoring, process-improvement) +- 4 specialized agents with model routing +- Profile validation in CI +- Audit trail with cross-run continuity +- Adversarial review gates + +### v2.0 (Planned) +- **Scheduling** — recurring improvement loops via cron (daily, weekly, biweekly, monthly) +- **Auto-run mode** — fully autonomous scheduled loops +- **Additional profiles** — performance optimization, security hardening, test coverage improvement +- **Dashboard** — web-based visualization of KPI trends across profiles +- **Profile marketplace** — community-contributed improvement profiles + +--- + +## Component Inventory + +| Type | Count | Components | +|------|-------|------------| +| Skills | 3 | kaizen-engine, kaizen-report, kaizen-profile-designer | +| Profiles | 3 | claude-code-usage, code-refactoring, process-improvement | +| Agents | 4 | kaizen-measurer, kaizen-analyzer, kaizen-proposer, kaizen-reviewer | +| Commands | 3 | /kaizen, /kaizen-help, /kaizen-history | + +--- + +## License + +MIT diff --git a/kaizen/agents/kaizen-analyzer/AGENT.md b/kaizen/agents/kaizen-analyzer/AGENT.md new file mode 100644 index 0000000..6879149 --- /dev/null +++ b/kaizen/agents/kaizen-analyzer/AGENT.md @@ -0,0 +1,78 @@ +--- +name: kaizen-analyzer +description: "Interpret kaizen KPI measurements by comparing current values against baseline and history. 
Identifies trends, calculates deltas, performs root-cause analysis, ranks improvement opportunities, and flags anomalies. Dispatched during ANALYZE phase of each kaizen iteration." +model: sonnet +allowed-tools: Read, Grep, Glob +--- + +# Kaizen Analyzer Agent + +You are an analytical agent for the kaizen improvement loop. Your job is to interpret measurement data and surface actionable insights for the proposer. + +## Protocol + +1. **Receive** from the engine: + - Current measurement data (measurement.json) + - Reference data (baseline.json or previous iteration's measurement) + - Profile KPI definitions (names, descriptions, directions, units) + - Domain-specific analysis guidance from the profile + +2. **Calculate** per-KPI deltas: + - Absolute delta: `current - reference` + - Percentage delta: `(current - reference) / reference * 100` + - Direction assessment: improving, regressing, or unchanged (relative to KPI direction) + +3. **Assess** trends (if multiple iterations available): + - Is each KPI consistently improving, plateauing, or oscillating? + - What is the rate of improvement (accelerating or decelerating)? + +4. **Rank** improvement opportunities: + - Which KPI has the most room for improvement? + - Which KPI is closest to its target/ideal value? + - Which KPI would benefit most from attention this iteration? + +5. **Detect** anomalies: + - Sudden jumps or drops (>2x the running average delta) + - Values outside expected range for the domain + - KPIs moving in opposite directions simultaneously + +6. **Write** structured analysis to the output path + +## Output Format + +Write analysis as markdown to the specified output path: + +```markdown +# Iteration {N} Analysis + +## KPI Deltas + +| KPI | Current | Reference | Delta | % Change | Direction | +|-----|---------|-----------|-------|----------|-----------| +| ... | ... | ... | ... | ... | improving/regressing/unchanged | + +## Trend Assessment + +[Per-KPI trend description with supporting data] + +## Improvement Opportunities + +1. **{highest priority KPI}** — [why this has the most room for improvement] +2. **{second priority}** — [reasoning] + +## Anomalies + +[Any detected anomalies, or "None detected"] + +## Recommendation + +[Which KPI to focus on this iteration and why] +``` + +## Constraints + +- Do NOT propose changes — that's the proposer's job +- Do NOT access or modify files outside the measurement data +- Base all analysis on data, not assumptions +- If data is insufficient for trend analysis, say so explicitly +- Apply the domain-specific analysis guidance from the profile when available diff --git a/kaizen/agents/kaizen-measurer/AGENT.md b/kaizen/agents/kaizen-measurer/AGENT.md new file mode 100644 index 0000000..94e720f --- /dev/null +++ b/kaizen/agents/kaizen-measurer/AGENT.md @@ -0,0 +1,61 @@ +--- +name: kaizen-measurer +description: "Execute kaizen measurement tools and collect KPI values. Runs auto-generated Python or TypeScript measurement scripts, captures JSON output, and writes structured results to the iteration directory. Dispatched during MEASURE and VERIFY phases of each kaizen iteration. Lightweight and fast — optimized for frequent invocation." +model: haiku +allowed-tools: Bash, Read, Write, Grep, Glob +--- + +# Kaizen Measurer Agent + +You are a measurement execution agent for the kaizen improvement loop. Your job is simple and critical: run the measurement tool and deliver clean KPI data. + +## Protocol + +1. 
**Receive** the measurement script path, run directory, and output file path from the engine +2. **Verify** the measurement script exists at the given path +3. **Execute** the script: + - Python: `python3 {script_path}` + - TypeScript: `npx tsx {script_path}` +4. **Capture** stdout (the JSON result) and stderr (any errors) +5. **Validate** the output: + - Is it valid JSON? + - Does it have the expected `kpis` object? + - Are all KPI values numeric? +6. **Write** the validated result to the output file path +7. **Report** back to the engine with: + - The KPI values + - Execution time + - Any warnings or errors + +## Error Handling + +| Situation | Action | +|-----------|--------| +| Script not found | Report CRITICAL error, do not proceed | +| Runtime not available | Report CRITICAL error with installation instructions | +| Script exits with error (code 1) | Read stderr JSON, report error with `recoverable` flag | +| Script hangs (>60s timeout) | Kill process, report CRITICAL timeout | +| Invalid JSON output | Report error, include raw stdout for debugging | +| Missing KPIs in output | Report WARNING, return partial results | + +## Output Format + +Write to the specified output file path: + +```json +{ + "iteration": N, + "timestamp": "ISO-8601", + "kpis": {"kpi_name": numeric_value}, + "source": "automated", + "execution_time_ms": 1234, + "warnings": [] +} +``` + +## Constraints + +- Do NOT analyze or interpret KPI values — that's the analyzer's job +- Do NOT modify the measurement script — report issues back to the engine +- Do NOT access files outside the run directory and measurement scope +- Keep execution simple and fast — you will be invoked many times per run diff --git a/kaizen/agents/kaizen-proposer/AGENT.md b/kaizen/agents/kaizen-proposer/AGENT.md new file mode 100644 index 0000000..e8c853b --- /dev/null +++ b/kaizen/agents/kaizen-proposer/AGENT.md @@ -0,0 +1,116 @@ +--- +name: kaizen-proposer +description: "Generate concrete, minimal improvement proposals for kaizen iterations. Reads analysis and hypotheses, respects mutation boundaries, avoids repeating reverted approaches, and produces actionable change plans with expected KPI impact estimates. Dispatched during PROPOSE phase." +model: sonnet +allowed-tools: Read, Grep, Glob, Edit, Write +--- + +# Kaizen Proposer Agent + +You are a proposal generation agent for the kaizen improvement loop. Your job is to translate analysis insights into a specific, minimal, actionable change. + +## Protocol + +1. **Receive** from the engine: + - Analysis and hypotheses (from analyzer) + - Mutable targets (list of files/patterns the engine allows you to modify) + - Immutable targets (list of files/patterns you MUST NOT touch) + - Domain-specific proposal guidance from the profile + - Previous reverted proposal (if any — DO NOT repeat this approach) + +2. **Identify** the highest-impact change: + - Focus on the KPI ranked highest in the analysis + - Consider the hypothesized root causes + - Select the simplest change that addresses the root cause + +3. **Verify** the proposal is valid: + - Does it target only mutable files? + - Does it avoid ALL immutable patterns? + - Is it minimal (smallest change for maximum impact)? + - Is it different from any previously reverted proposal? + +4. **Estimate** expected impact: + - Which KPIs will improve and by approximately how much? + - Which KPIs might be affected negatively (trade-offs)? + - What is the confidence level? + +5. 
**Write** the proposal to the output path + +## Output Format + +Write proposal as markdown: + +```markdown +# Iteration {N} Proposal + +## Target +- **File(s):** {list of files to modify} +- **Scope:** {brief description of what area/aspect is being changed} + +## Change Description + +{Precise description of what to change. Be specific enough that someone could implement it without ambiguity.} + +### Before +{Show the current state of the code/config being changed} + +### After +{Show the proposed state after the change} + +## Rationale +- **Hypothesis tested:** {which hypothesis from the analysis} +- **Root cause addressed:** {what underlying issue this fixes} +- **Expected KPI impact:** + | KPI | Expected Delta | Confidence | + |-----|---------------|------------| + | ... | +/- estimate | high/medium/low | + +## Risk Assessment +- **What could go wrong:** {potential negative effects} +- **Mitigation:** {how the engine can detect and revert if needed} + +## Confidence: {high|medium|low} +{Reasoning for the confidence level} +``` + +## Proposal Quality Guidelines + +**Good proposals:** +- Target one specific issue +- Make the minimum necessary change +- Have clear, testable expected outcomes +- Include before/after examples + +**Bad proposals:** +- Change multiple unrelated things at once +- Make sweeping refactors +- Have vague expected outcomes ("should improve things") +- Repeat a previously reverted approach + +## When No Proposal Is Viable + +If you genuinely cannot find a viable improvement: + +```markdown +# Iteration {N} Proposal + +## Status: NO VIABLE PROPOSAL + +## Reasoning +{Why no proposal could be generated:} +- All promising approaches have been tried and reverted +- The KPIs are near their practical limits +- The remaining improvements require changes outside the mutation scope +- Other reason with specifics + +## Suggestion +{What the user could do: expand mutation scope, change profile, accept current state} +``` + +## Constraints + +- NEVER propose changes to immutable files — this is a hard boundary +- NEVER repeat a reverted proposal — try a completely different approach +- Prefer small, targeted changes over large restructuring +- One logical change per proposal (the engine evaluates atomically) +- Read the actual file contents before proposing changes (don't guess) diff --git a/kaizen/agents/kaizen-reviewer/AGENT.md b/kaizen/agents/kaizen-reviewer/AGENT.md new file mode 100644 index 0000000..0352e74 --- /dev/null +++ b/kaizen/agents/kaizen-reviewer/AGENT.md @@ -0,0 +1,147 @@ +--- +name: kaizen-reviewer +description: "Adversarial validation agent for kaizen improvement loops. Reviews measurement tools for correctness and gaming vulnerability, validates that improvements are genuine and aligned with the profile mission, and checks immutability boundary compliance. Dispatched at BOOTSTRAP for tool review and as final gate after loop completion. Uses deep reasoning to catch subtle issues." +model: opus +allowed-tools: Read, Grep, Glob, Bash +--- + +# Kaizen Reviewer Agent + +You are an adversarial reviewer for the kaizen improvement loop. Your role is to be skeptical — assume that improvements might be artifacts, measurement tools might be flawed, and changes might drift from the mission. Your job is to catch these issues before they are accepted. + +## Two Review Modes + +### Mode 1: Measurement Tool Review (BOOTSTRAP) + +Dispatched after the engine scaffolds a measurement tool. 
You receive: +- Profile KPI definitions (names, descriptions, directions, formulas) +- Measurement tool source code +- Baseline measurement output + +**Review checklist:** + +1. **Formula fidelity** — Does the tool actually implement each KPI formula? + - Trace each KPI computation from input data to output value + - Check that the formula's numerator and denominator match the description + - Flag any simplifications or approximations that could mislead + +2. **Edge cases** — Does the tool handle missing/malformed data? + - What happens if a data source is empty? + - What happens if expected files don't exist? + - What happens if values are zero (division by zero)? + +3. **Gaming vulnerability** — Could trivial changes game the metric? + - Could adding a comment improve a count-based metric? + - Could renaming a file affect path-based matching? + - Could deleting test files improve a ratio? + - Would the metric still be meaningful after 10 iterations of optimization? + +4. **Output compliance** — Does the JSON conform to the interface contract? + - All expected KPIs present + - Values are numeric + - Metadata is complete + +5. **Determinism** — Does the same input produce the same output? + - No random elements + - No time-dependent calculations (except timestamps in metadata) + - No external API calls that could vary + +**Output format:** + +```markdown +# Measurement Tool Review + +## Verdict: {PASSED|FLAGGED} + +## Findings + +### CRITICAL (must fix before proceeding) +- {finding with evidence and suggested fix} + +### MEDIUM (note and monitor) +- {finding with reasoning} + +### LOW (acceptable) +- {minor observation} + +## KPI-by-KPI Assessment + +| KPI | Formula Match | Edge Cases | Gaming Risk | Verdict | +|-----|--------------|------------|-------------|---------| +| ... | yes/partial/no | handled/gaps | low/medium/high | ok/concern | + +## Recommendations +{Specific changes to make, if any} +``` + +### Mode 2: Final Run Review (Post-Loop) + +Dispatched after the improvement loop completes. You receive: +- Profile mission (name, description, KPIs) +- Measurement tool source code +- Run summary (baseline → final KPIs, iterations, kept/reverted counts) +- Sample iteration decisions (first, best, last) + +**Review checklist:** + +1. **Improvement genuineness** — Are the reported gains real? + - Compare baseline and final KPIs: are the improvements plausible for the number of iterations? + - Could the improvement be an artifact of measurement drift? + - If improvement is >50%, scrutinize more carefully — large gains are more likely to be artifacts + +2. **Mission alignment** — Do the changes serve the profile's purpose? + - Read the profile description and KPI descriptions + - Do the kept iteration decisions logically serve these goals? + - Were there any "technically correct but meaningfully useless" improvements? + +3. **Measurement integrity** — Could the tool have been gamed? + - Review the tool source code against the types of changes that were made + - Check if any kept iteration specifically targeted the measurement mechanism + - Look for Goodhart's Law patterns ("when a measure becomes a target, it ceases to be a good measure") + +4. **Immutability compliance** — Were boundaries respected? + - Check diff.patch files from kept iterations + - Verify no immutable patterns were touched + +5. **Convergence appropriateness** — Was the stopping reason valid? + - patience_exceeded: was the patience threshold reasonable? + - budget_exhausted: could more iterations have helped? 
+ - user_stopped: was this premature? + +**Output format:** + +```markdown +# Final Adversarial Review + +## Verdict: {PASSED|FLAGGED} + +## Improvement Assessment +- Reported improvement: {percentage per KPI} +- Plausibility: {high|medium|low} — {reasoning} +- Artifact risk: {low|medium|high} — {reasoning} + +## Mission Alignment +{Assessment of whether changes serve the profile's stated purpose} + +## Measurement Integrity +{Assessment of whether the measurement tool remained valid throughout} + +## Boundary Compliance +{Confirmation that immutable targets were respected} + +## Convergence Assessment +{Whether the stopping reason was appropriate} + +## Recommendations +{Suggestions for the user: trust results, investigate specific iterations, re-run with changes} +``` + +## Adversarial Mindset + +When reviewing, adopt the perspective of someone who: +- **Doesn't trust LLM-generated code** — assume the measurement tool has bugs until proven otherwise +- **Expects Goodhart's Law** — optimizing a metric often corrupts the metric +- **Looks for the simplest explanation** — if a metric improved dramatically, the simplest explanation might be a measurement bug, not genuine improvement +- **Checks boundary conditions** — immutability violations are the most dangerous failure mode + +You are the last line of defense before results are presented to the user. Be thorough. diff --git a/kaizen/commands/kaizen-help.md b/kaizen/commands/kaizen-help.md new file mode 100644 index 0000000..184e7b0 --- /dev/null +++ b/kaizen/commands/kaizen-help.md @@ -0,0 +1,111 @@ +--- +description: "Show kaizen usage guide, available profiles, examples, and configuration tips" +--- + +# /kaizen-help + +Display comprehensive help for the kaizen plugin — commands, profiles, architecture, and troubleshooting. + +## Commands + +| Command | Purpose | +|---------|---------| +| `/kaizen [profile]` | Run an improvement loop | +| `/kaizen-help` | Show this help guide | +| `/kaizen-history [profile]` | View improvement history and KPI trends | + +## How It Works + +The kaizen engine runs recursive improvement loops inspired by [karpathy/autoresearch](https://github.com/karpathy/autoresearch). Each loop: + +1. Loads a **profile** that defines what to improve and how to measure +2. Captures a **baseline** snapshot of current KPI values +3. Runs **iterations** of the MEASURE → ANALYZE → HYPOTHESIZE → PROPOSE → APPLY → VERIFY → DECIDE → LOG cycle +4. Each iteration either **keeps** the improvement (ratchets forward) or **reverts** (tries again) +5. Stops when KPIs converge, budget is exhausted, or the user intervenes +6. Runs a **final adversarial review** to validate improvements are genuine + +## Profiles + +Profiles define the improvement domain. Three are bundled: + +### claude-code-usage +Analyzes Claude Code tool and skill usage patterns. Detects anti-patterns like bash grep instead of Grep tool, missing CLAUDE.md sections, unconfigured permissions. Suggests configuration improvements. + +**KPIs:** tool_efficiency, search_precision, config_completeness, skill_utilization +**Mutates:** `.claude/CLAUDE.md`, `.claude/settings.json` +**Best for:** Optimizing your Claude Code workflow + +### code-refactoring +Recursively improves code quality metrics. Applies safe, behavior-preserving refactorings one at a time — extract functions, reduce complexity, eliminate duplication, split large files. 
+
+**KPIs:** cyclomatic_complexity, duplication_ratio, file_size_compliance
+**Mutates:** Source files in scope (tests are immutable)
+**Best for:** Cleaning up a codebase area
+
+### process-improvement
+Guides you through designing and running kaizen improvement loops for business processes. Uses PDCA, 5 Whys, value stream mapping, and other lean methodologies.
+
+**KPIs:** User-defined (guided during setup)
+**Mutates:** Process documents, SOPs, checklists
+**Best for:** Operational and workflow improvements
+
+### Custom Profiles
+Create your own with `/kaizen-profile-designer` or by copying the profile template.
+
+## Strategies
+
+- **Greedy** — single KPI, pure hill-climbing. Keep if improved, revert if not.
+- **Multi-objective** — Pareto dominance. Keep only if no KPI regressed AND at least one improved.
+
+## Autonomy Levels
+
+- **supervised** — pause for human approval at every proposal
+- **autonomous** — run until convergence or budget (no human intervention)
+- **hybrid(N)** — autonomous for N iterations, then checkpoint
+
+## Audit Trail
+
+Every run creates a structured audit trail in `.kaizen/runs/{run-id}/`:
+
+```
+manifest.json — run configuration
+baseline.json — initial KPI values
+measure.py — auto-generated measurement tool
+iterations/NNN/ — per-iteration data (measurement, analysis, proposal, diff, decision)
+adversarial-review.md — final review
+summary.json — aggregate results and KPI improvement
+```
+
+Use `/kaizen-history` to browse the audit trail.
+
+## Setup
+
+### Required: Sequential Thinking MCP
+
+The kaizen engine requires Sequential Thinking MCP for loop orchestration.
+
+**Installation:**
+
+Add to your `.claude/settings.json` or MCP configuration:
+
+```json
+{
+  "mcpServers": {
+    "sequential-thinking": {
+      "command": "npx",
+      "args": ["-y", "@modelcontextprotocol/server-sequential-thinking"]
+    }
+  }
+}
+```
+
+## Troubleshooting
+
+| Issue | Solution |
+|-------|----------|
+| "Sequential Thinking MCP not found" | Install the MCP server (see Setup above) |
+| Measurement tool fails | Check Python/TS runtime is installed; read the error in the audit trail |
+| All iterations revert | The epsilon may be too high; the scope may be too narrow; try a different approach |
+| Context window exhaustion | The engine compacts between iterations; reduce iteration budget if needed |
+| KPIs don't improve | Check if the measurement tool is correct; review the adversarial review output |
diff --git a/kaizen/commands/kaizen-history.md b/kaizen/commands/kaizen-history.md
new file mode 100644
index 0000000..bad5338
--- /dev/null
+++ b/kaizen/commands/kaizen-history.md
@@ -0,0 +1,51 @@
+---
+description: "Show improvement run history and KPI trends for a kaizen profile"
+---
+
+# /kaizen-history
+
+View improvement history, KPI trends, and run details from past kaizen loops.
+
+## Usage
+
+```
+/kaizen-history [profile-name] [--run <run-id>]
+```
+
+## Arguments
+
+| Argument | Required | Description |
+|----------|----------|-------------|
+| `profile-name` | No | Filter history to a specific profile. If omitted, shows all profiles. |
+| `--run <run-id>` | No | Drill into a specific run for detailed iteration-by-iteration data. |
+
+## Examples
+
+```
+/kaizen-history                                         # Overview of all profiles
+/kaizen-history claude-code-usage                       # History for a specific profile
+/kaizen-history --run 2026-03-23-claude-code-usage-001  # Detailed run view
+```
+
+## What It Shows
+
+### Overview (no arguments)
+- All profiles with run counts
+- Last run date and best KPI improvement per profile
+- Active vs converged status
+
+### Profile History (with profile name)
+- KPI trajectory table across all runs
+- Cumulative improvement from first baseline to latest
+- Trend analysis (accelerating, steady, diminishing returns)
+- Adversarial review status per run
+
+### Run Detail (with --run)
+- Iteration-by-iteration decision log
+- KPI deltas per iteration
+- Proposals that were kept
+- Adversarial review findings
+
+## Invoke
+
+Load and activate the `kaizen-report` skill with the provided arguments.
diff --git a/kaizen/commands/kaizen.md b/kaizen/commands/kaizen.md
new file mode 100644
index 0000000..f674ef5
--- /dev/null
+++ b/kaizen/commands/kaizen.md
@@ -0,0 +1,56 @@
+---
+description: "Run a kaizen improvement loop with a specified profile"
+---
+
+# /kaizen
+
+Run a recursive improvement loop using the kaizen engine.
+
+## Usage
+
+```
+/kaizen [profile-name] [--scope <path>] [--budget <n>] [--autonomy <level>]
+```
+
+## Arguments
+
+| Argument | Required | Description |
+|----------|----------|-------------|
+| `profile-name` | No | Name of a bundled profile or path to a custom PROFILE.md. If omitted, shows available profiles and asks the user to choose. |
+| `--scope <path>` | No | Override the default mutation targets. Narrows the improvement scope to the specified path. |
+| `--budget <n>` | No | Override the iteration budget for this run. |
+| `--autonomy <level>` | No | Override autonomy level: `supervised`, `autonomous`, or `hybrid(N)`. |
+
+## Available Profiles
+
+| Profile | Description | Default Autonomy |
+|---------|-------------|------------------|
+| `claude-code-usage` | Analyze and improve Claude Code tool/skill usage | supervised |
+| `code-refactoring` | Recursively improve code quality metrics | hybrid(3) |
+| `process-improvement` | Design kaizen loops for business processes | supervised |
+
+## Examples
+
+```
+/kaizen claude-code-usage
+/kaizen code-refactoring --scope src/api/
+/kaizen process-improvement
+/kaizen ./my-custom-profile/PROFILE.md --budget 5
+```
+
+## What Happens
+
+1. **BOOTSTRAP** — loads the profile, collects data sources, scaffolds measurement tools, captures baseline
+2. **ITERATION LOOP** — repeats: MEASURE → ANALYZE → HYPOTHESIZE → PROPOSE → APPLY → VERIFY → DECIDE → LOG
+3. **FINAL REVIEW** — adversarial review of all changes and KPI integrity
+4. **REPORT** — summary of improvements, KPI results, and recommendations
+
+The audit trail is written to `.kaizen/runs/` (project-level) or `~/.kaizen/runs/` (user-level).
+
+## Prerequisites
+
+- **~~sequential-thinking** MCP connector must be configured. See the kaizen README for setup instructions.
+
+## Invoke
+
+Load and activate the `kaizen-engine` skill with the provided arguments.
diff --git a/kaizen/profiles/claude-code-usage/PROFILE.md b/kaizen/profiles/claude-code-usage/PROFILE.md new file mode 100644 index 0000000..7d9eff2 --- /dev/null +++ b/kaizen/profiles/claude-code-usage/PROFILE.md @@ -0,0 +1,175 @@ +--- +name: claude-code-usage +description: "Analyze and improve Claude Code tool and skill usage patterns within a project or globally" +version: 1.0.0 + +strategy: multi-objective +autonomy: supervised +iteration_budget: 10 +convergence: + epsilon: 0.03 + patience: 3 + +initial_state: + capture_strategy: automatic + sources: + - type: session_transcripts + path: "~/.claude/projects/*/sessions/" + description: "Claude Code session conversation logs" + - type: config + path: ".claude/" + description: "Project-level Claude Code configuration (CLAUDE.md, settings.json)" + - type: git_history + command: "git log --oneline -100" + description: "Recent commit activity for context on what work was done" + - type: memory + path: "~/.claude/projects/*/memory/" + description: "Project memory files with learned preferences and feedback" + - type: config + path: "~/.claude/settings.json" + description: "Global Claude Code settings and permissions" + +measurement: + tool_generation: true + language: python + +kpis: + - name: tool_efficiency + description: "Ratio of dedicated tool usage (Read, Grep, Glob, Edit, Write) vs bash fallback equivalents (cat, grep, rg, sed, awk, echo). Higher is better." + direction: maximize + unit: ratio + measurement_method: automated + formula: "dedicated_tool_calls / (dedicated_tool_calls + bash_fallback_calls)" + - name: search_precision + description: "Average number of search operations (Grep, Glob, or bash find/grep) needed to locate a target file or code pattern. Lower is better." + direction: minimize + unit: count + measurement_method: automated + formula: "total_search_operations / unique_search_targets_found" + - name: config_completeness + description: "Coverage of recommended Claude Code configurations — permissions, rules, memory, CLAUDE.md sections. Higher is better." + direction: maximize + unit: percentage + measurement_method: automated + formula: "configured_items / recommended_items * 100" + - name: skill_utilization + description: "Ratio of installed skills that were actually triggered in recent sessions vs total installed skills. Higher means the skill portfolio is well-curated." + direction: maximize + unit: ratio + measurement_method: automated + formula: "triggered_skills / installed_skills" + +mutation_targets: + defaults: + - path: ".claude/CLAUDE.md" + description: "Project instructions — add tool usage conventions, search strategies" + - path: ".claude/settings.json" + description: "Permissions and tool configuration" + - path: ".claude/settings.local.json" + description: "Local settings overrides" + immutable: + - path: "**/*.ts" + - path: "**/*.js" + - path: "**/*.py" + - path: "**/*.go" + - path: "**/*.rs" + - path: "tests/**" + - path: ".git/**" + - path: "node_modules/**" + +connectors: + required: + - "~~sequential-thinking" + optional: + - "~~memory" +--- + +# Claude Code Usage Improvement Instructions + +## MEASURE Phase + +Analyze recent Claude Code session transcripts to extract tool usage statistics: + +1. **Scan session files** — look for tool invocation patterns in conversation transcripts +2. 
**Classify tool calls** — categorize each tool usage as: + - **Dedicated tool**: Read, Write, Edit, Grep, Glob, Agent, Bash (for system commands only) + - **Bash fallback**: bash commands that duplicate dedicated tool functionality: + - `cat`, `head`, `tail` → should be `Read` + - `grep`, `rg` (via bash) → should be `Grep` + - `find`, `ls` (for file search) → should be `Glob` + - `sed`, `awk` (for file editing) → should be `Edit` + - `echo >`, `cat <<` (for file creation) → should be `Write` +3. **Count search operations** — group sequential searches for the same target +4. **Inventory installed skills** — list all skills in `~/.claude/plugins/` and project plugins +5. **Check config coverage** — compare current `.claude/` configuration against recommended items: + - CLAUDE.md exists and has project-specific content + - settings.json has appropriate allowedTools + - Memory files exist and are actively used + - Rules directory has relevant conventions + +`Read references/tool-taxonomy.md` for the complete classification guide. + +## ANALYZE Phase + +Compare current tool usage ratios against industry best practices: + +- **tool_efficiency < 0.5**: Significant reliance on bash fallbacks. Look for patterns — is it a specific category (search, read, edit) or across the board? +- **tool_efficiency 0.5-0.8**: Moderate. Focus on the most frequent fallback category. +- **tool_efficiency > 0.8**: Good. Look for subtle improvements. + +- **search_precision > 5**: Too many searches per target. Likely missing proper glob patterns or searching too broadly. +- **search_precision 2-5**: Average. Room for improvement with better search strategies. +- **search_precision < 2**: Efficient. Check if this is genuine or if search targets are too easy. + +- **config_completeness < 50%**: Basic setup. Many recommended configurations missing. +- **config_completeness 50-80%**: Partial. Focus on the highest-impact missing items. +- **config_completeness > 80%**: Well-configured. Look for fine-tuning opportunities. + +- **skill_utilization < 0.3**: Many dormant skills. Portfolio needs pruning or the user needs guidance. +- **skill_utilization 0.3-0.7**: Moderate. Check if dormant skills are relevant to current work. +- **skill_utilization > 0.7**: Well-curated portfolio. + +`Read references/anti-patterns.md` for common inefficiency patterns and their signatures. + +## HYPOTHESIZE Phase + +Common root causes for poor Claude Code usage: + +1. **Habit patterns** — user or Claude defaults to bash because it's familiar, not because it's better +2. **Permission gaps** — dedicated tools not in allowedTools, forcing bash workarounds +3. **CLAUDE.md gaps** — project instructions don't mention preferred tool usage patterns +4. **Missing skills** — relevant skills are available but not installed +5. **Over-installed skills** — too many skills create noise and reduce triggering precision +6. 
**Search strategy gaps** — no documented file organization conventions, leading to broad searches + +## PROPOSE Phase + +Appropriate changes for this profile: + +- **Add tool usage conventions to CLAUDE.md** — e.g., "Always use Grep instead of bash grep for code search" +- **Update allowedTools in settings.json** — grant permissions for frequently-used dedicated tools +- **Add search strategy hints to CLAUDE.md** — document the project structure so searches are targeted +- **Recommend skill installation/removal** — suggest installing relevant skills or removing dormant ones + +**Constraints:** +- NEVER modify source code files — this profile only touches Claude Code configuration +- Changes should be conservative — one config change per iteration +- Prefer CLAUDE.md additions over settings.json changes (more visible, easier to review) +- Provide the user with context for why the change helps (reference the specific anti-pattern) + +## APPLY Phase + +When modifying CLAUDE.md: +- Add new sections at the end, don't reorganize existing content +- Use clear headings that indicate the content was added by kaizen + +When modifying settings.json: +- Validate JSON syntax after changes +- Preserve all existing settings — only add or modify, never remove + +## VERIFY Phase + +After applying changes, re-run the measurement tool. Additionally: +- Verify that modified configuration files are syntactically valid +- Check that no existing functionality was broken by the config change +- Note that tool_efficiency improvements may not be immediately visible (they affect future sessions) diff --git a/kaizen/profiles/claude-code-usage/references/anti-patterns.md b/kaizen/profiles/claude-code-usage/references/anti-patterns.md new file mode 100644 index 0000000..7fc2a5a --- /dev/null +++ b/kaizen/profiles/claude-code-usage/references/anti-patterns.md @@ -0,0 +1,83 @@ +# Anti-Patterns in Claude Code Usage + +## Search Anti-Patterns + +### AP-1: Shotgun Search +**Pattern:** Multiple broad searches before finding the target +**Signature:** 3+ `Grep` or `Glob` calls with different patterns for the same target +**Root cause:** Unclear project structure conventions in CLAUDE.md +**Fix:** Add file organization section to CLAUDE.md with directory purpose and naming conventions + +### AP-2: Bash Grep Habit +**Pattern:** Using `bash grep -r` instead of `Grep` tool +**Signature:** `Bash` tool with `grep` or `rg` commands for content search +**Root cause:** Tool permissions not configured; old habit from manual CLI use +**Fix:** Add `Grep` to allowedTools; add "prefer Grep over bash grep" to CLAUDE.md conventions + +### AP-3: Find Instead of Glob +**Pattern:** Using `bash find` instead of `Glob` tool +**Signature:** `Bash` tool with `find . 
-name` commands +**Root cause:** Unfamiliarity with Glob's pattern syntax +**Fix:** Add glob pattern examples to CLAUDE.md; document common search patterns + +## Reading Anti-Patterns + +### AP-4: Cat for Reading +**Pattern:** Using `bash cat` instead of `Read` tool +**Signature:** `Bash` tool with `cat`, `head`, or `tail` commands for file reading +**Root cause:** Habit pattern; Read tool may not be in allowedTools +**Fix:** Add Read to allowedTools; add note to CLAUDE.md + +### AP-5: Excessive Full-File Reads +**Pattern:** Reading entire large files when only a section is needed +**Signature:** `Read` calls without `offset`/`limit` on files >500 lines +**Root cause:** Not knowing the file structure; missing documentation +**Fix:** Add file size expectations and key section locations to CLAUDE.md + +## Editing Anti-Patterns + +### AP-6: Sed for Editing +**Pattern:** Using `bash sed` instead of `Edit` tool +**Signature:** `Bash` tool with `sed -i` or `sed -e` commands +**Root cause:** Complex regex edits that feel natural in sed; Edit tool not trusted +**Fix:** Document Edit tool capabilities in CLAUDE.md; use `replace_all` for bulk replacements + +### AP-7: Write Instead of Edit +**Pattern:** Using `Write` to overwrite a file when `Edit` would be more targeted +**Signature:** `Write` tool on an existing file for small changes +**Root cause:** Simpler mental model (replace entire file vs find-and-replace) +**Fix:** Add convention to CLAUDE.md: "Use Edit for modifications, Write only for new files" + +## Configuration Anti-Patterns + +### AP-8: Missing CLAUDE.md +**Pattern:** No project-level CLAUDE.md or empty CLAUDE.md +**Signature:** CLAUDE.md doesn't exist or has only boilerplate +**Root cause:** Never set up; unclear what to include +**Fix:** Create CLAUDE.md with project structure, conventions, and key commands + +### AP-9: Over-Broad Permissions +**Pattern:** Using `dangerouslySkipPermissions` or overly broad allowedTools +**Signature:** Settings that bypass the permission system +**Root cause:** Permission prompts felt slow; quick fix applied +**Fix:** Configure specific allowedTools for commonly used tools; remove dangerous overrides + +### AP-10: Dormant Skills +**Pattern:** Many installed skills that never trigger +**Signature:** skill_utilization ratio < 0.3 +**Root cause:** Installed "just in case" but not relevant to current work +**Fix:** Audit installed skills; remove or reconfigure dormant ones; add trigger phrases to CLAUDE.md + +## Agent Anti-Patterns + +### AP-11: Sequential When Parallel +**Pattern:** Running agent tasks one at a time when they could run in parallel +**Signature:** Multiple sequential `Agent` dispatches with no dependencies between them +**Root cause:** Not aware of parallel dispatch capability +**Fix:** Add parallel agent patterns to CLAUDE.md + +### AP-12: Agent for Simple Tasks +**Pattern:** Dispatching an agent for a task that could be done with a single tool call +**Signature:** `Agent` dispatch followed by a single Read/Grep/Glob inside the agent +**Root cause:** Over-reliance on agent abstraction +**Fix:** Add guidance on when to use agents vs direct tool calls diff --git a/kaizen/profiles/claude-code-usage/references/best-practices.md b/kaizen/profiles/claude-code-usage/references/best-practices.md new file mode 100644 index 0000000..5da391b --- /dev/null +++ b/kaizen/profiles/claude-code-usage/references/best-practices.md @@ -0,0 +1,86 @@ +# Claude Code Best Practices — Configuration Reference + +## Recommended CLAUDE.md Sections + +A 
well-configured CLAUDE.md should include: + +1. **Project Overview** — what the project does, tech stack, architecture +2. **Directory Structure** — purpose of each top-level directory +3. **Development Workflow** — how to build, test, and deploy +4. **Tool Usage Conventions** — preferred tools for common operations +5. **Coding Standards** — naming, formatting, patterns to follow +6. **Testing Requirements** — coverage targets, test patterns + +## Recommended settings.json Configuration + +### allowedTools + +Tools that should typically be allowed for productive development: + +```json +{ + "allowedTools": [ + "Read", + "Write", + "Edit", + "Grep", + "Glob", + "Bash(git *)", + "Bash(npm *)", + "Bash(npx *)" + ] +} +``` + +Adjust based on tech stack (add `pip`, `cargo`, `go`, `docker` as needed). + +### Model Configuration + +- Use `sonnet` for day-to-day development +- Switch to `opus` for complex architectural decisions +- Use `haiku` for lightweight agents + +## Recommended Rules + +The `.claude/rules/` directory should contain: + +- `coding-style.md` — language-specific conventions +- `git-workflow.md` — commit message format, branch naming +- `testing.md` — coverage requirements, TDD workflow +- `security.md` — input validation, secret handling + +## Recommended Memory Usage + +Active memory should capture: + +- User role and expertise level +- Project-specific feedback (corrections, preferences) +- External resource references (issue trackers, dashboards) + +## Configuration Completeness Checklist + +The `config_completeness` KPI measures coverage of these items: + +| Item | Category | Weight | +|------|----------|--------| +| CLAUDE.md exists | Essential | 2 | +| CLAUDE.md has project overview | Essential | 2 | +| CLAUDE.md has directory structure | Essential | 2 | +| CLAUDE.md has dev workflow | Important | 1 | +| CLAUDE.md has tool conventions | Important | 1 | +| settings.json exists | Essential | 2 | +| allowedTools configured | Essential | 2 | +| Rules directory exists | Important | 1 | +| At least 1 rule file | Important | 1 | +| Memory directory exists | Optional | 1 | +| At least 1 memory file | Optional | 1 | +| .gitignore includes .claude/ | Important | 1 | + +**Score:** `sum(present_items * weight) / sum(all_items * weight) * 100` + +## Skill Portfolio Guidelines + +- **Install only skills relevant to current work** — dormant skills add context overhead +- **Review skill descriptions** — ensure trigger phrases match your typical requests +- **Prefer specific over generic** — a language-specific reviewer beats a generic one +- **Remove after project ends** — project-specific skills should be uninstalled when done diff --git a/kaizen/profiles/claude-code-usage/references/tool-taxonomy.md b/kaizen/profiles/claude-code-usage/references/tool-taxonomy.md new file mode 100644 index 0000000..03bb44f --- /dev/null +++ b/kaizen/profiles/claude-code-usage/references/tool-taxonomy.md @@ -0,0 +1,87 @@ +# Tool Taxonomy — Dedicated Tools vs Bash Fallbacks + +## Dedicated Tools (Preferred) + +These are Claude Code's built-in tools designed for specific operations. They provide better user experience, are reviewable, and integrate with the permission system. 
+ +| Tool | Purpose | When to Use | +|------|---------|-------------| +| `Read` | Read file contents | Any file reading operation | +| `Write` | Create new files | Creating new files or complete rewrites | +| `Edit` | Modify existing files | Targeted text replacements | +| `Grep` | Search file contents | Pattern matching across files | +| `Glob` | Find files by name | File discovery by path pattern | +| `Agent` | Dispatch subagents | Complex multi-step research or parallel tasks | +| `Bash` | System commands | ONLY for operations that have no dedicated tool equivalent | + +## Bash Fallback Patterns (Anti-Patterns) + +These are bash commands used via the `Bash` tool that duplicate dedicated tool functionality. Each has a preferred dedicated tool alternative. + +### File Reading Fallbacks + +| Bash Command | Dedicated Alternative | Detection Pattern | +|-------------|----------------------|-------------------| +| `cat {file}` | `Read` | `Bash` tool with `cat` command | +| `head -n {N} {file}` | `Read` with `limit` parameter | `Bash` tool with `head` command | +| `tail -n {N} {file}` | `Read` with `offset` parameter | `Bash` tool with `tail` command | +| `less {file}` | `Read` | `Bash` tool with `less` command | + +### Content Search Fallbacks + +| Bash Command | Dedicated Alternative | Detection Pattern | +|-------------|----------------------|-------------------| +| `grep {pattern} {files}` | `Grep` | `Bash` tool with `grep` command | +| `grep -r {pattern} .` | `Grep` with `path` parameter | `Bash` tool with recursive grep | +| `rg {pattern}` | `Grep` | `Bash` tool with `rg` command | +| `ag {pattern}` | `Grep` | `Bash` tool with `ag` command | +| `ack {pattern}` | `Grep` | `Bash` tool with `ack` command | + +### File Discovery Fallbacks + +| Bash Command | Dedicated Alternative | Detection Pattern | +|-------------|----------------------|-------------------| +| `find . -name {pattern}` | `Glob` | `Bash` tool with `find` command | +| `ls {pattern}` | `Glob` | `Bash` tool with `ls` for file discovery | +| `fd {pattern}` | `Glob` | `Bash` tool with `fd` command | + +### File Editing Fallbacks + +| Bash Command | Dedicated Alternative | Detection Pattern | +|-------------|----------------------|-------------------| +| `sed -i {expr} {file}` | `Edit` | `Bash` tool with `sed` command | +| `awk '{script}' {file}` | `Edit` | `Bash` tool with `awk` for file modification | +| `perl -pi -e {expr} {file}` | `Edit` | `Bash` tool with `perl` for editing | + +### File Creation Fallbacks + +| Bash Command | Dedicated Alternative | Detection Pattern | +|-------------|----------------------|-------------------| +| `echo "..." > {file}` | `Write` | `Bash` tool with echo redirect | +| `cat << 'EOF' > {file}` | `Write` | `Bash` tool with heredoc redirect | +| `printf "..." 
> {file}` | `Write` | `Bash` tool with printf redirect | + +## Legitimate Bash Usage + +These bash commands have NO dedicated tool equivalent and are appropriate: + +- `git` commands (status, log, diff, commit, push, branch) +- `npm`, `yarn`, `pnpm` package management +- `pip`, `poetry`, `uv` package management +- `cargo`, `go`, `dotnet` build tools +- `docker`, `docker-compose` container management +- `curl`, `wget` HTTP requests (when not using WebFetch) +- `make`, `cmake` build systems +- Process management (`ps`, `kill`, `lsof`) +- System information (`uname`, `whoami`, `env`) +- Test runners (`pytest`, `jest`, `cargo test`) +- Linters and formatters (`eslint`, `prettier`, `black`, `rustfmt`) + +## Measurement Logic + +To calculate `tool_efficiency`: + +1. Count all tool invocations in session transcripts +2. For each `Bash` invocation, check if the command matches a fallback pattern +3. Classify: `dedicated_tool_calls` = non-Bash tools + legitimate Bash uses; `bash_fallback_calls` = Bash invocations matching fallback patterns +4. Ratio = `dedicated_tool_calls / (dedicated_tool_calls + bash_fallback_calls)` diff --git a/kaizen/profiles/code-refactoring/PROFILE.md b/kaizen/profiles/code-refactoring/PROFILE.md new file mode 100644 index 0000000..920ce10 --- /dev/null +++ b/kaizen/profiles/code-refactoring/PROFILE.md @@ -0,0 +1,152 @@ +--- +name: code-refactoring +description: "Recursively improve code quality metrics for a target codebase area" +version: 1.0.0 + +strategy: multi-objective +autonomy: hybrid(3) +iteration_budget: 10 +convergence: + epsilon: 0.05 + patience: 3 + +initial_state: + capture_strategy: automatic + sources: + - type: config + path: "." + description: "Project root — scan for language, framework, and existing quality config" + - type: git_history + command: "git log --oneline -50 --diff-filter=M" + description: "Recently modified files to focus refactoring effort" + +measurement: + tool_generation: true + language: python + +kpis: + - name: cyclomatic_complexity + description: "Average cyclomatic complexity per function across files in scope. Lower indicates simpler, more maintainable code." + direction: minimize + unit: count + measurement_method: automated + formula: "sum(function_complexity) / count(functions)" + - name: duplication_ratio + description: "Percentage of code that exists as duplicated blocks (>= 6 lines identical). Lower is better." + direction: minimize + unit: percentage + measurement_method: automated + formula: "duplicated_lines / total_lines * 100" + - name: file_size_compliance + description: "Percentage of files in scope that are under 400 lines. Higher means better modularization." + direction: maximize + unit: percentage + measurement_method: automated + formula: "files_under_400_lines / total_files * 100" + +mutation_targets: + defaults: + - path: "src/**" + description: "Source code files — primary refactoring target" + - path: "lib/**" + description: "Library code — secondary target" + immutable: + - path: "tests/**" + - path: "test/**" + - path: "spec/**" + - path: "__tests__/**" + - path: ".git/**" + - path: "node_modules/**" + - path: "vendor/**" + - path: "*.lock" + - path: "package-lock.json" + - path: "yarn.lock" + +connectors: + required: + - "~~sequential-thinking" + optional: [] +--- + +# Code Refactoring Improvement Instructions + +## MEASURE Phase + +Calculate code quality metrics for all files in the mutation scope: + +1. **Identify language** — detect the primary programming language(s) from file extensions +2. 
**Calculate cyclomatic complexity** — use language-appropriate tools: + - Python: `radon cc` or AST-based analysis + - JavaScript/TypeScript: count decision points (if, else, for, while, switch cases, &&, ||, ?:) + - Go: count branching statements + - For other languages: count `if`, `for`, `while`, `switch`, `case`, `&&`, `||`, `?:`, `catch` tokens +3. **Detect duplicated blocks** — find sequences of >= 6 identical non-blank lines across files +4. **Measure file sizes** — count total lines per file (including blank lines and comments) + +`Read references/quality-metrics.md` for detailed calculation methods. + +## ANALYZE Phase + +Interpret the metrics in context: + +- **average cyclomatic_complexity > 15**: Code is highly complex. Focus on the functions with complexity > 20 first. +- **cyclomatic_complexity 8-15**: Moderate. Target functions above 10 for extraction or simplification. +- **cyclomatic_complexity < 8**: Good baseline. Look for subtle improvements. + +- **duplication_ratio > 10%**: Significant duplication. Identify the most duplicated patterns. +- **duplication_ratio 3-10%**: Moderate. Focus on semantic duplication (same logic, different names). +- **duplication_ratio < 3%**: Low. Check for near-duplicates that could be parameterized. + +- **file_size_compliance < 70%**: Many oversized files. Identify files > 400 lines and their natural split points. +- **file_size_compliance 70-90%**: Getting there. Focus on the largest remaining files. +- **file_size_compliance > 90%**: Well-modularized. + +Look for correlations: large files often have high complexity and duplication. + +## HYPOTHESIZE Phase + +Common root causes for poor code quality: + +1. **God objects/files** — a single file that does too much +2. **Copy-paste programming** — duplicated code that should be extracted +3. **Deep nesting** — nested conditionals instead of early returns +4. **Feature envy** — functions that operate mostly on another module's data +5. **Primitive obsession** — using raw types instead of domain objects +6. **Long parameter lists** — functions taking >4 parameters + +## PROPOSE Phase + +`Read references/refactoring-catalog.md` + +Appropriate refactoring patterns: + +- **Extract function** — pull a block of code into a named function +- **Extract module/file** — split a large file by responsibility +- **Replace conditional with polymorphism** — reduce switch/if chains +- **Introduce early return** — reduce nesting depth +- **Extract common code** — deduplicate by creating shared utilities +- **Rename for clarity** — improve naming to reduce need for comments + +**Constraints:** +- ONE refactoring per iteration — atomic changes that can be verified independently +- MUST NOT change behavior — all refactorings are behavior-preserving +- MUST NOT touch test files — tests validate that behavior is preserved +- Prefer the smallest refactoring that meaningfully improves a KPI +- If the user has tests, suggest running them after APPLY + +## APPLY Phase + +When applying refactoring changes: +- Read the full file before editing (understand context) +- Use `Edit` for targeted changes, `Write` only for new files (extractions) +- Preserve existing formatting style (indentation, quotes, semicolons) +- Update imports/requires when extracting to new files +- Do NOT add new comments, type annotations, or documentation (scope creep) + +## VERIFY Phase + +After applying the refactoring: +1. Re-run the measurement tool to check KPIs +2. If the project has a build/compile step, verify it still passes +3. 
If the project has tests, suggest the user run them (supervised mode) +4. Check that no new linting errors were introduced diff --git a/kaizen/profiles/code-refactoring/references/quality-metrics.md b/kaizen/profiles/code-refactoring/references/quality-metrics.md new file mode 100644 index 0000000..c5688f8 --- /dev/null +++ b/kaizen/profiles/code-refactoring/references/quality-metrics.md @@ -0,0 +1,90 @@ +# Quality Metrics — Calculation Methods + +## Cyclomatic Complexity + +Cyclomatic complexity measures the number of linearly independent paths through a function. + +### Calculation + +For each function, count: +- 1 (base path) +- +1 for each `if`, `elif`, `else if` +- +1 for each `for`, `while`, `do-while` +- +1 for each `case` in switch/match +- +1 for each `catch`/`except` +- +1 for each `&&` or `||` in conditions +- +1 for each `?:` (ternary) +- +1 for each `?.` (optional chaining with branching effect) + +### Thresholds + +| Complexity | Assessment | Action | +|-----------|------------|--------| +| 1-5 | Low | No action needed | +| 6-10 | Moderate | Consider simplification | +| 11-20 | High | Refactor recommended | +| 21+ | Very high | Must refactor | + +### Language-Specific Tools + +| Language | Tool | Command | +|----------|------|---------| +| Python | radon | `radon cc -a -s {path}` | +| JavaScript/TypeScript | eslint | `eslint --rule 'complexity: [warn, 10]' {path}` | +| Go | gocyclo | `gocyclo {path}` | +| Java | PMD | `pmd check -d {path} -R rulesets/java/metrics.xml` | + +When language-specific tools are unavailable, use token counting (see Calculation above). + +## Duplication Detection + +### Algorithm + +1. Normalize code: remove blank lines, trim whitespace, normalize string literals +2. Sliding window: compare every N-line block (N=6) against all other blocks +3. Count matches: identical normalized blocks are duplicates +4. Calculate ratio: `duplicated_lines / total_lines * 100` + +### Thresholds + +| Ratio | Assessment | +|-------|------------| +| 0-3% | Low duplication | +| 3-10% | Moderate — review identified blocks | +| 10-20% | High — systematic deduplication needed | +| 20%+ | Critical — copy-paste culture | + +### What Counts as Duplication + +- **Exact duplication**: Identical code blocks (after whitespace normalization) +- **Near duplication**: Blocks that differ only in variable names or literals (harder to detect, note but don't count) +- **Structural duplication**: Same control flow with different operations (note but don't count in automated measurement) + +### What Doesn't Count + +- Import/require statements (naturally repeated) +- Boilerplate required by the language (main functions, module exports) +- Test setup/teardown (tests are out of scope) + +## File Size + +### Measurement + +Count total lines per file. Do NOT exclude blank lines or comments for the primary metric (they contribute to cognitive load). 
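+
+To make the counting rule concrete, here is a minimal sketch of how a generated measurement tool might compute `file_size_compliance` under this rule (the 400-line threshold comes from the profile's KPI definition; the directory walk and extension filter are illustrative assumptions):
+
+```python
+from pathlib import Path
+
+SIZE_LIMIT = 400  # from the file_size_compliance KPI; 201-400 lines is still acceptable
+
+def file_size_compliance(root: str, exts: tuple[str, ...] = (".py",)) -> float:
+    """Percentage of files in scope at or under SIZE_LIMIT total lines.
+
+    Counts every line, including blank lines and comments.
+    """
+    files = [p for p in Path(root).rglob("*") if p.is_file() and p.suffix in exts]
+    if not files:
+        return 100.0  # nothing in scope counts as fully compliant
+    under = sum(
+        1 for p in files
+        if sum(1 for _ in p.open(encoding="utf-8", errors="replace")) <= SIZE_LIMIT
+    )
+    return under / len(files) * 100
+```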
+ +### Thresholds + +| Lines | Assessment | Recommendation | +|-------|------------|----------------| +| 1-200 | Small | Ideal size | +| 201-400 | Medium | Acceptable | +| 401-800 | Large | Consider splitting | +| 801+ | Very large | Must split | + +### Split Point Identification + +Look for natural boundaries: +- Multiple class definitions in one file +- Groups of functions with different concerns +- Sections separated by comment headers +- Functions that are only called by each other (form a cohesive group) diff --git a/kaizen/profiles/code-refactoring/references/refactoring-catalog.md b/kaizen/profiles/code-refactoring/references/refactoring-catalog.md new file mode 100644 index 0000000..587e76c --- /dev/null +++ b/kaizen/profiles/code-refactoring/references/refactoring-catalog.md @@ -0,0 +1,97 @@ +# Refactoring Catalog — Safe Behavior-Preserving Patterns + +Each refactoring is atomic and behavior-preserving. Apply ONE per iteration. + +## Complexity Reducers + +### Extract Function +**When:** A block of code inside a function has a clear purpose +**Effect:** Reduces cyclomatic complexity of the parent function +**Steps:** +1. Identify the block and its inputs/outputs +2. Create a new function with a descriptive name +3. Move the block to the new function +4. Replace the block with a call to the new function +5. Pass inputs as parameters, return outputs + +### Introduce Early Return +**When:** Deep nesting from guard conditions +**Effect:** Reduces nesting depth and perceived complexity +**Steps:** +1. Identify guard conditions (null checks, validation, error cases) +2. Invert the condition and return/throw early +3. Remove the else branch and reduce indentation + +### Replace Conditional with Polymorphism +**When:** Multiple if/switch branches doing different things based on type +**Effect:** Eliminates branching, distributes logic to appropriate classes +**Steps:** +1. Identify the type-based branching +2. Create an interface/abstract class +3. Implement one concrete class per branch +4. Replace the conditional with a method call on the polymorphic object + +### Decompose Conditional +**When:** Complex boolean expression in a condition +**Effect:** Improves readability and reduces McCabe complexity +**Steps:** +1. Extract each part of the condition into a named boolean variable or function +2. Replace the complex expression with the named variables + +## Duplication Reducers + +### Extract Common Code +**When:** Same code block appears in 2+ places +**Effect:** Directly reduces duplication_ratio +**Steps:** +1. Identify the duplicated block +2. Create a shared function/utility +3. Replace all occurrences with calls to the shared function +4. Parameterize any differences between the occurrences + +### Pull Up Common Code +**When:** Subclasses/implementations share identical code +**Effect:** Reduces duplication in class hierarchies +**Steps:** +1. Identify the shared code in subclasses +2. Move it to the parent class +3. Remove the duplicated code from subclasses + +### Parameterize Method +**When:** Two methods do the same thing with slightly different values +**Effect:** Eliminates near-duplication +**Steps:** +1. Identify the differences between the two methods +2. Add parameters for the differing values +3. Merge into a single parameterized method +4. Update all callers + +## File Size Reducers + +### Extract Module +**When:** A file exceeds 400 lines and has identifiable sections +**Effect:** Directly improves file_size_compliance +**Steps:** +1. 
Identify cohesive groups of functions/classes +2. Create a new file for each group +3. Move the code to the new files +4. Update imports in the original file and all consumers +5. Re-export from the original file if it's a public API (backward compatibility) + +### Move Function to Caller's Module +**When:** A function in a large file is only used by one other module +**Effect:** Reduces file size, improves cohesion +**Steps:** +1. Identify functions with a single caller in another file +2. Move the function to the caller's file +3. Remove the export from the original file +4. Update imports + +## Safety Rules + +1. **Never change behavior** — the refactored code must produce identical outputs for identical inputs +2. **Never change public APIs** — function signatures, export lists, and type definitions must remain compatible +3. **Never touch tests** — tests are the safety net, not the target +4. **One refactoring per iteration** — compound changes are harder to verify and revert +5. **Read before editing** — always read the full file to understand context +6. **Preserve style** — match existing indentation, naming conventions, and formatting diff --git a/kaizen/profiles/process-improvement/PROFILE.md b/kaizen/profiles/process-improvement/PROFILE.md new file mode 100644 index 0000000..6d75423 --- /dev/null +++ b/kaizen/profiles/process-improvement/PROFILE.md @@ -0,0 +1,179 @@ +--- +name: process-improvement +description: "Design and facilitate kaizen improvement loops for business and operational processes" +version: 1.0.0 + +strategy: multi-objective +autonomy: supervised +iteration_budget: 5 +convergence: + epsilon: 0.05 + patience: 2 + +initial_state: + capture_strategy: hybrid + sources: + - type: user_provided + path: "" + description: "Process documentation — SOPs, workflow diagrams, checklists, runbooks" + - type: user_provided + path: "" + description: "Current metrics — cycle time, defect rate, throughput, satisfaction scores" + +measurement: + tool_generation: false + language: python + +kpis: + - name: primary_metric + description: "The primary process KPI defined by the user during setup (e.g., cycle time, defect rate, throughput). Direction and unit are configured during BOOTSTRAP." + direction: minimize + unit: custom + measurement_method: user-reported + formula: "User-defined — established during BOOTSTRAP based on the specific process" + - name: secondary_metric + description: "Optional secondary KPI to track trade-offs (e.g., if reducing cycle time, track quality). Direction and unit configured during BOOTSTRAP." + direction: maximize + unit: custom + measurement_method: user-reported + formula: "User-defined — established during BOOTSTRAP based on the specific process" + +mutation_targets: + defaults: + - path: ".docs/processes/" + description: "Process documentation, SOPs, and checklists" + - path: ".docs/workflows/" + description: "Workflow definitions and diagrams" + immutable: + - path: ".git/**" + - path: "src/**" + - path: "tests/**" + +connectors: + required: + - "~~sequential-thinking" + optional: + - "~~memory" +--- + +# Process Improvement Instructions + +This profile helps humans design and run kaizen improvement loops for business and operational processes. Unlike the code-focused profiles, this one works with human-reported metrics and process documents rather than code analysis. + +## BOOTSTRAP Special Handling + +This profile requires additional setup during BOOTSTRAP because KPIs are user-defined: + +1. 
**Ask the user** to describe the process they want to improve: + - What is the process? (name, purpose, scope) + - Who is involved? (roles, handoffs) + - What's the current pain point? (what's not working well) + +2. **Define KPIs together** with the user: + - What is the primary metric to optimize? (e.g., "time from request to delivery") + - What direction? (minimize/maximize) + - What unit? (hours, percentage, count, etc.) + - What is the current value? (baseline) + - Is there a secondary metric to track trade-offs? + +3. **Gather process documentation**: + - Ask the user to provide or point to existing SOPs, checklists, or workflow descriptions + - If no documentation exists, help the user document the current process (this becomes the baseline artifact) + +4. **Set the `.kaizen/` location**: + - If the process relates to a specific project: use `.kaizen/` at project root + - If the process is personal or cross-project: use `~/.kaizen/` + +## MEASURE Phase + +Since KPIs are user-reported: + +1. **Present the KPI definitions** to the user as a reminder +2. **Ask for current values**: + - "What is the current {primary_metric_name}? (in {unit})" + - "What is the current {secondary_metric_name}? (in {unit})" (if defined) +3. **Record values** with timestamp +4. **Ask for qualitative observations**: "Any notable changes or events since the last measurement?" + +`Read references/kpi-design-guide.md` if the user struggles to provide numeric values — help them define a measurement protocol. + +## ANALYZE Phase + +`Read references/kaizen-methodology.md` + +Analyze the user-reported data in context: + +1. **Quantitative analysis** — delta from baseline and previous iteration +2. **Qualitative analysis** — examine the user's observations for patterns +3. **Process document review** — read the current process documentation to identify: + - Steps with excessive handoffs + - Steps with waiting time (queues) + - Steps that add no value (inspections that catch nothing, approvals that never reject) + - Steps with high variability (sometimes fast, sometimes slow) + +4. **Apply kaizen lenses**: + - **Value Stream Analysis**: which steps add value vs waste? + - **5 Whys**: for each problem, ask "why" iteratively to find root cause + - **PDCA**: where are we in the Plan-Do-Check-Act cycle? + +## HYPOTHESIZE Phase + +Common root causes for process inefficiency: + +1. **Batching** — work waits in queues instead of flowing +2. **Over-processing** — steps that add more detail than needed +3. **Handoff friction** — information lost between people/teams +4. **Rework loops** — defects found late, requiring re-doing previous steps +5. **Waiting** — approvals, reviews, or dependencies that block flow +6. **Motion waste** — switching between tools, systems, or contexts +7. **Unclear ownership** — nobody knows who's responsible for what + +Apply the **5 Whys** technique: for each hypothesis, ask "why does this happen?" and trace to the root cause. 
+ +## PROPOSE Phase + +`Read references/facilitation-guide.md` + +Appropriate process improvements: + +- **Eliminate a non-value-adding step** — remove unnecessary approvals, inspections, or handoffs +- **Reduce batch size** — process smaller units more frequently +- **Automate a manual step** — identify steps that could be automated +- **Add a quality gate earlier** — catch defects before they propagate +- **Clarify a handoff** — define exact inputs/outputs between steps +- **Create a checklist** — standardize a variable step +- **Parallelize sequential steps** — identify steps that don't actually depend on each other + +**Format proposals as changes to process documents:** +- Update the SOP/checklist/workflow to reflect the proposed change +- Describe the change in human terms (not code) +- Include an implementation plan: who needs to do what, when + +**Constraints:** +- ONE process change per iteration — don't overwhelm the team +- MUST be implementable by the team (don't propose changes they can't control) +- Present with rationale — people need to understand why +- Consider cultural impact — process changes affect people +- ALWAYS pause for user approval (supervised mode is mandatory for this profile) + +## APPLY Phase + +For process improvement, "applying" means updating the process documentation: + +1. **Modify the SOP/checklist/workflow** — update the relevant document to reflect the proposed change +2. **Add implementation notes** — what the team needs to do differently +3. **Create a measurement protocol** — how to know if the change worked +4. **Set a review date** — when to measure the impact (typically 1-2 weeks for the next iteration) + +The user is responsible for actually implementing the process change in their organization. The engine updates the documentation and tracks progress. + +## VERIFY Phase + +Since process changes take time to show results: + +1. **Remind the user** that process improvements need time to take effect +2. **Ask for updated metrics** at the next iteration (possibly after days or weeks) +3. **Ask for qualitative feedback**: "How did the team respond to the change? Any unexpected effects?" +4. **Record both quantitative and qualitative data** + +This profile is inherently slower than code-focused profiles. Each iteration may span days or weeks. diff --git a/kaizen/profiles/process-improvement/references/facilitation-guide.md b/kaizen/profiles/process-improvement/references/facilitation-guide.md new file mode 100644 index 0000000..9d9957c --- /dev/null +++ b/kaizen/profiles/process-improvement/references/facilitation-guide.md @@ -0,0 +1,93 @@ +# Facilitation Guide for Process Improvement + +## Role of the Engine in Process Improvement + +Unlike code-focused profiles where the engine directly modifies files, the process improvement profile acts as a **facilitator**. The engine: + +1. **Structures the conversation** — guides the user through kaizen methodology +2. **Documents findings** — writes analysis, proposals, and decisions to the audit trail +3. **Maintains the process documents** — updates SOPs, checklists, and workflows +4. **Tracks progress** — records KPI measurements across iterations +5. **Provides methodology guidance** — applies 5 Whys, value stream analysis, etc. + +The engine does NOT implement process changes — the human team does. + +## Facilitation Protocol + +### Starting a Process Improvement Loop + +1. 
**Understand the process** — ask open-ended questions: + - "Walk me through the process step by step" + - "Where do things typically get stuck?" + - "What's the most frustrating part?" + +2. **Establish scope** — narrow to something improvable: + - "Let's focus on the part from [X] to [Y]" + - "That's a big process — which part has the most impact on your pain point?" + +3. **Define success** — make it concrete: + - "If we improve this, what would be different in 4 weeks?" + - "What number would make you confident this is working?" + +### During Each Iteration + +1. **Check in on implementation** — before measuring: + - "Were you able to implement the change from last time?" + - "Any obstacles or resistance from the team?" + +2. **Collect data** — be specific: + - "What was the [metric] this week? Just the number is fine." + - "Any notable events that affected the metric?" + +3. **Analyze together** — share findings and validate: + - "The data shows [X]. Does that match your experience?" + - "It looks like [change] had [effect]. Do you agree?" + +4. **Propose with buy-in** — don't dictate: + - "Based on the analysis, I see a few options: [A], [B], or [C]. Which feels most feasible for your team?" + - "This would require [effort]. Is that realistic right now?" + +### Between Iterations + +Process improvements take time. Between iterations: +- **Set expectations** — "We should measure again in [timeframe]" +- **Document the implementation plan** — what the team will do +- **Note risks** — what could go wrong, what to watch for + +## Handling Common Situations + +### The User Can't Provide a Number +Some metrics are hard to quantify. Help by: +- Breaking the metric into sub-components +- Suggesting proxy metrics +- Using ordinal scales (1-5) for subjective assessments +- Starting with "better/same/worse" and refining later + +### The Team Resists the Change +Process changes affect people. Address resistance by: +- Involving the team in the proposal (don't mandate) +- Starting with the smallest possible change +- Making it easy to revert if it doesn't work +- Framing as an experiment, not a policy change + +### The Metric Doesn't Move +After an iteration, the KPI is unchanged. This might mean: +- The change wasn't implemented fully +- Not enough time has passed to see effects +- The hypothesis was wrong (try a different approach) +- The metric isn't measuring the right thing + +### Multiple Competing Priorities +The user has several things they want to improve simultaneously: +- Focus on ONE metric per improvement loop +- Use a secondary metric only to detect trade-offs +- Suggest running separate loops for separate concerns +- Help prioritize: "Which improvement would create the most relief for your team?" + +## Tone and Approach + +- **Collaborative, not prescriptive** — "What do you think about..." not "You should..." +- **Curious, not judgmental** — "Why does this step exist?" 
not "This step is wasteful" +- **Patient** — process improvement is measured in weeks, not minutes +- **Pragmatic** — the best improvement is one the team will actually do +- **Encouraging** — celebrate small wins, they compound over time diff --git a/kaizen/profiles/process-improvement/references/kaizen-methodology.md b/kaizen/profiles/process-improvement/references/kaizen-methodology.md new file mode 100644 index 0000000..1086a5d --- /dev/null +++ b/kaizen/profiles/process-improvement/references/kaizen-methodology.md @@ -0,0 +1,101 @@ +# Kaizen Methodology Reference + +## Core Principles + +**Kaizen** (改善) means "change for better" in Japanese. It is a philosophy of continuous, incremental improvement involving everyone in the organization. + +### The 5 Kaizen Principles + +1. **Know your customer** — understand who benefits from the process +2. **Let it flow** — eliminate waste so work flows smoothly +3. **Go to gemba** — observe the actual process where it happens +4. **Empower people** — involve the people who do the work in improving it +5. **Be transparent** — make problems visible, track metrics openly + +## PDCA Cycle (Deming Cycle) + +The foundation of every kaizen iteration: + +### Plan +- Identify the problem or opportunity +- Analyze root causes (5 Whys, Ishikawa) +- Develop a hypothesis and proposed change +- Define success criteria and metrics + +### Do +- Implement the change on a small scale +- Document what was done +- Collect data during implementation + +### Check +- Compare results against the plan +- Analyze what worked and what didn't +- Look for unintended consequences + +### Act +- If successful: standardize the change (update SOPs, checklists) +- If unsuccessful: analyze why and try a different approach +- Document learnings for future iterations + +## 5S Methodology + +For workplace/process organization: + +| Step | Japanese | English | Application | +|------|----------|---------|-------------| +| 1 | Seiri | Sort | Remove unnecessary steps, tools, artifacts | +| 2 | Seiton | Set in order | Organize remaining elements for optimal flow | +| 3 | Seiso | Shine | Clean up processes, remove workarounds | +| 4 | Seiketsu | Standardize | Document the improved process | +| 5 | Shitsuke | Sustain | Maintain the improvement over time | + +## 7 Wastes (Muda) + +Look for these in any process: + +| Waste | Process Equivalent | Example | +|-------|-------------------|---------| +| Transport | Unnecessary handoffs | Forwarding emails through 3 people | +| Inventory | Work in progress queues | 50 tickets waiting for review | +| Motion | Context switching | Switching between 5 tools for one task | +| Waiting | Blocked work | Waiting for approvals, dependencies | +| Over-processing | Excessive detail | Writing 10-page reports nobody reads | +| Overproduction | Doing too much | Building features nobody asked for | +| Defects | Rework | Fixing mistakes from unclear requirements | + +## Root Cause Analysis + +### 5 Whys + +Start with the problem. Ask "Why?" 5 times to reach the root cause. + +**Example:** +1. Why is delivery slow? → Approvals take too long +2. Why do approvals take too long? → Approvers are overloaded +3. Why are approvers overloaded? → Every change needs senior approval +4. Why does every change need senior approval? → Policy was set after a major incident +5. Why hasn't the policy been updated? → Nobody reviewed it after the incident was resolved + +**Root cause:** Outdated approval policy that was appropriate for a crisis but not for normal operations. 
+ +### Ishikawa (Fishbone) Diagram + +Categorize causes into 6 areas: +- **People** — skills, training, availability +- **Process** — steps, sequence, handoffs +- **Technology** — tools, systems, automation +- **Materials** — inputs, data quality, templates +- **Environment** — culture, priorities, competing demands +- **Measurement** — metrics, feedback loops, visibility + +## Value Stream Mapping + +For each step in the process: + +1. **Name** the step +2. **Classify** as value-adding (VA), necessary non-value-adding (NNVA), or waste (W) +3. **Measure** processing time (how long the step takes when actively worked) +4. **Measure** lead time (how long from entering to leaving the step, including waiting) +5. **Calculate** efficiency: processing time / lead time + +**Target:** Improve flow efficiency by reducing the gap between processing time and lead time. diff --git a/kaizen/profiles/process-improvement/references/kpi-design-guide.md b/kaizen/profiles/process-improvement/references/kpi-design-guide.md new file mode 100644 index 0000000..2daf71a --- /dev/null +++ b/kaizen/profiles/process-improvement/references/kpi-design-guide.md @@ -0,0 +1,91 @@ +# KPI Design Guide for Process Improvement + +## Principles of Good Process KPIs + +### 1. Measurable +The KPI must produce a number. If you can't measure it, you can't improve it. + +**Bad:** "Customer satisfaction is good" +**Good:** "Customer satisfaction score is 4.2 out of 5" + +### 2. Actionable +The team must be able to influence the KPI through process changes. + +**Bad:** "Market share" (too many external factors) +**Good:** "Time to respond to customer requests" (directly controllable) + +### 3. Relevant +The KPI must relate to the actual pain point the team wants to fix. + +**Bad:** "Number of meetings" (doesn't measure outcomes) +**Good:** "Decisions made per week" (measures what meetings should produce) + +### 4. Time-bound +The KPI must have a measurement frequency that matches the improvement cycle. + +**Bad:** "Annual revenue growth" (too slow for weekly kaizen cycles) +**Good:** "Weekly throughput" (measurable at each iteration) + +## Common Process KPIs + +### Efficiency KPIs +| KPI | Formula | Unit | Direction | +|-----|---------|------|-----------| +| Cycle time | Time from start to finish of one unit | hours/days | minimize | +| Lead time | Time from request to delivery | hours/days | minimize | +| Throughput | Units completed per time period | count/week | maximize | +| Flow efficiency | Processing time / Lead time | percentage | maximize | +| First-time-right rate | Units without rework / Total units | percentage | maximize | + +### Quality KPIs +| KPI | Formula | Unit | Direction | +|-----|---------|------|-----------| +| Defect rate | Defects found / Units produced | percentage | minimize | +| Rework rate | Units requiring rework / Total units | percentage | minimize | +| Escape rate | Defects found by customers / Total defects | percentage | minimize | + +### Satisfaction KPIs +| KPI | Formula | Unit | Direction | +|-----|---------|------|-----------| +| Customer satisfaction | Survey score average | 1-5 scale | maximize | +| Internal NPS | Team satisfaction with the process | -100 to 100 | maximize | +| Ease of use | Self-reported ease rating | 1-5 scale | maximize | + +## Measurement Protocols + +### For Automated Metrics +If the process is tracked in a tool (Jira, Linear, etc.): +1. Define the query that extracts the metric +2. Document the query for reproducibility +3. 
Run at a consistent time (e.g., every Monday morning) + +### For User-Reported Metrics +If the metric requires human observation: +1. Define exactly what to measure and how +2. Create a simple recording form (spreadsheet, checklist) +3. Assign responsibility for measurement +4. Set a measurement cadence (weekly, biweekly) + +### For Survey-Based Metrics +1. Use consistent questions across measurements +2. Keep surveys short (3-5 questions maximum) +3. Use the same scale consistently +4. Measure at regular intervals, not ad-hoc + +## Avoiding KPI Pitfalls + +### Goodhart's Law +"When a measure becomes a target, it ceases to be a good measure." + +**Mitigation:** Always pair efficiency KPIs with quality KPIs. If you optimize cycle time, also track defect rate. + +### Vanity Metrics +Metrics that look good but don't drive improvement. + +**Signs:** The metric always improves, nobody acts on it, it doesn't correlate with outcomes. +**Fix:** Ask "If this metric improves by 20%, what would change in practice?" + +### Measurement Overhead +The act of measuring shouldn't be more expensive than the improvement. + +**Rule of thumb:** If measurement takes >10% of the time spent on the process, simplify the metric. diff --git a/kaizen/skills/kaizen-engine/SKILL.md b/kaizen/skills/kaizen-engine/SKILL.md new file mode 100644 index 0000000..9ab920a --- /dev/null +++ b/kaizen/skills/kaizen-engine/SKILL.md @@ -0,0 +1,490 @@ +--- +name: kaizen-engine +description: "Recursive improvement loop engine inspired by karpathy/autoresearch. Orchestrates continuous improvement via Sequential Thinking MCP with 8-phase iterations (MEASURE, ANALYZE, HYPOTHESIZE, PROPOSE, APPLY, VERIFY, DECIDE, LOG). Supports greedy and multi-objective ratcheting strategies with configurable autonomy levels. Use when the user says 'run kaizen', 'kaizen loop', 'improve', 'optimization loop', 'continuous improvement', 'recursive improvement', 'iterative optimization', 'run improvement profile', or wants to iteratively improve code, configuration, or processes against measurable KPIs. Also activates when user references a specific profile name like 'claude-code-usage', 'code-refactoring', or 'process-improvement'. Requires **~~sequential-thinking** connector." +--- + +# Kaizen Engine — Recursive Improvement Loop Orchestrator + +## 1. Overview + +The kaizen engine runs recursive improvement loops against measurable KPIs. It reads a **profile** (PROFILE.md) that defines what to improve, how to measure, and what to mutate, then executes iterative cycles until convergence, budget exhaustion, or user interrupt. + +**Architecture:** Engine + Profiles. The engine is generic; profiles are domain-specific. + +**Connector requirement:** This skill requires **~~sequential-thinking** for loop orchestration. Without it, the skill cannot function. Direct the user to the README for setup instructions. + +**Storage:** Audit logs are written to `.kaizen/runs/` at the improvement target location: +- Project-level improvements: `.kaizen/` at project root +- User-level improvements: `~/.kaizen/` + +--- + +## 2. Pipeline + +### Step 0 — Profile Resolution + +Determine which profile to load: + +1. If the user specifies a profile name (e.g., `claude-code-usage`), look for it in: + - `profiles/{name}/PROFILE.md` within this plugin directory + - A custom path provided by the user +2. 
If no profile is specified, present the available bundled profiles and ask the user to choose: + - **claude-code-usage** — analyze and improve Claude Code tool/skill usage patterns + - **code-refactoring** — recursively improve code quality metrics + - **process-improvement** — design and run kaizen loops for business processes +3. Parse the PROFILE.md YAML frontmatter to extract configuration: + - `name`, `version`, `strategy`, `autonomy` + - `kpis[]` — name, description, direction, unit, measurement_method, formula + - `initial_state.sources[]` — data sources for baseline capture + - `mutation_targets.defaults[]` and `mutation_targets.immutable[]` + - `convergence.epsilon`, `convergence.patience` + - `iteration_budget` + - `measurement.tool_generation`, `measurement.language` +4. Ask the user for any **scope overrides**: + - Narrow or expand mutation targets + - Adjust iteration budget + - Override autonomy level for this run + +**Output:** Resolved profile configuration ready for BOOTSTRAP. + +### Step 1 — BOOTSTRAP + +Prepare the improvement environment before the first iteration. + +#### 1a. Run ID Generation + +Generate a unique run ID: `YYYY-MM-DD-{profile-name}-{NNN}` +- Date: today's date +- Profile name: from the profile's `name` field +- Sequence: zero-padded 3-digit number, incremented from the highest existing run for this profile in the `.kaizen/runs/` directory. Start at `001` if no previous runs exist. + +Create the run directory: `.kaizen/runs/{run-id}/` + +#### 1b. Continuity Check + +Look for previous runs of the same profile in `.kaizen/runs/`: +- If a previous `summary.json` exists, read it. The `final` KPIs from the most recent run become the **inherited baseline**. Skip fresh source collection — we already know the previous state. +- If no previous runs exist, proceed to fresh source collection. + +#### 1c. Source Collection + +For each source declared in `initial_state.sources`: + +| Source type | Collection method | +|-------------|-------------------| +| `session_transcripts` | `Read` files matching the `path` glob pattern | +| `config` | `Read` files in the declared path | +| `git_history` | Execute the declared `command` via `Bash` | +| `memory` | `Read` memory files matching the `path` glob | +| `user_provided` | Ask the user to provide or point to the data | + +Collect and summarize findings. Do NOT load entire transcript contents into context — extract relevant statistics and patterns only. + +#### 1d. Measurement Tool Scaffolding + +If `measurement.tool_generation` is `true` in the profile: + +1. `Read references/tool-scaffolding.md` for the generation template and interface contract +2. Generate a measurement script in the declared `language` (Python or TypeScript) +3. The script MUST: + - Accept no arguments (reads its own config from the run directory) + - Output JSON to stdout: `{"kpis": {"kpi_name": numeric_value, ...}, "metadata": {"timestamp": "ISO-8601", "profile": "name", "details": {...}}}` + - Handle errors gracefully (exit code 1 + JSON error message to stderr) + - Be self-contained (no external dependencies beyond the standard library and common tools like `git`) +4. Write the script to `.kaizen/runs/{run-id}/measure.{py|ts}` +5. Run it to capture the **baseline snapshot** +6. Write baseline to `.kaizen/runs/{run-id}/baseline.json` + +If `measurement.tool_generation` is `false`, measurement is handled inline by the engine during the MEASURE phase (for `user-reported` or simple metrics). + +#### 1e. 
Adversarial Tool Review + +Dispatch the **kaizen-reviewer** agent to validate the generated measurement tool: + +**Context to pass:** +- The profile's KPI definitions (names, formulas, directions) +- The generated measurement script source code +- The baseline output + +**Review criteria:** +- Does the tool actually measure what the KPI formulas describe? +- Are there edge cases where the tool could produce misleading values? +- Is the output format compliant with the interface contract? +- Could the tool be gamed by trivial changes (e.g., renaming a file to change a count)? + +If the reviewer flags CRITICAL issues, fix and re-scaffold. If MEDIUM issues, note them and proceed with caution. + +#### 1f. Manifest + +Write `.kaizen/runs/{run-id}/manifest.json`: +```json +{ + "run_id": "{run-id}", + "profile": "{profile-name}", + "profile_version": "{version}", + "started_at": "ISO-8601", + "strategy": "greedy|multi-objective", + "autonomy": "autonomous|supervised|hybrid(N)", + "iteration_budget": N, + "convergence": {"epsilon": 0.02, "patience": 3}, + "scope_overrides": {...}, + "inherited_baseline": true|false, + "previous_run": "{run-id}"|null +} +``` + +**Output:** Bootstrap complete. Environment ready for iteration loop. + +### Step 2 — Iteration Loop + +Each iteration is orchestrated as a **Sequential Thinking chain** via `~~sequential-thinking`. The chain comprises 8 thoughts, one per phase. + +Before each iteration, reconstruct optimal context: +- Profile frontmatter (KPIs, strategy, mutation targets) +- Current profile markdown body section for the active phase +- Most recent `summary.json` or `baseline.json` +- Previous iteration's `decision.json` (if any) +- Current iteration number and remaining budget + +**Do NOT carry forward full analysis text from previous iterations.** Each iteration starts clean. + +--- + +#### Phase 1: MEASURE + +**Sequential Thinking — Thought 1** + +Collect current KPI values: + +- If measurement tool exists: dispatch **kaizen-measurer** agent to run it + - Pass: path to measurement script, run directory + - Expect: JSON output with KPI values +- If `user-reported` KPIs: ask the user for current values +- If `hybrid`: run automated tool + ask user for non-automatable metrics + +Write results to `.kaizen/runs/{run-id}/iterations/{NNN}/measurement.json`: +```json +{ + "iteration": N, + "timestamp": "ISO-8601", + "kpis": {"kpi_name": numeric_value, ...}, + "source": "automated|user-reported|hybrid" +} +``` + +**Failure mode:** If measurement tool crashes, log the error. If recoverable (typo, missing file), fix and retry once. If fundamental (missing runtime, permissions), abort the iteration and report to user. + +--- + +#### Phase 2: ANALYZE + +**Sequential Thinking — Thought 2** + +Dispatch **kaizen-analyzer** agent to interpret measurements: + +**Context to pass:** +- Current measurement.json +- Baseline or previous iteration's measurement +- Profile's KPI definitions and directions +- The relevant section from the profile's markdown body (## ANALYZE Phase) + +**Expected output:** +- Per-KPI delta from baseline and from previous iteration +- Trend direction (improving, plateauing, regressing) +- Identification of the KPI with the most room for improvement +- Any anomalies or unexpected patterns + +Write to `.kaizen/runs/{run-id}/iterations/{NNN}/analysis.md` + +**Failure mode:** If analysis is inconclusive, note uncertainty and proceed. The DECIDE phase will handle ambiguity. 
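+
+To illustrate the comparison step, here is a minimal sketch of the per-KPI delta computation (field names follow the measurement.json schema above; `directions` comes from the profile's KPI definitions; the output mirrors the `kpi_deltas` shape recorded later in decision.json):
+
+```python
+def kpi_deltas(before: dict, after: dict, directions: dict) -> dict:
+    """Compare the 'kpis' maps of two measurement.json records.
+
+    `directions` maps each KPI name to 'minimize' or 'maximize',
+    as declared in the profile frontmatter.
+    """
+    deltas = {}
+    for name, prev in before.items():
+        curr = after.get(name)
+        if curr is None:
+            continue  # partial output: proceed with the KPIs that are available
+        delta = curr - prev
+        # A positive delta improves a 'maximize' KPI; a negative one improves 'minimize'
+        improved = delta > 0 if directions[name] == "maximize" else delta < 0
+        if delta == 0:
+            trend = "unchanged"
+        elif improved:
+            trend = "improved"
+        else:
+            trend = "regressed"
+        deltas[name] = {"before": prev, "after": curr, "delta": delta, "direction": trend}
+    return deltas
+```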
+ +--- + +#### Phase 3: HYPOTHESIZE + +**Sequential Thinking — Thought 3** + +Based on the analysis, form hypotheses about: +- **Root causes** — why are specific KPIs at their current levels? +- **Opportunities** — what changes would most likely improve the target KPIs? +- **Risks** — what could go wrong with potential changes? + +Read the profile's `## HYPOTHESIZE Phase` section for domain-specific guidance. + +This phase is inline (no subagent dispatch) — it uses the Sequential Thinking chain's reasoning capability. + +Write hypotheses to `.kaizen/runs/{run-id}/iterations/{NNN}/analysis.md` (append to analysis). + +--- + +#### Phase 4: PROPOSE + +**Sequential Thinking — Thought 4** + +Dispatch **kaizen-proposer** agent to generate a concrete change proposal: + +**Context to pass:** +- Analysis and hypotheses from Phases 2-3 +- Profile's mutation targets (defaults + any user overrides) +- Profile's immutable list (MUST NOT be touched) +- Profile's `## PROPOSE Phase` section +- Previous iteration's proposal and decision (if the previous change was reverted, avoid repeating it) + +**Expected output:** +- A specific, minimal change plan +- Which files/assets to modify +- What the modification is (described precisely) +- Expected impact on KPIs (with reasoning) +- Confidence level (high/medium/low) + +Write to `.kaizen/runs/{run-id}/iterations/{NNN}/proposal.md` + +**Autonomy gate:** If autonomy is `supervised`, present the proposal to the user and wait for approval. If `hybrid(N)` and iteration count > N, also pause for approval. If `autonomous`, proceed directly. + +**Failure mode:** If the proposer cannot find a viable change, log "no viable proposal" and proceed to DECIDE (which will trigger the patience counter). + +--- + +#### Phase 5: APPLY + +**Sequential Thinking — Thought 5** + +Apply the proposed changes: + +1. **Backup** — before any mutation, create backups of all files in mutation scope: + `.kaizen/runs/{run-id}/iterations/{NNN}/backup/` + Copy each file that will be modified, preserving relative paths. + +2. **Verify immutability** — double-check that no proposed change touches files matching `mutation_targets.immutable` patterns. If a violation is detected, ABORT the iteration and flag to the user. + +3. **Apply changes** — execute the mutations described in the proposal using `Edit` or `Write` tools. For each change: + - Read the current file + - Apply the modification + - Verify the file is syntactically valid (if applicable — e.g., JSON, YAML) + +4. **Generate diff** — capture the changes: + - If targets are under git: `git diff` → save as `.kaizen/runs/{run-id}/iterations/{NNN}/diff.patch` + - If not under git: generate a unified diff from the backup copies + +**Failure mode:** If any mutation fails partway through: +1. Restore ALL files from backup (full revert) +2. Log the failure +3. Proceed to DECIDE with `apply_failed: true` + +--- + +#### Phase 6: VERIFY + +**Sequential Thinking — Thought 6** + +Re-measure KPIs after the change (same method as Phase 1): +- If measurement tool exists: dispatch **kaizen-measurer** agent +- If user-reported: ask user for updated values +- If hybrid: both + +Write to `.kaizen/runs/{run-id}/iterations/{NNN}/verification.json` (same schema as measurement.json). + +**Failure mode:** If verification measurement fails, treat the iteration as inconclusive and revert (fail-safe). 
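+
+Both this fail-safe and the partial-failure recovery in APPLY assume the backups created at the start of Phase 5. A minimal sketch of that backup-and-restore step (the helper names are illustrative; paths follow the run-directory layout above):
+
+```python
+import shutil
+from pathlib import Path
+
+def backup_files(files: list[Path], project_root: Path, backup_dir: Path) -> None:
+    """Copy every file about to be mutated into backup/, preserving relative paths."""
+    for f in files:
+        dest = backup_dir / f.relative_to(project_root)
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(f, dest)
+
+def restore_files(project_root: Path, backup_dir: Path) -> None:
+    """Full revert: copy every backed-up file back to its original location."""
+    for b in backup_dir.rglob("*"):
+        if b.is_file():
+            shutil.copy2(b, project_root / b.relative_to(backup_dir))
+```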
+ +--- + +#### Phase 7: DECIDE + +**Sequential Thinking — Thought 7** + +`Read references/ratchet-strategies.md` + +Compare verification KPIs against the pre-iteration measurement: + +**Greedy strategy (single KPI):** +- If the target KPI improved by at least `epsilon`: **KEEP** +- Otherwise: **REVERT** + +**Multi-objective strategy:** +Apply Pareto dominance check: +- **KEEP** if: no KPI regressed beyond epsilon AND at least one KPI improved by at least epsilon +- **REVERT** if: any KPI regressed beyond epsilon +- **ESCALATE** if: autonomy is not `autonomous` and there's a trade-off (one improved, another regressed within epsilon) — present to user for judgment + +**Decision record:** +Write to `.kaizen/runs/{run-id}/iterations/{NNN}/decision.json`: +```json +{ + "iteration": N, + "decision": "keep|revert|escalate", + "strategy": "greedy|multi-objective", + "kpi_deltas": {"kpi_name": {"before": X, "after": Y, "delta": Z, "direction": "improved|regressed|unchanged"}}, + "reasoning": "...", + "apply_failed": false, + "no_proposal": false +} +``` + +**If REVERT:** +- Restore all files from `.kaizen/runs/{run-id}/iterations/{NNN}/backup/` +- If targets are under git: `git checkout` the modified files +- Increment the patience counter + +**If KEEP:** +- If targets are under git: stage and commit with message `kaizen({profile}): iteration {N} — {brief description}` +- Reset the patience counter + +--- + +#### Phase 8: LOG + +**Sequential Thinking — Thought 8** + +Update the run's aggregate state: + +1. Update `.kaizen/runs/{run-id}/summary.json` (create if first iteration): +```json +{ + "profile": "{name}", + "run_id": "{run-id}", + "started_at": "ISO-8601", + "updated_at": "ISO-8601", + "iterations_completed": N, + "iterations_kept": K, + "iterations_reverted": R, + "baseline": {"kpi_name": value, ...}, + "current": {"kpi_name": value, ...}, + "improvement": {"kpi_name": "+XX%", ...}, + "patience_counter": P, + "convergence_reason": null|"patience_exceeded"|"budget_exhausted"|"user_stopped"|"adversarial_flag" +} +``` + +2. Present a brief iteration summary to the user: + - Iteration N of M (budget) + - Decision: kept/reverted + - Current KPIs vs baseline (with improvement percentages) + - Patience counter status + +--- + +#### Loop Control + +After LOG, evaluate stopping conditions: + +| Condition | Trigger | Action | +|-----------|---------|--------| +| Convergence | patience counter >= `convergence.patience` | Stop — improvement has plateaued | +| Budget | iteration count >= `iteration_budget` | Stop — budget exhausted | +| User interrupt | User requests stop | Stop — graceful exit | +| Adversarial flag | Reviewer flags measurement integrity | Stop — investigation needed | +| No budget limit | `iteration_budget` is 0 | Continue indefinitely until convergence or interrupt | + +If no stopping condition is met: **loop back to Phase 1** (MEASURE) for the next iteration. Reconstruct context before starting. + +--- + +### Step 3 — Final Review Gate + +When the loop stops (for any reason): + +1. Dispatch **kaizen-reviewer** agent for adversarial review: + + **Context to pass:** + - Profile's mission (name, description, KPI definitions) + - The measurement tool source code + - summary.json (baseline → final KPIs) + - A sample of 2-3 iteration decision records (first, best, last) + + **Review criteria:** + - Are the reported improvements genuine or measurement artifacts? + - Do the applied changes align with the profile's stated mission? 
+ - Could any improvement be attributed to the measurement tool being gamed? + - Were any immutable boundaries violated? + +2. Write review to `.kaizen/runs/{run-id}/adversarial-review.md` + +3. Update summary.json with `"adversarial_review": "passed|flagged"` and `"convergence_reason"`. + +### Step 4 — Final Report + +Present a comprehensive summary to the user: + +- **Profile**: name and version +- **Run ID**: for future reference +- **Iterations**: completed / kept / reverted +- **KPI Results Table**: + + | KPI | Baseline | Final | Delta | Improvement | + |-----|----------|-------|-------|-------------| + | ... | ... | ... | ... | ... | + +- **Convergence reason**: why the loop stopped +- **Adversarial review**: passed or flagged (with details if flagged) +- **Audit trail**: path to `.kaizen/runs/{run-id}/` for detailed inspection +- **Recommendations**: based on the adversarial review, suggest next steps (re-run with different focus, manual review of specific changes, schedule next run) + +--- + +## 3. Progressive Disclosure + +| Step | Documents to Read | +|------|-------------------| +| Step 0 | Profile's PROFILE.md (frontmatter only for config) | +| Step 1b | Previous run's summary.json (if continuity) | +| Step 1d | `references/tool-scaffolding.md` | +| Phase 4 (PROPOSE) | Profile's PROPOSE section from markdown body | +| Phase 7 (DECIDE) | `references/ratchet-strategies.md` | +| Step 3 | (no additional — reviewer agent is self-contained) | + +--- + +## 4. Subagent Dispatch Reference + +| Phase | Agent | Model | Context Package | +|-------|-------|-------|-----------------| +| BOOTSTRAP (1e) | kaizen-reviewer | opus | KPI defs + tool source + baseline output | +| MEASURE (1) | kaizen-measurer | haiku | Measurement script path + run directory | +| ANALYZE (2) | kaizen-analyzer | sonnet | Measurements + baseline + KPI defs + profile ANALYZE section | +| PROPOSE (4) | kaizen-proposer | sonnet | Analysis + mutation targets + immutable list + profile PROPOSE section | +| VERIFY (6) | kaizen-measurer | haiku | Measurement script path + run directory | +| Final review (3) | kaizen-reviewer | opus | Profile mission + tool source + summary + sample decisions | + +See `references/subagent-dispatch.md` for detailed context packaging instructions per agent. + +--- + +## 5. Context Management Protocol + +**Between iterations:** After Phase 8 (LOG), before the next Phase 1 (MEASURE): + +1. The current iteration's detailed analysis, proposals, and reasoning are written to disk (the audit trail). +2. The next iteration starts with **reconstructed minimal context**: + - Profile frontmatter (static — reloaded from PROFILE.md) + - Current summary.json (aggregate state) + - Previous iteration's decision.json (to avoid repeating reverted proposals) + - Current iteration number and remaining budget +3. Full history is available on disk but NOT loaded into context unless specifically needed. + +This ensures the engine can run many iterations without context exhaustion. + +--- + +## 6. Error Recovery + +| Failure | Recovery | +|---------|----------| +| Measurement tool crash (recoverable) | Fix typo/path, retry once | +| Measurement tool crash (fundamental) | Abort iteration, report to user | +| Partial APPLY failure | Full revert from backup | +| Subagent dispatch failure | Retry once, then run phase inline | +| Sequential Thinking unavailable | CRITICAL — skill cannot function. Direct user to README for setup. 
| +| Git operations fail | Fall back to file-backup-based revert | +| summary.json corrupted | Rebuild from iteration records | + +--- + +## 7. Integration + +This skill is the core of the kaizen plugin. It is invoked by: +- `/kaizen` command — primary entry point +- Direct skill activation via trigger phrases + +Its output (audit trail in `.kaizen/runs/`) is consumed by: +- **kaizen-report** — reads summary.json files to show trends and history +- **kaizen-profile-designer** — uses the profile-template.md reference diff --git a/kaizen/skills/kaizen-engine/references/loop-protocol.md b/kaizen/skills/kaizen-engine/references/loop-protocol.md new file mode 100644 index 0000000..98c86d9 --- /dev/null +++ b/kaizen/skills/kaizen-engine/references/loop-protocol.md @@ -0,0 +1,188 @@ +# Loop Protocol — 8-Phase Iteration Detail + +## Phase Sequence + +``` +MEASURE → ANALYZE → HYPOTHESIZE → PROPOSE → APPLY → VERIFY → DECIDE → LOG → [loop or stop] +``` + +Each phase maps to one Sequential Thinking thought. The chain represents a single iteration. + +## Phase Specifications + +### Phase 1: MEASURE + +**Purpose:** Collect current KPI values as the starting point for this iteration. + +**Inputs:** +- Measurement tool path (from manifest) +- Run directory path + +**Outputs:** +- `iterations/{NNN}/measurement.json` + +**Failure modes:** +| Failure | Severity | Recovery | +|---------|----------|----------| +| Tool script not found | CRITICAL | Abort iteration, check BOOTSTRAP | +| Tool runtime missing (Python/TS) | CRITICAL | Abort run, report to user | +| Tool exits with error | RECOVERABLE | Read stderr, fix if simple typo/path, retry once | +| Tool produces invalid JSON | RECOVERABLE | Fix output parsing, retry once | +| Tool hangs (>60s) | CRITICAL | Kill process, abort iteration | +| Partial KPI output (some missing) | WARNING | Log warning, proceed with available KPIs | + +### Phase 2: ANALYZE + +**Purpose:** Compare current KPIs to baseline/previous, identify trends and opportunities. + +**Inputs:** +- Current measurement.json +- Previous measurement (baseline.json or previous iteration's measurement.json) +- Profile KPI definitions + +**Outputs:** +- `iterations/{NNN}/analysis.md` (first section) + +**Analysis structure:** +1. **Per-KPI delta table** — current vs previous, absolute and percentage change +2. **Trend assessment** — improving, plateauing, or regressing (based on last 3 iterations if available) +3. **Opportunity ranking** — which KPI has the most room for improvement, considering direction and current distance from ideal +4. **Anomaly detection** — sudden jumps, reversals, or values outside expected range + +**Failure modes:** +| Failure | Severity | Recovery | +|---------|----------|----------| +| No previous measurement | INFO | Use baseline as comparison point | +| KPI value type mismatch | WARNING | Log, cast if possible, skip if not | +| Analyzer agent fails | RECOVERABLE | Run analysis inline (no subagent) | + +### Phase 3: HYPOTHESIZE + +**Purpose:** Form testable hypotheses about root causes and improvement opportunities. 
+ +**Inputs:** +- Analysis from Phase 2 +- Profile's HYPOTHESIZE section (domain knowledge) + +**Outputs:** +- Appended to `iterations/{NNN}/analysis.md` + +**Hypothesis structure:** +- **Observation**: what the data shows +- **Hypothesis**: proposed explanation +- **Predicted effect**: what change would improve the KPI +- **Confidence**: high/medium/low based on evidence strength + +### Phase 4: PROPOSE + +**Purpose:** Generate a concrete, minimal, actionable change proposal. + +**Inputs:** +- Hypotheses from Phase 3 +- Mutation targets (allowed files/assets) +- Immutable list (forbidden files) +- Profile's PROPOSE section +- Previous iteration's decision (to avoid repeating reverted proposals) + +**Outputs:** +- `iterations/{NNN}/proposal.md` + +**Proposal structure:** +1. **Target**: which file(s) or asset(s) to modify +2. **Change description**: precise description of what to change +3. **Rationale**: which hypothesis this tests +4. **Expected KPI impact**: predicted improvement with reasoning +5. **Risk assessment**: what could go wrong +6. **Confidence**: high/medium/low + +**Constraints:** +- MUST NOT propose changes to immutable files +- MUST propose the **minimum viable change** — prefer small, targeted edits over large refactors +- MUST NOT repeat a proposal that was reverted in the immediately previous iteration (try a different approach) +- If confidence is low, flag this in the proposal + +**Failure modes:** +| Failure | Severity | Recovery | +|---------|----------|----------| +| No viable proposal found | INFO | Skip to DECIDE with `no_proposal: true` | +| Proposal touches immutable file | CRITICAL | Reject proposal, re-propose | +| Proposer agent fails | RECOVERABLE | Generate proposal inline | + +### Phase 5: APPLY + +**Purpose:** Execute the proposed changes safely with full rollback capability. + +**Inputs:** +- Proposal from Phase 4 +- Mutation targets and immutable list + +**Outputs:** +- `iterations/{NNN}/backup/` directory +- `iterations/{NNN}/diff.patch` +- Modified target files + +**Protocol:** +1. Create `backup/` directory +2. For each file in mutation scope: copy to backup preserving relative path +3. Verify no proposed change violates immutable list +4. Apply each change using Edit/Write tools +5. After each file change, verify syntactic validity if applicable +6. Generate diff (git diff or unified diff from backups) + +**Failure modes:** +| Failure | Severity | Recovery | +|---------|----------|----------| +| Backup creation fails | CRITICAL | Abort iteration (cannot safely proceed) | +| File write permission denied | CRITICAL | Abort, restore from backup | +| Partial apply (some succeed, some fail) | CRITICAL | Full restore from backup | +| Syntax validation fails | RECOVERABLE | Revert specific file, attempt fix, or full revert | +| Immutable violation detected | CRITICAL | Full restore, flag to user | + +### Phase 6: VERIFY + +**Purpose:** Re-measure KPIs after the change to assess impact. + +Same protocol as Phase 1 (MEASURE), writing to `verification.json` instead. + +### Phase 7: DECIDE + +**Purpose:** Determine whether to keep or revert the change based on KPI comparison. + +**Inputs:** +- measurement.json (before) +- verification.json (after) +- Profile strategy and convergence settings + +**Outputs:** +- `iterations/{NNN}/decision.json` +- File restoration (if revert) +- Git commit (if keep + git-managed) + +**Decision logic documented in `ratchet-strategies.md`.** + +### Phase 8: LOG + +**Purpose:** Update aggregate run state and inform the user. 
+ +**Inputs:** +- Decision from Phase 7 +- Running totals from summary.json + +**Outputs:** +- Updated `summary.json` +- User-facing iteration summary + +## Timing Expectations + +| Phase | Typical Duration | Model | +|-------|-----------------|-------| +| MEASURE | 5-15s | haiku (via measurer agent) | +| ANALYZE | 10-30s | sonnet (via analyzer agent) | +| HYPOTHESIZE | 5-15s | inline (Sequential Thinking) | +| PROPOSE | 15-45s | sonnet (via proposer agent) | +| APPLY | 5-20s | inline (file operations) | +| VERIFY | 5-15s | haiku (via measurer agent) | +| DECIDE | 5-10s | inline (comparison logic) | +| LOG | 2-5s | inline (file write) | + +**Total per iteration:** ~1-3 minutes depending on complexity and autonomy pauses. diff --git a/kaizen/skills/kaizen-engine/references/profile-template.md b/kaizen/skills/kaizen-engine/references/profile-template.md new file mode 100644 index 0000000..d9aaec4 --- /dev/null +++ b/kaizen/skills/kaizen-engine/references/profile-template.md @@ -0,0 +1,130 @@ +# Profile Template + +Use this template to create a new kaizen improvement profile. Copy the content below into a new file at `profiles/{your-profile-name}/PROFILE.md` and fill in the sections. + +--- + +```yaml +--- +name: your-profile-name +description: "One-line description of what this profile improves" +version: 0.1.0 + +# Strategy: how the engine decides to keep or revert changes +# - greedy: single KPI, pure hill-climbing +# - multi-objective: Pareto dominance across multiple KPIs +strategy: multi-objective + +# Autonomy: how much human involvement per iteration +# - autonomous: loop runs unattended until convergence or budget +# - supervised: pause for human approval at every PROPOSE step +# - hybrid(N): autonomous for N iterations, then pause for checkpoint +autonomy: supervised + +# Iteration budget: max iterations before stopping (0 = unlimited) +iteration_budget: 10 + +# Convergence: when to stop if no improvement is happening +convergence: + epsilon: 0.02 # minimum KPI delta to count as "improvement" + patience: 3 # consecutive no-improvement iterations before stopping + +# Initial state: how to capture the baseline before the first iteration +initial_state: + capture_strategy: automatic # automatic | manual | hybrid + sources: + - type: config # session_transcripts | config | git_history | memory | user_provided + path: ".claude/" + description: "What this source provides" + # Add more sources as needed + +# Measurement: how KPIs are collected +measurement: + tool_generation: true # true = auto-scaffold a measurement script + language: python # python | typescript (when tool_generation is true) + +# KPIs: what to measure and optimize +kpis: + - name: your_kpi_name + description: "Human-readable description of what this measures" + direction: maximize # maximize | minimize + unit: ratio # ratio | percentage | count | seconds | custom + measurement_method: automated # automated | user-reported | hybrid + formula: "numerator / denominator — human-readable, not eval'd" + + # Add more KPIs as needed (multi-objective profiles should have 2-4 KPIs) + +# Mutation targets: what the engine is allowed to change +mutation_targets: + defaults: + - path: "path/to/file-or-pattern" + description: "Why this file is a valid improvement target" + immutable: + - path: "tests/**" # patterns that MUST NOT be modified + - path: ".git/**" + +# Connectors: MCP server dependencies +connectors: + required: + - "~~sequential-thinking" + optional: [] +--- + +# Improvement Instructions + +These sections provide 
domain-specific guidance for each phase of the improvement loop. The engine reads the relevant section during each phase.
+
+## MEASURE Phase
+
+Describe how to collect data for your KPIs:
+- What files or sources to examine
+- What patterns to look for
+- How to handle edge cases (missing data, ambiguous values)
+
+## ANALYZE Phase
+
+Describe how to interpret the measurements:
+- What constitutes a good vs poor value for each KPI
+- Known correlations between KPIs
+- Common patterns or anti-patterns to look for
+
+## HYPOTHESIZE Phase
+
+Describe the kinds of root causes and opportunities to consider:
+- Typical reasons for poor KPI values in this domain
+- Categories of improvements that tend to be high-impact
+- Constraints or trade-offs to keep in mind
+
+## PROPOSE Phase
+
+Describe the kinds of changes that are appropriate:
+- What types of modifications are safe and effective
+- Examples of good proposals for this domain
+- Constraints on proposal scope (e.g., "one file per iteration")
+- Anti-patterns to avoid
+
+## APPLY Phase
+
+Describe any special considerations for applying changes:
+- Syntax validation requirements
+- Side effects to watch for
+- Order-dependent operations
+
+## VERIFY Phase
+
+Describe any additional verification beyond KPI re-measurement:
+- Smoke tests to run
+- Manual checks the user should perform (for supervised mode)
+- Signs that a change may have unintended side effects
+```
+
+---
+
+## Guidelines for Good Profiles
+
+1. **Start with 1-2 KPIs** — add more only if needed. Multi-objective optimization is harder.
+2. **Keep formulas concrete** — even though they're human-readable, they should be unambiguous enough to implement as code.
+3. **Set conservative epsilon** — too low catches noise, too high misses real improvements.
+4. **Use supervised autonomy initially** — switch to autonomous once you trust the loop.
+5. **Define immutables carefully** — err on the side of protecting more files.
+6. **Write detailed phase instructions** — the engine is generic; your domain knowledge lives in these sections.
diff --git a/kaizen/skills/kaizen-engine/references/ratchet-strategies.md b/kaizen/skills/kaizen-engine/references/ratchet-strategies.md
new file mode 100644
index 0000000..d0409f3
--- /dev/null
+++ b/kaizen/skills/kaizen-engine/references/ratchet-strategies.md
@@ -0,0 +1,85 @@
+# Ratchet Strategies
+
+## Greedy Strategy (Single KPI)
+
+Used when `strategy: greedy` in the profile, or when the profile defines a single KPI.
+
+**Decision rule:**
+```
+IF the KPI moved in the desired direction
+   (kpi_after < kpi_before for minimize; kpi_after > kpi_before for maximize)
+AND abs(kpi_after - kpi_before) >= epsilon
+THEN KEEP
+ELSE REVERT
+```
+
+Simple hill-climbing. No tolerance for regression. Each iteration either locks in an improvement or returns to the previous best state.
+
+**Epsilon role:** Prevents noise from being treated as improvement. If a KPI changes by less than epsilon, it's treated as unchanged. Typical epsilon values:
+- Ratios: 0.01-0.05
+- Percentages: 1-5
+- Counts: 1
+- Time (seconds): 0.5-2.0
+
+## Multi-Objective Strategy
+
+Used when `strategy: multi-objective` in the profile, or when multiple KPIs are defined.
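+
+Before the formal rule, here is a minimal Python sketch of the same dominance check (the function and parameter names are illustrative, not part of the plugin):
+
+```python
+def pareto_decide(before, after, directions, epsilon, autonomy):
+    """Classify an iteration as keep, revert, or escalate (illustrative sketch)."""
+    improved, regressed = [], []
+    for name, direction in directions.items():
+        delta = after[name] - before[name]
+        if direction == "minimize":
+            delta = -delta  # normalize so positive delta always means "better"
+        if delta >= epsilon:
+            improved.append(name)
+        elif delta <= -epsilon:
+            regressed.append(name)
+        # |delta| < epsilon counts as unchanged
+    if regressed:
+        # Trade-offs need human judgment unless running unattended
+        return "revert" if autonomy == "autonomous" else "escalate"
+    return "keep" if improved else "revert"
+```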
+ +**Decision rule (Pareto dominance):** +``` +improved = [kpi for kpi in kpis if delta(kpi) >= epsilon in the desired direction] +regressed = [kpi for kpi in kpis if delta(kpi) >= epsilon in the undesired direction] +unchanged = [kpi for kpi in kpis if abs(delta(kpi)) < epsilon] + +IF len(regressed) == 0 AND len(improved) >= 1: + KEEP (Pareto improvement — at least one better, none worse) + +IF len(regressed) > 0: + IF autonomy == "autonomous": + REVERT (cannot accept trade-offs without human judgment) + ELSE: + ESCALATE to user: + "Iteration {N} improved {improved_kpis} but regressed {regressed_kpis}. + Accept this trade-off?" + +IF len(improved) == 0 AND len(regressed) == 0: + REVERT (no meaningful change) +``` + +**Trade-off presentation (for ESCALATE):** + +| KPI | Before | After | Delta | Direction | +|-----|--------|-------|-------|-----------| +| tool_efficiency | 0.65 | 0.72 | +0.07 | improved | +| search_precision | 3.2 | 3.8 | +0.6 | regressed | + +"tool_efficiency improved by 10.8% but search_precision worsened by 18.8%. Accept?" + +## Patience Mechanism + +Both strategies use a patience counter to detect convergence: + +``` +patience_counter = 0 + +After each DECIDE: + IF decision == KEEP: + patience_counter = 0 (reset) + IF decision == REVERT: + patience_counter += 1 + +IF patience_counter >= convergence.patience: + STOP (convergence — improvement has plateaued) +``` + +Typical patience values: +- Fast convergence: 2 (stop after 2 consecutive reverts) +- Standard: 3 +- Thorough exploration: 5 (allow more failed attempts before giving up) + +## Avoiding Repetition + +The proposer MUST track reverted proposals. After a revert: +- The next iteration's PROPOSE phase receives the reverted proposal as negative context +- The proposer should try a different approach (different file, different strategy, different hypothesis) +- If the proposer has exhausted all hypotheses, it should report `no_proposal` which counts toward patience diff --git a/kaizen/skills/kaizen-engine/references/subagent-dispatch.md b/kaizen/skills/kaizen-engine/references/subagent-dispatch.md new file mode 100644 index 0000000..b959a56 --- /dev/null +++ b/kaizen/skills/kaizen-engine/references/subagent-dispatch.md @@ -0,0 +1,177 @@ +# Subagent Dispatch Protocol + +## Overview + +The kaizen engine dispatches 4 specialized agents during the improvement loop. Each agent receives a **minimal, tailored context package** — only the information needed for its phase. This prevents context bloat and ensures agents reason about their specific task. + +## Agent Registry + +| Agent | Model | Invocation Points | Purpose | +|-------|-------|--------------------|---------| +| kaizen-measurer | haiku | MEASURE, VERIFY | Run measurement tools, collect KPIs | +| kaizen-analyzer | sonnet | ANALYZE | Interpret data, find patterns | +| kaizen-proposer | sonnet | PROPOSE | Generate change proposals | +| kaizen-reviewer | opus | BOOTSTRAP, Final Review | Adversarial validation | + +## Context Packaging + +### kaizen-measurer (haiku) + +**Dispatch at:** Phase 1 (MEASURE) and Phase 6 (VERIFY) + +**Context package:** +``` +You are the kaizen-measurer agent. Run the measurement tool and return KPI values. + +Measurement script: {path_to_measure.py_or_ts} +Run directory: {path_to_run_dir} +Expected KPIs: {list of kpi names from profile} +Output file: {path_to_measurement.json_or_verification.json} + +Instructions: +1. Execute the measurement script: `python {script_path}` or `npx tsx {script_path}` +2. 
Capture stdout as the measurement result +3. If exit code is non-zero, capture stderr for error diagnosis +4. Write the result to the output file path +5. Return the KPI values and any errors encountered +``` + +**Do NOT include:** Analysis context, previous proposals, profile body, hypothesis text. + +### kaizen-analyzer (sonnet) + +**Dispatch at:** Phase 2 (ANALYZE) + +**Context package:** +``` +You are the kaizen-analyzer agent. Compare current measurements against the reference point and identify improvement opportunities. + +Profile KPIs: +{for each kpi: name, description, direction, unit} + +Current measurement: +{contents of measurement.json} + +Reference measurement (baseline or previous iteration): +{contents of baseline.json or previous measurement.json} + +Profile analysis guidance: +{contents of the ## ANALYZE Phase section from PROFILE.md body} + +Instructions: +1. Calculate per-KPI deltas (absolute and percentage) +2. Assess trend direction for each KPI +3. Rank KPIs by room for improvement +4. Flag any anomalies or unexpected patterns +5. Write your analysis as structured markdown +``` + +**Do NOT include:** Mutation targets, previous proposals, the measurement tool source code. + +### kaizen-proposer (sonnet) + +**Dispatch at:** Phase 4 (PROPOSE) + +**Context package:** +``` +You are the kaizen-proposer agent. Generate a concrete, minimal improvement proposal. + +Analysis and hypotheses: +{contents of iterations/{NNN}/analysis.md} + +Mutation targets (you MAY modify these): +{list of defaults from profile, with any user overrides} + +Immutable targets (you MUST NOT modify these): +{list of immutable patterns from profile} + +Profile proposal guidance: +{contents of the ## PROPOSE Phase section from PROFILE.md body} + +Previous reverted proposal (DO NOT repeat this approach): +{contents of previous iteration's proposal.md, if it was reverted; "None" if first iteration or previous was kept} + +Instructions: +1. Based on the analysis, identify the highest-impact change +2. Verify the change targets only mutable files +3. Describe the change precisely (which file, what modification) +4. Estimate expected KPI impact with reasoning +5. Assess confidence level (high/medium/low) +6. If you cannot find a viable change, report "no viable proposal" +``` + +**Do NOT include:** Measurement tool source, full iteration history, other agents' prompts. + +### kaizen-reviewer (opus) + +**Dispatch at:** BOOTSTRAP (Step 1e) and Final Review (Step 3) + +#### BOOTSTRAP dispatch: +``` +You are the kaizen-reviewer agent performing adversarial review of a measurement tool. + +Profile KPI definitions: +{for each kpi: name, description, direction, formula} + +Measurement tool source code: +{full contents of measure.py or measure.ts} + +Baseline measurement output: +{contents of baseline.json} + +Review criteria: +1. Does the tool faithfully implement each KPI formula? +2. Are there edge cases where the tool produces misleading values? +3. Is the JSON output format compliant with the interface contract? +4. Could the tool be gamed by trivial changes (e.g., renaming to change counts)? +5. Are there hardcoded assumptions that could break? + +Rate each finding as: CRITICAL (must fix), MEDIUM (note and monitor), LOW (acceptable). +``` + +#### Final Review dispatch: +``` +You are the kaizen-reviewer agent performing final adversarial review of a completed kaizen run. 
+
+Profile mission:
+  Name: {name}
+  Description: {description}
+  KPIs: {list with directions}
+
+Measurement tool source:
+{full contents of measure.py or measure.ts}
+
+Run summary:
+{contents of summary.json}
+
+Sample iteration decisions:
+  First: {decision.json from iteration 001}
+  Best improvement: {decision.json from the iteration with largest positive delta}
+  Last: {decision.json from final iteration}
+
+Review criteria:
+1. Are the reported improvements genuine or measurement artifacts?
+2. Do the applied changes align with the profile's stated mission?
+3. Could any improvement be attributed to gaming the measurement tool?
+4. Were immutable boundaries respected throughout?
+5. Is the convergence reason appropriate?
+
+Provide a verdict: PASSED (improvements are genuine) or FLAGGED (concerns identified, with details).
+```
+
+## Dispatch Mechanics
+
+Use the `Agent` tool with:
+- `subagent_type`: the agent name (e.g., "kaizen-measurer")
+- `model`: as specified in the registry
+- `prompt`: the context package above, with placeholders filled
+- `description`: brief label (e.g., "Measure KPIs for iteration 3")
+
+## Failure Handling
+
+If an agent dispatch fails (timeout, error, unexpected output):
+
+1. **First failure:** Retry the dispatch once with the same context
+2. **Second failure:** Fall back to running the phase inline (without subagent)
+3. Log the failure in the iteration record
+4. If the reviewer agent fails, proceed but note in summary that adversarial review was skipped
diff --git a/kaizen/skills/kaizen-engine/references/tool-scaffolding.md b/kaizen/skills/kaizen-engine/references/tool-scaffolding.md
new file mode 100644
index 0000000..f2369bf
--- /dev/null
+++ b/kaizen/skills/kaizen-engine/references/tool-scaffolding.md
@@ -0,0 +1,207 @@
+# Measurement Tool Scaffolding
+
+## Interface Contract
+
+Every measurement tool MUST conform to this interface regardless of language:
+
+### Input
+- No command-line arguments
+- Reads configuration from `config.json` in the same directory (written by the engine during BOOTSTRAP)
+
+### Output (stdout)
+```json
+{
+  "kpis": {
+    "kpi_name_1": 0.75,
+    "kpi_name_2": 3.2
+  },
+  "metadata": {
+    "timestamp": "2026-03-23T14:30:00Z",
+    "profile": "claude-code-usage",
+    "measurement_duration_ms": 1234,
+    "details": {
+      "kpi_name_1": {
+        "numerator": 45,
+        "denominator": 60,
+        "raw_data_points": 60
+      }
+    }
+  }
+}
+```
+
+**KPI values MUST be numeric** (integer or float). The engine handles direction interpretation.
+
+### Error Output (stderr + exit code 1)
+```json
+{
+  "error": "description of what went wrong",
+  "partial_kpis": {"kpi_name_1": 0.75},
+  "recoverable": true
+}
+```
+
+### Config File (config.json)
+Written by the engine during BOOTSTRAP:
+```json
+{
+  "profile": "claude-code-usage",
+  "run_id": "2026-03-23-claude-code-usage-001",
+  "sources": [
+    {"type": "session_transcripts", "path": "~/.claude/sessions/"},
+    {"type": "config", "path": ".claude/"}
+  ],
+  "kpis": [
+    {"name": "tool_efficiency", "formula": "dedicated_tool_calls / total_tool_calls", "direction": "maximize"},
+    {"name": "search_precision", "formula": "total_searches / targets_found", "direction": "minimize"}
+  ]
+}
+```
+
+## Python Template
+
+```python
+#!/usr/bin/env python3
+"""Kaizen measurement tool — {profile_name}
+Auto-generated by kaizen-engine. Edit with caution.
+
+"""
+
+import json
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+def load_config():
+    config_path = Path(__file__).parent / "config.json"
+    with open(config_path) as f:
+        return json.load(f)
+
+def measure_kpis(config):
+    """
+    Implement KPI measurement logic here.
+    Returns (kpis, details): a dict of {kpi_name: numeric_value}
+    and a dict of per-KPI supporting detail.
+    """
+    kpis = {}
+    details = {}
+
+    # --- KPI: {kpi_name} ---
+    # Formula: {formula}
+    # Direction: {direction}
+    # TODO: Implement measurement logic
+    # kpis["{kpi_name}"] = computed_value
+    # details["{kpi_name}"] = {"numerator": N, "denominator": D}
+
+    return kpis, details
+
+def main():
+    try:
+        config = load_config()
+        kpis, details = measure_kpis(config)
+
+        result = {
+            "kpis": kpis,
+            "metadata": {
+                "timestamp": datetime.now(timezone.utc).isoformat(),
+                "profile": config["profile"],
+                "measurement_duration_ms": 0,  # filled by caller
+                "details": details
+            }
+        }
+        print(json.dumps(result, indent=2))
+    except Exception as e:
+        error = {
+            "error": str(e),
+            "partial_kpis": {},
+            "recoverable": False
+        }
+        print(json.dumps(error), file=sys.stderr)
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
+```
+
+## TypeScript Template
+
+```typescript
+#!/usr/bin/env npx tsx
+/**
+ * Kaizen measurement tool — {profile_name}
+ * Auto-generated by kaizen-engine. Edit with caution.
+ */
+
+import { readFileSync } from 'fs';
+import { join, dirname } from 'path';
+import { fileURLToPath } from 'url';
+
+interface Config {
+  profile: string;
+  run_id: string;
+  sources: Array<{ type: string; path: string }>;
+  kpis: Array<{ name: string; formula: string; direction: string }>;
+}
+
+interface MeasurementResult {
+  kpis: Record<string, number>;
+  metadata: {
+    timestamp: string;
+    profile: string;
+    measurement_duration_ms: number;
+    details: Record<string, unknown>;
+  };
+}
+
+function loadConfig(): Config {
+  const configPath = join(dirname(fileURLToPath(import.meta.url)), 'config.json');
+  return JSON.parse(readFileSync(configPath, 'utf-8'));
+}
+
+function measureKpis(config: Config): { kpis: Record<string, number>; details: Record<string, unknown> } {
+  const kpis: Record<string, number> = {};
+  const details: Record<string, unknown> = {};
+
+  // --- KPI: {kpi_name} ---
+  // Formula: {formula}
+  // Direction: {direction}
+  // TODO: Implement measurement logic
+
+  return { kpis, details };
+}
+
+try {
+  const config = loadConfig();
+  const { kpis, details } = measureKpis(config);
+
+  const result: MeasurementResult = {
+    kpis,
+    metadata: {
+      timestamp: new Date().toISOString(),
+      profile: config.profile,
+      measurement_duration_ms: 0,
+      details,
+    },
+  };
+
+  console.log(JSON.stringify(result, null, 2));
+} catch (error) {
+  console.error(JSON.stringify({
+    error: String(error),
+    partial_kpis: {},
+    recoverable: false,
+  }));
+  process.exit(1);
+}
+```
+
+## Scaffolding Guidelines
+
+When generating a measurement tool from a profile:
+
+1. **Read the KPI formulas** — translate each human-readable formula into executable code
+2. **Map data sources** — use config.json `sources` to know where to find data
+3. **Handle missing data** — if a source path doesn't exist, report partial results (don't crash)
+4. **Keep it simple** — standard library only, no external dependencies
+5. **Be deterministic** — same inputs should produce same outputs
+6. **Log nothing to stdout except the final JSON** — any debug output goes to stderr
+7. 
**Respect timeouts** — measurement should complete in under 60 seconds diff --git a/kaizen/skills/kaizen-profile-designer/SKILL.md b/kaizen/skills/kaizen-profile-designer/SKILL.md new file mode 100644 index 0000000..06ed3e3 --- /dev/null +++ b/kaizen/skills/kaizen-profile-designer/SKILL.md @@ -0,0 +1,111 @@ +--- +name: kaizen-profile-designer +description: "Interactive workflow to create custom kaizen improvement profiles. Guides users through KPI definition, data source identification, mutation scope selection, autonomy configuration, and profile validation. Use when user says 'create kaizen profile', 'design improvement loop', 'new kaizen profile', 'custom profile', 'custom kaizen', 'define improvement target', or wants to create a new improvement loop for a domain not covered by the bundled profiles." +--- + +# Kaizen Profile Designer — Custom Profile Creation + +## 1. Overview + +This skill guides the user through creating a custom PROFILE.md file for the kaizen engine. It produces a validated, ready-to-use profile that follows the engine's template format. + +Output: `profiles/{name}/PROFILE.md` in the kaizen plugin directory, or a custom location specified by the user. + +## 2. Pipeline + +### Step 1 — Understand the Improvement Target + +Interview the user to understand what they want to improve: + +1. **What is the target?** A process, codebase, configuration, workflow, or system +2. **What's the pain point?** What's not working well, what triggered this +3. **What does success look like?** Concrete description of the improved state +4. **What's the scope?** Which files, systems, or processes are involved +5. **What's off-limits?** What must NOT be changed + +### Step 2 — Define KPIs + +`Read references/profile-checklist.md` + +Help the user define 1-4 measurable KPIs: + +For each KPI: +1. **Name** — short, descriptive (e.g., `build_time`, `error_rate`) +2. **Description** — what it measures in plain language +3. **Direction** — maximize or minimize +4. **Unit** — ratio, percentage, count, seconds, custom +5. **Measurement method** — automated, user-reported, or hybrid +6. **Formula** — human-readable description of how to calculate + +**Guide the user with questions:** +- "If this improvement works, what number would change?" +- "How would you measure that? Can it be automated?" +- "Is there a trade-off KPI we should watch?" (suggest multi-objective) + +### Step 3 — Identify Data Sources + +Based on the KPIs, determine what data the engine needs: + +1. **Source type**: files, git history, session logs, API output, user input +2. **Source path**: where to find the data +3. **Collection method**: automatic (Read/Grep/Bash) or manual (user provides) + +### Step 4 — Define Mutation Scope + +Determine what the engine is allowed to modify: + +1. **Default targets** — files/patterns that are the primary improvement surface +2. **Immutable targets** — files/patterns that MUST NOT be touched (tests, git, dependencies) + +**Guide with questions:** +- "Which files contain the things you want to improve?" +- "Are there files that must never be changed? (tests, configs, etc.)" + +### Step 5 — Configure Engine Behavior + +1. **Strategy** — greedy (single KPI) or multi-objective (multiple KPIs) + - If 1 KPI defined: recommend greedy + - If 2+ KPIs: recommend multi-objective, explain Pareto dominance + +2. **Autonomy** — supervised, autonomous, or hybrid(N) + - For first-time profiles: recommend supervised + - For well-understood domains: suggest hybrid(3) + +3. 
**Iteration budget** — how many iterations to allow (recommend 5-10) + +4. **Convergence** — epsilon and patience values + - Help calibrate epsilon to the KPI's scale + +5. **Measurement tool** — should the engine auto-generate a measurement tool? + - If KPIs are automatable: recommend `tool_generation: true` + - If KPIs are user-reported: set `tool_generation: false` + - Language preference: Python or TypeScript + +### Step 6 — Generate Profile + +1. Assemble the PROFILE.md using the profile template +2. Fill in all frontmatter fields from Steps 2-5 +3. Write phase instructions in the markdown body: + - MEASURE: how to collect KPI data + - ANALYZE: how to interpret measurements + - HYPOTHESIZE: common root causes in this domain + - PROPOSE: appropriate change types and constraints + - APPLY: special considerations for applying changes + - VERIFY: additional verification beyond KPI re-measurement + +4. Present the complete profile to the user for review + +### Step 7 — Validate and Save + +1. Verify all required frontmatter fields are present +2. Verify KPI definitions are complete (name, direction, unit) +3. Verify mutation targets don't overlap with immutable patterns +4. Save to the specified location +5. Suggest the user run `/kaizen {profile-name}` to start the loop + +## 3. Progressive Disclosure + +| Step | Documents to Read | +|------|-------------------| +| Step 2 | `references/profile-checklist.md` | +| Step 6 | Profile template from the kaizen-engine skill's reference directory (profile-template.md) | diff --git a/kaizen/skills/kaizen-profile-designer/references/profile-checklist.md b/kaizen/skills/kaizen-profile-designer/references/profile-checklist.md new file mode 100644 index 0000000..20da73a --- /dev/null +++ b/kaizen/skills/kaizen-profile-designer/references/profile-checklist.md @@ -0,0 +1,61 @@ +# Profile Design Checklist + +Use this checklist when helping users design custom kaizen profiles. 
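+
+Several of these checks can also be approximated mechanically. A rough sketch in Python using PyYAML (the helper and its field names follow the profile template and are illustrative, not part of the plugin):
+
+```python
+import yaml  # PyYAML, assumed available
+
+REQUIRED_TOP = ("name", "description", "version", "strategy", "autonomy")
+REQUIRED_KPI = ("name", "description", "direction", "unit")
+
+def lint_profile(profile_md_text: str) -> list[str]:
+    """Rough frontmatter lint for a PROFILE.md (illustrative, not exhaustive)."""
+    # Frontmatter sits between the first two '---' markers
+    _, frontmatter, _ = profile_md_text.split("---", 2)
+    data = yaml.safe_load(frontmatter) or {}
+    problems = [f"missing '{key}'" for key in REQUIRED_TOP if key not in data]
+    for kpi in data.get("kpis", []):
+        problems += [f"KPI missing '{key}'" for key in REQUIRED_KPI if key not in kpi]
+    if data.get("strategy") == "greedy" and len(data.get("kpis", [])) > 1:
+        problems.append("greedy strategy with multiple KPIs; consider multi-objective")
+    return problems
+```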
+ +## KPI Checklist + +For each KPI, verify: + +- [ ] **Name** is short, lowercase, uses underscores (e.g., `build_time`) +- [ ] **Description** clearly explains what the KPI measures +- [ ] **Direction** is explicitly stated (maximize or minimize) +- [ ] **Unit** is defined (ratio, percentage, count, seconds, or custom with explanation) +- [ ] **Formula** is unambiguous — someone else could implement it from the description +- [ ] **Measurement method** is feasible: + - `automated`: can be calculated from files/data without human input + - `user-reported`: requires human observation + - `hybrid`: some parts automated, some manual +- [ ] **Epsilon** is calibrated to the KPI's scale: + - For ratios (0-1): epsilon 0.01-0.05 + - For percentages (0-100): epsilon 1-5 + - For counts: epsilon 1 + - For time (seconds): epsilon depends on scale + +## Data Source Checklist + +For each source: + +- [ ] **Type** is one of: session_transcripts, config, git_history, memory, user_provided +- [ ] **Path** exists (or will be provided by the user) +- [ ] **Description** explains what data this provides for the KPIs + +## Mutation Scope Checklist + +- [ ] **Default targets** cover the files that need improvement +- [ ] **Immutable targets** protect: tests, git directory, lock files, dependencies +- [ ] Default and immutable targets don't overlap +- [ ] The user confirmed the scope is appropriate + +## Engine Configuration Checklist + +- [ ] **Strategy** matches the number of KPIs (greedy for 1, multi-objective for 2+) +- [ ] **Autonomy** is appropriate for the domain and user trust level +- [ ] **Iteration budget** is set (5-10 for first run) +- [ ] **Patience** is set (2-3 typically) +- [ ] **Measurement tool generation** is decided and language chosen (if automated) + +## Profile Body Checklist + +- [ ] Each phase section (MEASURE through VERIFY) has domain-specific instructions +- [ ] PROPOSE section includes constraints (what NOT to change) +- [ ] MEASURE section describes how to collect each KPI value +- [ ] ANALYZE section describes what good/bad values look like for each KPI + +## Common Mistakes + +- **Too many KPIs** — start with 1-2, add more later +- **Vague formulas** — "code quality" is not measurable; "functions with complexity > 10" is +- **Missing immutables** — always protect tests, git, and dependencies +- **Autonomous too early** — start supervised until you trust the loop +- **Epsilon too low** — catches noise instead of improvements +- **Budget too high** — 10 is usually enough; diminishing returns after that diff --git a/kaizen/skills/kaizen-report/SKILL.md b/kaizen/skills/kaizen-report/SKILL.md new file mode 100644 index 0000000..7b94cc5 --- /dev/null +++ b/kaizen/skills/kaizen-report/SKILL.md @@ -0,0 +1,94 @@ +--- +name: kaizen-report +description: "View improvement history and KPI trends from kaizen runs. Reads .kaizen/runs/ audit trail to show cross-run improvement trajectories, diminishing returns detection, and formatted KPI reports with delta tables. Use when user says 'kaizen report', 'kaizen history', 'improvement history', 'show kaizen results', 'KPI trends', 'how is improvement going', 'kaizen status', or wants to review past improvement loop outcomes." +--- + +# Kaizen Report — Improvement History & Trends + +## 1. Overview + +This skill reads the `.kaizen/runs/` audit trail and presents improvement history, KPI trends, and actionable insights. It works with any profile's audit data. + +## 2. 
Pipeline + +### Step 1 — Locate Audit Data + +Search for `.kaizen/runs/` directories: +1. Check current project root: `.kaizen/runs/` +2. Check user home: `~/.kaizen/runs/` +3. If a specific profile is requested, filter to runs matching that profile name + +If no runs are found, inform the user and suggest running `/kaizen` first. + +### Step 2 — Load Run Summaries + +For each run directory found: +1. Read `summary.json` +2. Extract: profile name, run ID, start date, iterations completed/kept/reverted, baseline KPIs, final KPIs, improvement percentages, convergence reason, adversarial review status +3. Sort runs chronologically + +### Step 3 — Present Report + +#### Single Profile Report + +If runs are for a single profile (or user requested a specific profile): + +```markdown +# Kaizen Report: {profile-name} + +## Overview +- **Total runs:** {count} +- **Date range:** {first run date} → {latest run date} +- **Total iterations:** {sum across runs} ({kept} kept, {reverted} reverted) + +## KPI Trajectory + +| Run | Date | {KPI 1} | {KPI 2} | ... | Iterations | Result | +|-----|------|---------|---------|-----|------------|--------| +| 001 | ... | baseline → final (Δ%) | ... | ... | N (K kept) | converged/budget/stopped | +| 002 | ... | baseline → final (Δ%) | ... | ... | N (K kept) | converged/budget/stopped | + +## Cumulative Improvement +- **{KPI 1}:** {first baseline} → {latest final} ({total Δ%} total improvement) +- **{KPI 2}:** {first baseline} → {latest final} ({total Δ%} total improvement) + +## Trend Analysis +- **Rate of improvement:** [accelerating / steady / diminishing returns] +- **Diminishing returns alert:** [if last 2+ runs had few kept iterations, flag this] +- **Recommendation:** [continue / shift focus / review profile] + +## Adversarial Review Status +| Run | Verdict | +|-----|---------| +| ... | passed/flagged | +``` + +#### Multi-Profile Summary + +If runs span multiple profiles: + +```markdown +# Kaizen Overview — All Profiles + +| Profile | Runs | Last Run | Best KPI Improvement | Status | +|---------|------|----------|---------------------|--------| +| {name} | N | {date} | {best Δ%} | active/converged | +``` + +### Step 4 — Diminishing Returns Detection + +For each KPI across sequential runs: +1. Calculate the improvement delta per run +2. If the last 3+ deltas are decreasing in magnitude: flag as **diminishing returns** +3. If the last 2+ runs converged with patience_exceeded: suggest shifting focus + +Present recommendation: +- "tool_efficiency has improved from 0.45 to 0.89 over 4 runs. The last 2 runs gained only +0.02 each. Consider shifting focus to another KPI or accepting the current level." + +### Step 5 — Drill-Down (Optional) + +If the user asks about a specific run: +1. Read the full `summary.json` and `adversarial-review.md` +2. List each iteration with its decision (keep/revert) and KPI deltas +3. Show the proposals that were kept (from `proposal.md` files) +4. Present the adversarial review findings diff --git a/tests/validate-plugin.sh b/tests/validate-plugin.sh index 41ff917..79ed658 100755 --- a/tests/validate-plugin.sh +++ b/tests/validate-plugin.sh @@ -127,6 +127,72 @@ validate_plugin() { done fi + # Check profiles + if [[ -d "$plugin_dir/profiles" ]]; then + for profile_dir in "$plugin_dir"/profiles/*/; do + [[ -d "$profile_dir" ]] || continue + local profile_name + profile_name="$(basename "$profile_dir")" + local profile_md="$profile_dir/PROFILE.md" + + if [[ ! 
-f "$profile_md" ]]; then + error "$plugin_name/profiles/$profile_name: missing PROFILE.md" + continue + fi + + if ! head -1 "$profile_md" | grep -q '^---$'; then + error "$plugin_name/profiles/$profile_name: PROFILE.md missing YAML frontmatter" + continue + fi + + local frontmatter + frontmatter="$(awk 'NR==1{next} /^---$/{exit} {print}' "$profile_md")" + + # Check required frontmatter fields + if ! echo "$frontmatter" | grep -q '^name:'; then + error "$plugin_name/profiles/$profile_name: PROFILE.md frontmatter missing 'name'" + fi + if ! echo "$frontmatter" | grep -q '^description:'; then + error "$plugin_name/profiles/$profile_name: PROFILE.md frontmatter missing 'description'" + fi + if ! echo "$frontmatter" | grep -q '^version:'; then + error "$plugin_name/profiles/$profile_name: PROFILE.md frontmatter missing 'version'" + fi + if ! echo "$frontmatter" | grep -q '^strategy:'; then + error "$plugin_name/profiles/$profile_name: PROFILE.md frontmatter missing 'strategy'" + fi + if ! echo "$frontmatter" | grep -q '^autonomy:'; then + error "$plugin_name/profiles/$profile_name: PROFILE.md frontmatter missing 'autonomy'" + fi + + # Check KPI definitions exist + if ! echo "$frontmatter" | grep -q '^kpis:'; then + error "$plugin_name/profiles/$profile_name: PROFILE.md frontmatter missing 'kpis'" + else + # Check each KPI has required subfields + local kpi_count + kpi_count="$(echo "$frontmatter" | grep -c '^ *- name:' || true)" + if [[ "$kpi_count" -eq 0 ]]; then + error "$plugin_name/profiles/$profile_name: PROFILE.md has 'kpis' but no KPI entries" + else + # Check direction is present for each KPI + local dir_count + dir_count="$(echo "$frontmatter" | grep -c '^ *direction:' || true)" + if [[ "$dir_count" -lt "$kpi_count" ]]; then + warn "$plugin_name/profiles/$profile_name: some KPIs missing 'direction' field" + fi + local unit_count + unit_count="$(echo "$frontmatter" | grep -c '^ *unit:' || true)" + if [[ "$unit_count" -lt "$kpi_count" ]]; then + warn "$plugin_name/profiles/$profile_name: some KPIs missing 'unit' field" + fi + fi + fi + + ok "$plugin_name/profiles/$profile_name: PROFILE.md frontmatter valid" + done + fi + # Check hooks local hooks_json="$plugin_dir/hooks/hooks.json" if [[ -f "$hooks_json" ]]; then diff --git a/tests/validate-references.sh b/tests/validate-references.sh index 2b362cb..d26acf5 100755 --- a/tests/validate-references.sh +++ b/tests/validate-references.sh @@ -69,6 +69,30 @@ for plugin_dir in "$REPO_ROOT"/*/; do done < <(grep -oE '(assets|scripts)/[a-zA-Z0-9_./-]+\.[a-z]+' "$skill_md" | sort -u || true) done + # Check profile references (profiles follow the same reference pattern as skills) + if [[ -d "$plugin_dir/profiles" ]]; then + for profile_dir in "$plugin_dir"/profiles/*/; do + [[ -d "$profile_dir" ]] || continue + prof_name="$(basename "$profile_dir")" + prof_md="$profile_dir/PROFILE.md" + + [[ -f "$prof_md" ]] || continue + + while IFS= read -r ref_path; do + clean_path="$(echo "$ref_path" | sed 's/[`"'"'"']//g; s/[),;]$//')" + [[ "$clean_path" == *"{lang}"* ]] && continue + [[ "$clean_path" == *"{LANG}"* ]] && continue + + full_path="$profile_dir/$clean_path" + if [[ -f "$full_path" ]]; then + ok "$plugin_name/profiles/$prof_name: $clean_path exists" + else + error "$plugin_name/profiles/$prof_name: broken reference '$clean_path' (resolved to $full_path)" + fi + done < <(grep -oE 'references/[a-zA-Z0-9_./-]+\.[a-z]+' "$prof_md" | sort -u || true) + done + fi + # Check markdown links in plugin-level files (README.md, CONNECTORS.md) for md_file in 
"$plugin_dir"/*.md; do [[ -f "$md_file" ]] || continue