From 80c5b0bbddb1b400f02f50152df55d07d3fca6c1 Mon Sep 17 00:00:00 2001 From: NagyVikt Date: Wed, 22 Apr 2026 23:47:45 +0200 Subject: [PATCH] Expose a built-in session severity rubric for Guardex lanes The repo needed a first-class CLI surface for scoring session health without relying on side scripts. This threads a native report subcommand through the existing help, parsing, and shared context paths and locks the behavior with focused regression coverage. Constraint: Keep the scoring flow inside the existing report command surface and T1 notes-only OpenSpec lane Rejected: Ship a standalone side script | would drift from CLI parsing and help text Confidence: high Scope-risk: narrow Directive: Keep help text, rubric weights, and report tests synchronized when the scoring model changes Tested: node --test test/cli-args-dispatch.test.js test/report.test.js; node bin/multiagent-safety.js report help; node bin/multiagent-safety.js report session-severity --task-size narrow-patch --tokens 3850000 --exec-count 18 --write-stdin-count 6 --completion-before-tail yes --fragmentation 14 --finish-path 6 --post-proof 4; git diff --check Not-tested: remote PR merge path until guarded finish completes --- .../notes.md | 10 + src/cli/args.js | 72 ++++++ src/cli/main.js | 17 +- src/context.js | 2 +- src/report/session-severity.js | 213 ++++++++++++++++++ test/cli-args-dispatch.test.js | 39 ++++ test/report.test.js | 63 ++++++ 7 files changed, 414 insertions(+), 2 deletions(-) create mode 100644 openspec/changes/agent-codex-add-session-severity-scoring-command-2026-04-22-23-28/notes.md create mode 100644 src/report/session-severity.js diff --git a/openspec/changes/agent-codex-add-session-severity-scoring-command-2026-04-22-23-28/notes.md b/openspec/changes/agent-codex-add-session-severity-scoring-command-2026-04-22-23-28/notes.md new file mode 100644 index 00000000..33cf06a1 --- /dev/null +++ b/openspec/changes/agent-codex-add-session-severity-scoring-command-2026-04-22-23-28/notes.md @@ -0,0 +1,10 @@ +# agent-codex-add-session-severity-scoring-command-2026-04-22-23-28 (minimal / T1) + +- Add `gx report session-severity` as a native GitGuardex report subcommand with the fixed weighted rubric for healthy / mildly fragmented / inefficient / runaway / catastrophic sessions. +- Keep the scoring logic in a small report module and thread it through the existing `gx report` help, parsing, and output surface instead of shipping a repo-local side script. +- Lock the new report surface with focused CLI arg parsing and report integration tests. +- Verification: + - `node --test test/cli-args-dispatch.test.js test/report.test.js` + - `node bin/multiagent-safety.js report help` + - `node bin/multiagent-safety.js report session-severity --task-size narrow-patch --tokens 3850000 --exec-count 18 --write-stdin-count 6 --completion-before-tail yes --fragmentation 14 --finish-path 6 --post-proof 4` + - `git diff --check` diff --git a/src/cli/args.js b/src/cli/args.js index e7bf9f92..599e623c 100644 --- a/src/cli/args.js +++ b/src/cli/args.js @@ -349,6 +349,15 @@ function parseReportArgs(rawArgs) { scorecardJson: '', outputDir: '', date: '', + taskSize: '', + tokens: '', + execCount: '', + writeStdinCount: '', + completionBeforeTail: '', + expectedBound: '', + fragmentation: '', + finishPath: '', + postProof: '', dryRun: false, json: false, }; @@ -390,6 +399,69 @@ function parseReportArgs(rawArgs) { index += 1; continue; } + if (arg === '--task-size') { + const next = rawArgs[index + 1]; + if (!next) throw new Error('--task-size requires a value'); + options.taskSize = next; + index += 1; + continue; + } + if (arg === '--tokens') { + const next = rawArgs[index + 1]; + if (!next) throw new Error('--tokens requires a value'); + options.tokens = next; + index += 1; + continue; + } + if (arg === '--exec-count') { + const next = rawArgs[index + 1]; + if (!next) throw new Error('--exec-count requires a value'); + options.execCount = next; + index += 1; + continue; + } + if (arg === '--write-stdin-count') { + const next = rawArgs[index + 1]; + if (!next) throw new Error('--write-stdin-count requires a value'); + options.writeStdinCount = next; + index += 1; + continue; + } + if (arg === '--completion-before-tail') { + const next = rawArgs[index + 1]; + if (!next) throw new Error('--completion-before-tail requires yes or no'); + options.completionBeforeTail = next; + index += 1; + continue; + } + if (arg === '--expected-bound') { + const next = rawArgs[index + 1]; + if (!next) throw new Error('--expected-bound requires a value'); + options.expectedBound = next; + index += 1; + continue; + } + if (arg === '--fragmentation') { + const next = rawArgs[index + 1]; + if (!next) throw new Error('--fragmentation requires a value'); + options.fragmentation = next; + index += 1; + continue; + } + if (arg === '--finish-path') { + const next = rawArgs[index + 1]; + if (!next) throw new Error('--finish-path requires a value'); + options.finishPath = next; + index += 1; + continue; + } + if (arg === '--post-proof') { + const next = rawArgs[index + 1]; + if (!next) throw new Error('--post-proof requires a value'); + options.postProof = next; + index += 1; + continue; + } if (arg === '--dry-run') { options.dryRun = true; continue; diff --git a/src/cli/main.js b/src/cli/main.js index 56e99e90..434c55ff 100755 --- a/src/cli/main.js +++ b/src/cli/main.js @@ -5,6 +5,7 @@ const sandboxModule = require('../sandbox'); const toolchainModule = require('../toolchain'); const finishCommands = require('../finish'); const doctorModule = require('../doctor'); +const sessionSeverityReport = require('../report/session-severity'); const { fs, path, @@ -2433,15 +2434,29 @@ function report(rawArgs) { console.log( `${TOOL_NAME} report commands:\n` + ` ${TOOL_NAME} report scorecard [--target ] [--repo github.com//] [--scorecard-json ] [--output-dir ] [--date YYYY-MM-DD] [--dry-run] [--json]\n` + + ` ${TOOL_NAME} report session-severity --task-size --tokens --exec-count --write-stdin-count --completion-before-tail [--expected-bound ] [--fragmentation ] [--finish-path ] [--post-proof ] [--json]\n` + `\n` + `Examples:\n` + ` ${TOOL_NAME} report scorecard --repo github.com/recodeecom/multiagent-safety\n` + - ` ${TOOL_NAME} report scorecard --scorecard-json ./scorecard.json --date 2026-04-10`, + ` ${TOOL_NAME} report scorecard --scorecard-json ./scorecard.json --date 2026-04-10\n` + + ` ${TOOL_NAME} report session-severity --task-size narrow-patch --tokens 3850000 --exec-count 18 --write-stdin-count 6 --completion-before-tail yes --fragmentation 14 --finish-path 6 --post-proof 4`, ); process.exitCode = 0; return; } + if (subcommand === 'session-severity') { + const payload = sessionSeverityReport.buildSessionSeverityReport(options); + if (options.json) { + process.stdout.write(`${JSON.stringify(payload, null, 2)}\n`); + process.exitCode = 0; + return; + } + console.log(sessionSeverityReport.renderSessionSeverityReport(payload)); + process.exitCode = 0; + return; + } + if (subcommand !== 'scorecard') { throw new Error(`Unknown report subcommand: ${subcommand}`); } diff --git a/src/context.js b/src/context.js index 1d667b6a..cf2f21ce 100644 --- a/src/context.js +++ b/src/context.js @@ -361,7 +361,7 @@ const CLI_COMMAND_DESCRIPTIONS = [ ['release', 'Create or update the current GitHub release with README-generated notes'], ['agents', 'Start/stop repo-scoped review + cleanup bots'], ['prompt', 'Print AI setup checklist or named slices (--exec, --part, --list-parts, --snippet)'], - ['report', 'Security/safety reports (e.g. OpenSSF scorecard)'], + ['report', 'Security/safety reports (e.g. OpenSSF scorecard, session severity)'], ['help', 'Show this help output'], ['version', 'Print GitGuardex version'], ]; diff --git a/src/report/session-severity.js b/src/report/session-severity.js new file mode 100644 index 00000000..809c0e57 --- /dev/null +++ b/src/report/session-severity.js @@ -0,0 +1,213 @@ +const TASK_SIZE_UPPER_BOUNDS = { + 'narrow-patch': 1_800_000, + 'medium-change': 4_000_000, + 'large-change': 8_000_000, +}; + +const TASK_SIZE_VALUES = new Set(Object.keys(TASK_SIZE_UPPER_BOUNDS)); +const FRAGMENTATION_PRESET_SCORES = { + clean: 0, + 'few-extra-checks': 5, + 'repeated-follow-ups': 10, + looping: 18, + 'dominant-loop': 25, +}; +const FINISH_PATH_PRESET_SCORES = { + 'clear-early': 0, + 'minor-hesitation': 5, + 'late-decision': 10, + reopening: 15, +}; +const POST_PROOF_PRESET_SCORES = { + 'stops-soon': 0, + 'small-tail': 5, + 'notable-tail': 10, + 'heavy-tail': 15, +}; +const DRIVER_TIE_BREAK = ['fragmentation', 'writeStdin', 'finishPath', 'postProof', 'cost']; +const DRIVER_LABELS = { + cost: 'cost vs expected scope', + fragmentation: 'turn fragmentation', + writeStdin: 'write_stdin churn', + finishPath: 'finish-path discipline', + postProof: 'post-proof drift', +}; + +function parseRequiredPositiveInteger(name, rawValue, { allowZero = true } = {}) { + const parsed = Number.parseInt(String(rawValue || ''), 10); + if (!Number.isFinite(parsed) || (!allowZero && parsed <= 0) || (allowZero && parsed < 0)) { + throw new Error(`${name} requires ${allowZero ? 'a non-negative integer' : 'a positive integer'} value`); + } + return parsed; +} + +function parseBooleanFlag(name, rawValue) { + const normalized = String(rawValue || '').trim().toLowerCase(); + if (normalized === 'yes' || normalized === 'true' || normalized === '1') { + return true; + } + if (normalized === 'no' || normalized === 'false' || normalized === '0') { + return false; + } + throw new Error(`${name} requires yes/no (or true/false, 1/0)`); +} + +function clampScore(value, min, max) { + return Math.max(min, Math.min(max, Math.round(value))); +} + +function parseTaskSize(rawTaskSize) { + const normalized = String(rawTaskSize || '').trim(); + if (!TASK_SIZE_VALUES.has(normalized)) { + throw new Error(`--task-size must be one of: ${Array.from(TASK_SIZE_VALUES).join(', ')}`); + } + return normalized; +} + +function resolveExpectedUpperBound(taskSize, rawExpectedBound) { + if (rawExpectedBound) { + return parseRequiredPositiveInteger('--expected-bound', rawExpectedBound, { allowZero: false }); + } + return TASK_SIZE_UPPER_BOUNDS[taskSize]; +} + +function scoreCost(tokens, expectedUpperBound) { + const ratio = tokens / expectedUpperBound; + if (ratio <= 1.0) return 0; + if (ratio <= 1.5) return 5; + if (ratio <= 2.5) return 10; + if (ratio <= 4.0) return 18; + if (ratio <= 6.0) return 24; + return 30; +} + +function scoreFragmentation(execCount, override) { + if (override) { + if (Object.prototype.hasOwnProperty.call(FRAGMENTATION_PRESET_SCORES, override)) { + return FRAGMENTATION_PRESET_SCORES[override]; + } + return clampScore(parseRequiredPositiveInteger('--fragmentation', override), 0, 25); + } + if (execCount <= 4) return 0; + if (execCount <= 8) return 5; + if (execCount <= 16) return 10; + if (execCount <= 28) return 18; + return 25; +} + +function scoreWriteStdin(writeStdinCount) { + if (writeStdinCount <= 0) return 0; + if (writeStdinCount <= 3) return 5; + if (writeStdinCount <= 6) return 10; + return 15; +} + +function scoreFinishPath(completionBeforeTail, override) { + if (override) { + if (Object.prototype.hasOwnProperty.call(FINISH_PATH_PRESET_SCORES, override)) { + return FINISH_PATH_PRESET_SCORES[override]; + } + return clampScore(parseRequiredPositiveInteger('--finish-path', override), 0, 15); + } + return completionBeforeTail ? 0 : 5; +} + +function scorePostProof(completionBeforeTail, override) { + if (override) { + if (Object.prototype.hasOwnProperty.call(POST_PROOF_PRESET_SCORES, override)) { + return POST_PROOF_PRESET_SCORES[override]; + } + return clampScore(parseRequiredPositiveInteger('--post-proof', override), 0, 15); + } + return completionBeforeTail ? 0 : 10; +} + +function labelForTotal(total) { + if (total <= 15) return 'Healthy'; + if (total <= 30) return 'Mildly fragmented'; + if (total <= 50) return 'Inefficient'; + if (total <= 75) return 'Runaway'; + return 'Catastrophic'; +} + +function buildSessionSeverityReport(options) { + const taskSize = parseTaskSize(options.taskSize); + const tokens = parseRequiredPositiveInteger('--tokens', options.tokens); + const execCount = parseRequiredPositiveInteger('--exec-count', options.execCount); + const writeStdinCount = parseRequiredPositiveInteger('--write-stdin-count', options.writeStdinCount); + const completionBeforeTail = parseBooleanFlag('--completion-before-tail', options.completionBeforeTail); + const expectedUpperBound = resolveExpectedUpperBound(taskSize, options.expectedBound); + const costRatio = tokens / expectedUpperBound; + const scores = { + cost: scoreCost(tokens, expectedUpperBound), + fragmentation: scoreFragmentation(execCount, options.fragmentation), + writeStdin: scoreWriteStdin(writeStdinCount), + finishPath: scoreFinishPath(completionBeforeTail, options.finishPath), + postProof: scorePostProof(completionBeforeTail, options.postProof), + }; + const total = scores.cost + scores.fragmentation + scores.writeStdin + scores.finishPath + scores.postProof; + const label = labelForTotal(total); + const rankedDimensions = Object.entries(scores) + .map(([key, score]) => ({ key, score, label: DRIVER_LABELS[key] })) + .filter((entry) => entry.score > 0) + .sort((left, right) => { + if (right.score !== left.score) { + return right.score - left.score; + } + return DRIVER_TIE_BREAK.indexOf(left.key) - DRIVER_TIE_BREAK.indexOf(right.key); + }); + const primaryDriver = rankedDimensions[0] ? rankedDimensions[0].label : 'none'; + const secondaries = rankedDimensions.slice(1).map((entry) => entry.label); + + return { + taskSize, + expectedUpperBound, + tokens, + execCount, + writeStdinCount, + completionBeforeTail, + costRatio, + scores: { + ...scores, + total, + }, + label, + primaryDriver, + secondaries, + outputLine: `Score ${total}/100 — ${label}. Primary: ${primaryDriver}. Secondaries: ${ + secondaries.length > 0 ? secondaries.join(', ') : 'none' + }.`, + }; +} + +function renderSessionSeverityReport(report) { + return [ + report.outputLine, + '', + `Task size: ${report.taskSize}`, + `Expected upper bound: ${report.expectedUpperBound}`, + `Actual tokens: ${report.tokens}`, + `Exec count: ${report.execCount}`, + `write_stdin count: ${report.writeStdinCount}`, + `Completion before tail churn: ${report.completionBeforeTail ? 'yes' : 'no'}`, + `Cost ratio: ${report.costRatio.toFixed(2)}x`, + '', + `A. Cost vs expected scope: ${report.scores.cost}`, + `B. Turn fragmentation: ${report.scores.fragmentation}`, + `C. write_stdin churn: ${report.scores.writeStdin}`, + `D. Finish-path discipline: ${report.scores.finishPath}`, + `E. Post-proof drift: ${report.scores.postProof}`, + '', + `Total: ${report.scores.total}`, + `Label: ${report.label}`, + `Primary driver: ${report.primaryDriver}`, + `Secondary drivers: ${report.secondaries.length > 0 ? report.secondaries.join(', ') : 'none'}`, + ].join('\n'); +} + +module.exports = { + TASK_SIZE_UPPER_BOUNDS, + buildSessionSeverityReport, + renderSessionSeverityReport, + labelForTotal, +}; diff --git a/test/cli-args-dispatch.test.js b/test/cli-args-dispatch.test.js index b71a2b02..9f1b08a2 100644 --- a/test/cli-args-dispatch.test.js +++ b/test/cli-args-dispatch.test.js @@ -15,6 +15,7 @@ const { parseSetupArgs, parseDoctorArgs, parseAgentsArgs, + parseReportArgs, parseCleanupArgs, parseMergeArgs, parseFinishArgs, @@ -111,6 +112,43 @@ test('parseAgentsArgs applies interval overrides and validates the subcommand', }); }); +test('parseReportArgs accepts the session-severity flag set', () => { + const options = parseReportArgs([ + 'session-severity', + '--task-size', + 'medium-change', + '--tokens', + '2100000', + '--exec-count', + '12', + '--write-stdin-count', + '4', + '--completion-before-tail', + 'no', + '--expected-bound', + '4000000', + '--fragmentation', + '10', + '--finish-path', + 'late-decision', + '--post-proof', + 'heavy-tail', + '--json', + ]); + + assert.equal(options.subcommand, 'session-severity'); + assert.equal(options.taskSize, 'medium-change'); + assert.equal(options.tokens, '2100000'); + assert.equal(options.execCount, '12'); + assert.equal(options.writeStdinCount, '4'); + assert.equal(options.completionBeforeTail, 'no'); + assert.equal(options.expectedBound, '4000000'); + assert.equal(options.fragmentation, '10'); + assert.equal(options.finishPath, 'late-decision'); + assert.equal(options.postProof, 'heavy-tail'); + assert.equal(options.json, true); +}); + test('parseCleanupArgs defaults idle minutes when watch mode is enabled', () => { const options = parseCleanupArgs(['--watch']); assert.equal(options.watch, true); @@ -178,6 +216,7 @@ test('shared context keeps the drift-prone help text, gitignore paths, and relea assert.ok(MANAGED_GITIGNORE_PATHS.includes('!.vscode/')); assert.ok(MANAGED_GITIGNORE_PATHS.includes('.vscode/*')); assert.ok(MANAGED_GITIGNORE_PATHS.includes('!.vscode/settings.json')); + assert.match(descriptions.get('report'), /session severity/); assert.equal(MAINTAINER_RELEASE_REPO, repoRoot); }); diff --git a/test/report.test.js b/test/report.test.js index 73a17a4d..1f44a93b 100644 --- a/test/report.test.js +++ b/test/report.test.js @@ -99,4 +99,67 @@ exit 1 assert.match(remediation, /Verification loop/); }); +test('report session-severity prints the weighted rubric summary', () => { + const repoDir = initRepo(); + const result = runNode([ + 'report', + 'session-severity', + '--task-size', + 'narrow-patch', + '--tokens', + '3850000', + '--exec-count', + '18', + '--write-stdin-count', + '6', + '--completion-before-tail', + 'yes', + '--fragmentation', + '14', + '--finish-path', + '6', + '--post-proof', + '4', + ], repoDir); + + assert.equal(result.status, 0, result.stderr || result.stdout); + assert.match(result.stdout, /Score 44\/100 — Inefficient\./); + assert.match(result.stdout, /Primary: turn fragmentation\./); + assert.match(result.stdout, /Secondaries: write_stdin churn, cost vs expected scope, finish-path discipline, post-proof drift\./); + assert.match(result.stdout, /A\. Cost vs expected scope: 10/); + assert.match(result.stdout, /Total: 44/); +}); + +test('report session-severity emits structured JSON when requested', () => { + const repoDir = initRepo(); + const result = runNode([ + 'report', + 'session-severity', + '--task-size', + 'medium-change', + '--tokens', + '2100000', + '--exec-count', + '12', + '--write-stdin-count', + '4', + '--completion-before-tail', + 'no', + '--fragmentation', + '10', + '--finish-path', + '10', + '--post-proof', + '10', + '--json', + ], repoDir); + + assert.equal(result.status, 0, result.stderr || result.stdout); + const payload = JSON.parse(result.stdout); + assert.equal(payload.taskSize, 'medium-change'); + assert.equal(payload.scores.total, 40); + assert.equal(payload.label, 'Inefficient'); + assert.equal(payload.primaryDriver, 'turn fragmentation'); +}); + });