Skip to content

Commit ae8a24d

Browse files
committed
feat(runner): add support for running and repairing tests
This commit introduces the ability to run tests against the generated code as part of the evaluation process. A new optional `testCommand` can be specified in the environment configuration. If provided, this command will be executed after a successful build. If the tests fail, the tool will attempt to repair the code using the LLM, similar to how build failures are handled. The number of repair attempts is configurable. The report has been updated to display the test results for each run, including whether the tests passed, failed, or passed after repair. The summary view also includes aggregated statistics about the test results.
1 parent 4145aca commit ae8a24d

22 files changed

+474
-108
lines changed

docs/environment-reference.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,3 +179,8 @@ Defaults to `<package manager> run build`.
179179

180180
Command used to start a local dev server as a part of the evaluation.
181181
Defaults to `<package manager> run start --port 0`.
182+
183+
### `testCommand`
184+
185+
Command used to run tests against the generated code. If this property is not provided, tests will not be run. The command should exit with code 0 on success and a non-zero exit code on failure. The output from the command (both `stdout` and `stderr`) is captured and used for repair attempts if the tests fail. The test command will time out after 2 minutes.
186+

report-app/src/app/pages/report-viewer/report-viewer.html

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,20 @@ <h3 class="chart-title">
7373
<stacked-bar-chart [data]="buildsAsGraphData(overview.stats.builds)" [compact]="true" />
7474
</div>
7575
</div>
76+
@if (overview.stats.tests) {
77+
<div class="chart-container test-results-details">
78+
<h3 class="chart-title">
79+
<span class="material-symbols-outlined"> quiz </span>
80+
<span>Tests</span>
81+
</h3>
82+
<div class="summary-card-item">
83+
<stacked-bar-chart
84+
[data]="testsAsGraphData(overview.stats.tests)"
85+
[compact]="true"
86+
/>
87+
</div>
88+
</div>
89+
}
7690
@if (overview.stats.runtime) {
7791
<div class="chart-container">
7892
<h3 class="chart-title">
@@ -281,9 +295,19 @@ <h2>Generated applications</h2>
281295
<span class="status-badge error">Initial build failed</span>
282296
}
283297

284-
@if (hasBuildFailureDuringA11yRepair(result)) {
298+
@if (hasBuildFailureDuringTestRepair(result)) {
285299
<span class="status-badge error">Build failed after a11y repair</span>
286300
}
301+
<!-- Test status badges -->
302+
@if (finalAttempt.testResult) {
303+
@if (finalAttempt.testResult.passed) {
304+
@if ((result.testRepairAttempts || 0) > 0) {
305+
<span class="status-badge warning">Tests passed after repair</span>
306+
}
307+
} @else {
308+
<span class="status-badge error">Tests failed</span>
309+
}
310+
}
287311
</div>
288312
</div>
289313
</expansion-panel-header>
@@ -355,12 +379,36 @@ <h5>
355379
</div>
356380
</div>
357381

382+
@if (result.testResult) {
383+
<div class="app-details-section">
384+
<h4>Test Results</h4>
385+
<div class="test-summary">
386+
@if (result.testResult.passed) {
387+
<span class="status-text success">✔ Tests passed</span>
388+
@if ((result.testRepairAttempts || 0) > 0) {
389+
<span class="status-text">&nbsp;after {{ result.testRepairAttempts }} repair attempt(s)</span>
390+
}
391+
} @else {
392+
<span class="status-text error">✘ Tests failed</span>
393+
}
394+
</div>
395+
396+
@if (result.testResult.output && !result.testResult.passed) {
397+
<details class="test-output-button">
398+
<summary class="neutral-button">See Test Output</summary>
399+
<pre class="callout neutral code">{{ result.testResult.output }}</pre>
400+
</details>
401+
}
402+
</div>
403+
}
404+
358405
<div class="app-details-section">
359406
<h4>Additional info</h4>
360407
@for (attempt of result.attemptDetails; track attempt) {
361408
@let isBuilt = attempt.buildResult.status === 'success';
362409
@let axeViolations = attempt.serveTestingResult?.axeViolations;
363410
@let hasAxeViolations = axeViolations && axeViolations.length > 0;
411+
@let testsFailed = attempt.testResult?.passed === false;
364412

365413
<expansion-panel #expansionPanel>
366414
<expansion-panel-header>
@@ -385,6 +433,15 @@ <h4>Additional info</h4>
385433
>A11y</span
386434
>
387435
}
436+
437+
@if (attempt.testResult) {
438+
<span
439+
class="status-badge"
440+
[class.error]="!attempt.testResult.passed"
441+
[class.success]="attempt.testResult.passed"
442+
>Tests</span
443+
>
444+
}
388445
</expansion-panel-header>
389446

390447
@if (expansionPanel.opened()) {
@@ -421,6 +478,11 @@ <h4>A11y Violations</h4>
421478
</pre>
422479
}
423480

481+
@if (testsFailed) {
482+
<h4>Failed Tests</h4>
483+
<pre class="callout neutral code">{{ attempt.testResult?.output }}</pre>
484+
}
485+
424486
<h4>Generated Code</h4>
425487

426488
@for (file of attempt.outputFiles; track file) {

report-app/src/app/pages/report-viewer/report-viewer.ts

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import {
2525
LlmResponseFile,
2626
RunInfo,
2727
RunSummaryBuilds,
28+
RunSummaryTests,
2829
RuntimeStats,
2930
ScoreBucket,
3031
SkippedIndividualAssessment,
@@ -271,6 +272,31 @@ export class ReportViewer {
271272
];
272273
}
273274

275+
protected testsAsGraphData(tests: RunSummaryTests): StackedBarChartData {
276+
return [
277+
{
278+
label: 'Passed',
279+
color: ScoreCssVariable.excellent,
280+
value: tests.successfulInitialTests,
281+
},
282+
{
283+
label: 'Passed after repair',
284+
color: ScoreCssVariable.great,
285+
value: tests.successfulTestsAfterRepair,
286+
},
287+
{
288+
label: 'Failed',
289+
color: ScoreCssVariable.poor,
290+
value: tests.failedTests,
291+
},
292+
{
293+
label: 'No tests run',
294+
color: ScoreCssVariable.neutral,
295+
value: tests.noTestsRun,
296+
},
297+
];
298+
}
299+
274300
protected checksAsGraphData(buckets: ScoreBucket[]): StackedBarChartData {
275301
return buckets.map(b => ({
276302
label: b.nameWithLabels,
@@ -427,7 +453,7 @@ export class ReportViewer {
427453
return `wcs run --prompt=${result.promptDef.name} --env=<path to ${report.details.summary.environmentId} config>`;
428454
}
429455

430-
protected hasBuildFailureDuringA11yRepair(result: AssessmentResult): boolean {
431-
return result.attemptDetails.some(attempt => attempt.buildFailedDuringA11yRepair);
456+
protected hasBuildFailureDuringTestRepair(result: AssessmentResult): boolean {
457+
return result.attemptDetails.some(attempt => attempt.buildFailedDuringTestRepair);
432458
}
433459
}

runner/configuration/base-environment-config.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import z from 'zod';
22
import {ratingSchema} from '../ratings/rating-types.js';
33
import {MultiStepPrompt} from './multi-step-prompt.js';
4-
import {mcpServerOptionsSchema} from '../codegen/llm-runner.js';
5-
import {getPossiblePackageManagers} from './environment-config.js';
64

75
export const baseEnvironmentConfigSchema = z.strictObject({
86
/** Display name for the environment. */

runner/configuration/environment-config.ts

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,6 @@ const environmentConfigSchema = z.union([
1515
*/
1616
export type EnvironmentConfig = z.infer<typeof environmentConfigSchema>;
1717

18-
/** Package managers that are currently supported. */
19-
export function getPossiblePackageManagers() {
20-
return ['npm', 'pnpm', 'yarn'] as const;
21-
}
22-
2318
/** Asserts that the specified data is a valid environment config. */
2419
export function assertIsEnvironmentConfig(value: unknown): asserts value is EnvironmentConfig {
2520
const validationResult = environmentConfigSchema.safeParse(value);

runner/configuration/environment-local.ts

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import z from 'zod';
33
import {LlmRunner, McpServerOptions, mcpServerOptionsSchema} from '../codegen/llm-runner.js';
44
import {LocalGateway} from '../orchestration/gateways/local_gateway.js';
55
import {BaseEnvironment} from './base-environment.js';
6-
import {EnvironmentConfig, getPossiblePackageManagers} from './environment-config.js';
6+
import {getPossiblePackageManagers} from './package-managers.js';
77
import {baseEnvironmentConfigSchema} from './base-environment-config.js';
88

99
export const localEnvironmentConfigSchema = baseEnvironmentConfigSchema.extend({
@@ -28,6 +28,10 @@ export const localEnvironmentConfigSchema = baseEnvironmentConfigSchema.extend({
2828
* Defaults to `<package manager> run start --port 0`.
2929
*/
3030
serveCommand: z.string().optional(),
31+
/**
32+
* Command to run when testing the code. If omitted, tests are not run.
33+
*/
34+
testCommand: z.string().optional(),
3135
/**
3236
* Whether to skip installing dependencies when running evals in the environment.
3337
* Useful if you're managing dependencies yourself.
@@ -47,6 +51,8 @@ export class LocalEnvironment extends BaseEnvironment {
4751
readonly buildCommand: string;
4852
/** Command to run when starting a development server inside the app. */
4953
readonly serveCommand: string;
54+
/** Command to run tests inside the app. `null` when no test command is configured. */
55+
readonly testCommand: string | null;
5056
/**
5157
* Absolute path at which files specific to this environment are located. Will be merged in
5258
* with the files from the `projectTemplatePath` to get the final project structure.
@@ -82,6 +88,7 @@ export class LocalEnvironment extends BaseEnvironment {
8288
this.installCommand = `${packageManager} install --silent`;
8389
this.buildCommand = config.buildCommand || `${packageManager} run build`;
8490
this.serveCommand = config.serveCommand || this.getDefaultServeCommand(packageManager);
91+
this.testCommand = config.testCommand ?? null;
8592
this.projectTemplatePath = projectTemplatePath;
8693
this.sourceDirectory = sourceDirectory;
8794
this.mcpServerOptions = config.mcpServers || [];
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
/** Package managers that are currently supported. */
2+
export function getPossiblePackageManagers() {
3+
return ['npm', 'pnpm', 'yarn'] as const;
4+
}

runner/eval-cli.ts

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ interface Options {
3636
enableUserJourneyTesting?: boolean;
3737
enableAutoCsp?: boolean;
3838
autoraterModel?: string;
39-
a11yRepairAttempts?: number;
39+
testRepairAttempts?: number;
4040
logging?: 'text-only' | 'dynamic';
4141
}
4242

@@ -148,10 +148,11 @@ function builder(argv: Argv): Argv<Options> {
148148
default: DEFAULT_AUTORATER_MODEL_NAME,
149149
description: 'Model to use when automatically rating generated code',
150150
})
151-
.option('a11y-repair-attempts', {
151+
.option('test-repair-attempts', {
152152
type: 'number',
153153
default: 0,
154-
description: 'Number of repair attempts for discovered a11y violations',
154+
description:
155+
'Number of repair attempts for discovered test failures (including a11y violations and ones from testCommand)',
155156
})
156157
.strict()
157158
.version(false)
@@ -196,7 +197,7 @@ async function handler(cliArgs: Arguments<Options>): Promise<void> {
196197
logging: cliArgs.logging,
197198
autoraterModel: cliArgs.autoraterModel,
198199
skipAiSummary: cliArgs.skipAiSummary,
199-
a11yRepairAttempts: cliArgs.a11yRepairAttempts,
200+
testRepairAttempts: cliArgs.testRepairAttempts,
200201
});
201202

202203
logReportToConsole(runInfo);

0 commit comments

Comments
 (0)