Skip to content

Commit 877d195

Browse files
committed
feat(runner): add support for running and repairing tests
This commit introduces the ability to run tests against the generated code as part of the evaluation process. A new optional `testCommand` can be set in the environment configuration. If provided, this command will be executed after a successful build. If the tests fail, the tool will attempt to repair the code using the LLM, similar to how build failures are handled. The number of repair attempts is configurable. The report has been updated to display the test results for each run, including whether the tests passed, failed, or passed after repair. The summary view also includes aggregated statistics about the test results.
1 parent fa629ce commit 877d195

22 files changed

+640
-5
lines changed

docs/environment-reference.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,3 +179,8 @@ Defaults to `<package manager> run build`.
179179

180180
Command used to start a local dev server as a part of the evaluation.
181181
Defaults to `<package manager> run start --port 0`.
182+
183+
### `testCommand`
184+
185+
Command used to run tests against the generated code. If this property is not provided, tests will not be run. The command should exit with code 0 on success and a non-zero exit code on failure. The output from the command (both `stdout` and `stderr`) is captured and used for repair attempts if the tests fail. The test command will time out after 2 minutes.
186+

report-app/src/app/pages/report-viewer/report-viewer.html

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,20 @@ <h3 class="chart-title">
7676
/>
7777
</div>
7878
</div>
79+
@if (overview.stats.tests) {
80+
<div class="chart-container test-results-details">
81+
<h3 class="chart-title">
82+
<span class="material-symbols-outlined"> quiz </span>
83+
<span>Tests</span>
84+
</h3>
85+
<div class="summary-card-item">
86+
<stacked-bar-chart
87+
[data]="testsAsGraphData(overview.stats.tests)"
88+
[compact]="true"
89+
/>
90+
</div>
91+
</div>
92+
}
7993
@if (overview.stats.runtime) {
8094
<div class="chart-container">
8195
<h3 class="chart-title">
@@ -273,6 +287,17 @@ <h2>Generated applications</h2>
273287
@if (initialAttempt?.buildResult?.status === 'error') {
274288
<span class="status-badge error">Initial build failed</span>
275289
}
290+
291+
<!-- Test status badges -->
292+
@if (finalAttempt.testResult) {
293+
@if (finalAttempt.testResult.passed) {
294+
@if ((result.testRepairAttempts || 0) > 0) {
295+
<span class="status-badge warning">Tests passed after repair</span>
296+
}
297+
} @else {
298+
<span class="status-badge error">Tests failed</span>
299+
}
300+
}
276301
</div>
277302
</div>
278303
</expansion-panel-header>
@@ -348,6 +373,29 @@ <h5>
348373
</div>
349374
</div>
350375

376+
@if (result.testResult) {
377+
<div class="app-details-section">
378+
<h4>Test Results</h4>
379+
<div class="test-summary">
380+
@if (result.testResult.passed) {
381+
<span class="status-text success">✔ Tests passed</span>
382+
@if ((result.testRepairAttempts || 0) > 0) {
383+
<span class="status-text">after {{ result.testRepairAttempts }} repair attempt(s)</span>
384+
}
385+
} @else {
386+
<span class="status-text error">✘ Tests failed</span>
387+
}
388+
</div>
389+
390+
@if (result.testResult.output && !result.testResult.passed) {
391+
<details class="test-output-button">
392+
<summary class="neutral-button">See Test Output</summary>
393+
<pre class="callout neutral code">{{ result.testResult.output }}</pre>
394+
</details>
395+
}
396+
</div>
397+
}
398+
351399
<div class="app-details-section">
352400
<h4>Additional info</h4>
353401
@for (attempt of result.attemptDetails; track attempt) {
@@ -356,6 +404,7 @@ <h4>Additional info</h4>
356404
attempt.serveTestingResult?.axeViolations;
357405
@let hasAxeViolations =
358406
axeViolations && axeViolations.length > 0;
407+
@let testsFailed = attempt.testResult?.passed === false;
359408

360409
<expansion-panel #expansionPanel>
361410
<expansion-panel-header>
@@ -380,6 +429,15 @@ <h4>Additional info</h4>
380429
>A11y</span
381430
>
382431
}
432+
433+
@if (attempt.testResult) {
434+
<span
435+
class="status-badge"
436+
[class.error]="!attempt.testResult.passed"
437+
[class.success]="attempt.testResult.passed"
438+
>Tests</span
439+
>
440+
}
383441
</expansion-panel-header>
384442

385443
@if (expansionPanel.opened()) {
@@ -418,6 +476,11 @@ <h4>A11y Violations</h4>
418476
</pre>
419477
}
420478

479+
@if (testsFailed) {
480+
<h4>Failed Tests</h4>
481+
<pre class="callout neutral code">{{ attempt.testResult?.output }}</pre>
482+
}
483+
421484
<h4>Generated Code</h4>
422485

423486
@for (file of attempt.outputFiles; track file) {

report-app/src/app/pages/report-viewer/report-viewer.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import {
2323
LlmResponseFile,
2424
RunInfo,
2525
RunSummaryBuilds,
26+
RunSummaryTests,
2627
RuntimeStats,
2728
ScoreBucket,
2829
SkippedIndividualAssessment,
@@ -264,6 +265,31 @@ export class ReportViewer {
264265
];
265266
}
266267

268+
protected testsAsGraphData(tests: RunSummaryTests): StackedBarChartData {
269+
return [
270+
{
271+
label: 'Passed',
272+
color: ScoreCssVariable.excellent,
273+
value: tests.successfulInitialTests,
274+
},
275+
{
276+
label: 'Passed after repair',
277+
color: ScoreCssVariable.great,
278+
value: tests.successfulTestsAfterRepair,
279+
},
280+
{
281+
label: 'Failed',
282+
color: ScoreCssVariable.poor,
283+
value: tests.failedTests,
284+
},
285+
{
286+
label: 'No tests run',
287+
color: ScoreCssVariable.neutral,
288+
value: tests.noTestsRun,
289+
},
290+
];
291+
}
292+
267293
protected checksAsGraphData(buckets: ScoreBucket[]): StackedBarChartData {
268294
return buckets.map(b => ({
269295
label: b.nameWithLabels,

runner/configuration/constants.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@ export const LLM_OUTPUT_DIR = join(rootDir, 'llm-output');
2626
*/
2727
export const DEFAULT_MAX_REPAIR_ATTEMPTS = 1;
2828

29+
/**
30+
* Number of times we'll try to ask the LLM to repair a test failure,
31+
* providing the test output and the code that causes the problem.
32+
*/
33+
export const DEFAULT_MAX_TEST_REPAIR_ATTEMPTS = 1;
34+
2935
/** Name of the folder where we store all generated reports */
3036
export const REPORTS_ROOT_DIR = join(rootDir, 'reports');
3137

runner/configuration/environment-local.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@ export const localEnvironmentConfigSchema = baseEnvironmentConfigSchema.extend({
2828
* Defaults to `<package manager> run start --port 0`.
2929
*/
3030
serveCommand: z.string().optional(),
31+
/**
32+
* Command to run when testing the code.
33+
*/
34+
testCommand: z.string().optional(),
3135
/**
3236
* Whether to skip installing dependencies when running evals in the environment.
3337
* Useful if you're managing dependencies yourself.
@@ -47,6 +51,8 @@ export class LocalEnvironment extends BaseEnvironment {
4751
readonly buildCommand: string;
4852
/** Command to run when starting a development server inside the app. */
4953
readonly serveCommand: string;
54+
/** Command to run when testing the app, or `null` if no test command is configured. */
55+
readonly testCommand: string | null;
5056
/**
5157
* Absolute path at which files specific to this environment are located. Will be merged in
5258
* with the files from the `projectTemplatePath` to get the final project structure.
@@ -82,6 +88,7 @@ export class LocalEnvironment extends BaseEnvironment {
8288
this.installCommand = `${packageManager} install --silent`;
8389
this.buildCommand = config.buildCommand || `${packageManager} run build`;
8490
this.serveCommand = config.serveCommand || this.getDefaultServeCommand(packageManager);
91+
this.testCommand = config.testCommand ?? null;
8592
this.projectTemplatePath = projectTemplatePath;
8693
this.sourceDirectory = sourceDirectory;
8794
this.mcpServerOptions = config.mcpServers || [];

runner/orchestration/build-repair.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ import {EvalID, Gateway} from './gateway.js';
2828
* @param abortSignal An AbortSignal to cancel the operation.
2929
* @param workerConcurrencyQueue The queue for managing worker concurrency.
3030
* @param attempts The current attempt number.
31-
* @param repairType The type of repair being performed.
3231
* @returns A promise that resolves to the new BuildResult.
3332
*/
3433
export async function repairAndBuild(
@@ -49,7 +48,7 @@ export async function repairAndBuild(
4948
): Promise<AttemptDetails> {
5049
const repairResponse = await repairCodeWithAI(
5150
evalID,
52-
gateway,
51+
gateway.repairBuild.bind(gateway),
5352
model,
5453
env,
5554
rootPromptDef,

runner/orchestration/codegen.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ export async function generateCodeWithAI(
8888
*/
8989
export async function repairCodeWithAI(
9090
evalID: EvalID,
91-
gateway: Gateway<Environment>,
91+
repairer: Gateway<Environment>['repairBuild'] | Gateway<Environment>['repairTest'],
9292
model: string,
9393
env: Environment,
9494
promptDef: RootPromptDefinition,
@@ -123,7 +123,7 @@ export async function repairCodeWithAI(
123123

124124
progress.log(promptDef, 'codegen', 'Repairing code with AI');
125125

126-
const response = await gateway.repairBuild(
126+
const response = await repairer(
127127
evalID,
128128
context,
129129
model,

runner/orchestration/gateway.ts

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import {
77
LlmResponse,
88
LlmResponseFile,
99
RootPromptDefinition,
10+
TestResult,
1011
} from '../shared-interfaces.js';
1112
import {BuildResult} from '../workers/builder/builder-types.js';
1213

@@ -35,6 +36,16 @@ export interface Gateway<Env extends Environment> {
3536
abortSignal: AbortSignal,
3637
): Promise<LlmResponse>;
3738

39+
repairTest(
40+
id: EvalID,
41+
requestCtx: LlmGenerateFilesContext,
42+
model: string,
43+
errorMessage: string,
44+
appFiles: LlmResponseFile[],
45+
contextFiles: LlmContextFile[],
46+
abortSignal: AbortSignal,
47+
): Promise<LlmResponse>;
48+
3849
shouldRetryFailedBuilds(evalID: EvalID): boolean;
3950

4051
tryBuild(
@@ -47,6 +58,18 @@ export interface Gateway<Env extends Environment> {
4758
progress: ProgressLogger,
4859
): Promise<BuildResult>;
4960

61+
tryTest(
62+
id: EvalID,
63+
env: Env,
64+
appDirectoryPath: string,
65+
rootPromptDef: RootPromptDefinition,
66+
workerConcurrencyQueue: PQueue,
67+
abortSignal: AbortSignal,
68+
progress: ProgressLogger,
69+
): Promise<TestResult>;
70+
71+
shouldRetryFailedTests(evalID: EvalID): boolean;
72+
5073
serveBuild<T>(
5174
id: EvalID,
5275
env: Env,

runner/orchestration/gateways/local_gateway.ts

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import {
1010
LlmContextFile,
1111
LlmResponse,
1212
LlmResponseFile,
13+
TestResult,
1314
} from '../../shared-interfaces.js';
1415
import {generateCodeWithAI} from '../codegen.js';
1516
import {EvalID, Gateway} from '../gateway.js';
@@ -51,6 +52,18 @@ export class LocalGateway implements Gateway<LocalEnvironment> {
5152
return await generateCodeWithAI(this.llm, model, requestCtx, contextFiles, abortSignal);
5253
}
5354

55+
async repairTest(
56+
_id: EvalID,
57+
requestCtx: LlmGenerateFilesContext,
58+
model: string,
59+
errorMessage: string,
60+
appFiles: LlmResponseFile[],
61+
contextFiles: LlmContextFile[],
62+
abortSignal: AbortSignal,
63+
): Promise<LlmResponse> {
64+
return await generateCodeWithAI(this.llm, model, requestCtx, contextFiles, abortSignal);
65+
}
66+
5467
tryBuild(
5568
_id: EvalID,
5669
env: LocalEnvironment,
@@ -88,6 +101,43 @@ export class LocalGateway implements Gateway<LocalEnvironment> {
88101
);
89102
}
90103

104+
tryTest(
105+
_id: EvalID,
106+
env: LocalEnvironment,
107+
appDirectoryPath: string,
108+
rootPromptDef: RootPromptDefinition,
109+
workerConcurrencyQueue: PQueue,
110+
abortSignal: AbortSignal,
111+
progress: ProgressLogger,
112+
): Promise<TestResult> {
113+
const testParams = {
114+
directory: appDirectoryPath,
115+
appName: rootPromptDef.name,
116+
testCommand: env.testCommand,
117+
};
118+
119+
return workerConcurrencyQueue.add(
120+
() =>
121+
new Promise<TestResult>((resolve, reject) => {
122+
const child: ChildProcess = fork(
123+
path.resolve(import.meta.dirname, '../../workers/test/worker.js'),
124+
{signal: abortSignal},
125+
);
126+
child.send(testParams);
127+
128+
child.on('message', async (result: any) => {
129+
await killChildProcessGracefully(child);
130+
resolve(result.payload);
131+
});
132+
child.on('error', async err => {
133+
await killChildProcessGracefully(child);
134+
reject(err);
135+
});
136+
}),
137+
{throwOnTimeout: true},
138+
);
139+
}
140+
91141
async serveBuild<T>(
92142
_id: EvalID,
93143
env: LocalEnvironment,
@@ -109,5 +159,9 @@ export class LocalGateway implements Gateway<LocalEnvironment> {
109159
return this.llm.hasBuiltInRepairLoop === false;
110160
}
111161

162+
shouldRetryFailedTests(): boolean {
163+
return this.llm.hasBuiltInRepairLoop === false;
164+
}
165+
112166
async finalizeEval(_id: EvalID): Promise<void> {}
113167
}

0 commit comments

Comments
 (0)