angular
diff --git a/‎docs/environment-reference.md‎
Lines changed: 5 additions & 0 deletions b/‎docs/environment-reference.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎report-app/src/app/pages/report-viewer/report-viewer.html‎
Lines changed: 63 additions & 0 deletions b/‎report-app/src/app/pages/report-viewer/report-viewer.html‎
Lines changed: 63 additions & 0 deletions
diff --git a/‎report-app/src/app/pages/report-viewer/report-viewer.ts‎
Lines changed: 26 additions & 0 deletions b/‎report-app/src/app/pages/report-viewer/report-viewer.ts‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎runner/configuration/constants.ts‎
Lines changed: 6 additions & 0 deletions b/‎runner/configuration/constants.ts‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎runner/configuration/environment-local.ts‎
Lines changed: 7 additions & 0 deletions b/‎runner/configuration/environment-local.ts‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎runner/orchestration/build-repair.ts‎
Lines changed: 1 addition & 2 deletions b/‎runner/orchestration/build-repair.ts‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎runner/orchestration/codegen.ts‎
Lines changed: 2 additions & 2 deletions b/‎runner/orchestration/codegen.ts‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎runner/orchestration/gateway.ts‎
Lines changed: 23 additions & 0 deletions b/‎runner/orchestration/gateway.ts‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎runner/orchestration/gateways/local_gateway.ts‎
Lines changed: 54 additions & 0 deletions b/‎runner/orchestration/gateways/local_gateway.ts‎
Lines changed: 54 additions & 0 deletions
@@ -179,3 +179,8 @@ Defaults to `<package manager> run build`.
 
 Command used to start a local dev server as a part of the evaluation.
 Defaults to `<package manager> run start --port 0`.
+
+### `testCommand`
+
+Command used to run tests against the generated code. If this property is not provided, tests will not be run. The command should exit with code 0 on success and a non-zero exit code on failure. The output from the command (both `stdout` and `stderr`) is captured and used for repair attempts if the tests fail. The test command will time out after 2 minutes.
+
@@ -76,6 +76,20 @@ <h3 class="chart-title">
             />
           </div>
         </div>
+        @if (overview.stats.tests) {
+          <div class="chart-container test-results-details">
+            <h3 class="chart-title">
+              <span class="material-symbols-outlined"> quiz </span>
+              <span>Tests</span>
+            </h3>
+            <div class="summary-card-item">
+              <stacked-bar-chart
+                [data]="testsAsGraphData(overview.stats.tests)"
+                [compact]="true"
+              />
+            </div>
+          </div>
+        }
         @if (overview.stats.runtime) {
           <div class="chart-container">
             <h3 class="chart-title">
@@ -273,6 +287,17 @@ <h2>Generated applications</h2>
                 @if (initialAttempt?.buildResult?.status === 'error') {
                   <span class="status-badge error">Initial build failed</span>
                 }
+
+                <!-- Test status badges -->
+                @if (finalAttempt.testResult) {
+                  @if (finalAttempt.testResult.passed) {
+                    @if ((result.testRepairAttempts || 0) > 0) {
+                      <span class="status-badge warning">Tests passed after repair</span>
+                    }
+                  } @else {
+                    <span class="status-badge error">Tests failed</span>
+                  }
+                }
               </div>
             </div>
           </expansion-panel-header>
@@ -348,6 +373,29 @@ <h5>
                 </div>
               </div>
 
+              @if (result.testResult) {
+                <div class="app-details-section">
+                  <h4>Test Results</h4>
+                  <div class="test-summary">
+                    @if (result.testResult.passed) {
+                      <span class="status-text success">✔ Tests passed</span>
+                      @if ((result.testRepairAttempts || 0) > 0) {
+                        <span class="status-text">after {{ result.testRepairAttempts }} repair attempt(s)</span>
+                      }
+                    } @else {
+                      <span class="status-text error">✘ Tests failed</span>
+                    }
+                  </div>
+                  
+                  @if (result.testResult.output && !result.testResult.passed) {
+                    <details class="test-output-button">
+                      <summary class="neutral-button">See Test Output</summary>
+                      <pre class="callout neutral code">{{ result.testResult.output }}</pre>
+                    </details>
+                  }
+                </div>
+              }
+
               <div class="app-details-section">
                 <h4>Additional info</h4>
                 @for (attempt of result.attemptDetails; track attempt) {
@@ -356,6 +404,7 @@ <h4>Additional info</h4>
                     attempt.serveTestingResult?.axeViolations;
                   @let hasAxeViolations =
                     axeViolations && axeViolations.length > 0;
+                  @let testsFailed = attempt.testResult?.passed === false;
 
                   <expansion-panel #expansionPanel>
                     <expansion-panel-header>
@@ -380,6 +429,15 @@ <h4>Additional info</h4>
                           >A11y</span
                         >
                       }
+
+                      @if (attempt.testResult) {
+                        <span
+                          class="status-badge"
+                          [class.error]="!attempt.testResult.passed"
+                          [class.success]="attempt.testResult.passed"
+                          >Tests</span
+                        >
+                      }
                     </expansion-panel-header>
 
                     @if (expansionPanel.opened()) {
@@ -418,6 +476,11 @@ <h4>A11y Violations</h4>
                         </pre>
                       }
 
+                      @if (testsFailed) {
+                        <h4>Failed Tests</h4>
+                        <pre class="callout neutral code">{{ attempt.testResult?.output }}</pre>
+                      }
+
                       <h4>Generated Code</h4>
 
                       @for (file of attempt.outputFiles; track file) {
 
@@ -23,6 +23,7 @@ import {
   LlmResponseFile,
   RunInfo,
   RunSummaryBuilds,
+  RunSummaryTests,
   RuntimeStats,
   ScoreBucket,
   SkippedIndividualAssessment,
@@ -264,6 +265,31 @@ export class ReportViewer {
     ];
   }
 
+  /**
+   * Builds the stacked bar chart segments for a run's test summary.
+   * One segment is emitted per counter, in this order: passed, passed after repair,
+   * failed, no tests run.
+   *
+   * @param tests Test counters of the run.
+   * @returns Chart data with one labelled, colored segment per counter.
+   */
+  protected testsAsGraphData(tests: RunSummaryTests): StackedBarChartData {
+    const {successfulInitialTests, successfulTestsAfterRepair, failedTests, noTestsRun} = tests;
+    const segments: StackedBarChartData = [
+      {label: 'Passed', color: ScoreCssVariable.excellent, value: successfulInitialTests},
+      {
+        label: 'Passed after repair',
+        color: ScoreCssVariable.great,
+        value: successfulTestsAfterRepair,
+      },
+      {label: 'Failed', color: ScoreCssVariable.poor, value: failedTests},
+      {label: 'No tests run', color: ScoreCssVariable.neutral, value: noTestsRun},
+    ];
+
+    return segments;
+  }
+
   protected checksAsGraphData(buckets: ScoreBucket[]): StackedBarChartData {
     return buckets.map(b => ({
       label: b.nameWithLabels,
 
@@ -26,6 +26,12 @@ export const LLM_OUTPUT_DIR = join(rootDir, 'llm-output');
  */
 export const DEFAULT_MAX_REPAIR_ATTEMPTS = 1;
 
+/**
+ * Default maximum number of times we'll ask the LLM to repair a test failure,
+ * providing the test output and the code that causes the problem.
+ */
+export const DEFAULT_MAX_TEST_REPAIR_ATTEMPTS = 1;
+
 /** Name of the folder where we store all generated reports */
 export const REPORTS_ROOT_DIR = join(rootDir, 'reports');
 
 
@@ -28,6 +28,10 @@ export const localEnvironmentConfigSchema = baseEnvironmentConfigSchema.extend({
    * Defaults to `<package manager> run start --port 0`.
    */
   serveCommand: z.string().optional(),
+  /**
+   * Command to run when testing the code. If omitted, tests are not run.
+   */
+  testCommand: z.string().optional(),
   /**
    * Whether to skip installing dependencies when running evals in the environment.
    * Useful if you're managing dependencies yourself.
@@ -47,6 +51,8 @@ export class LocalEnvironment extends BaseEnvironment {
   readonly buildCommand: string;
   /** Command to run when starting a development server inside the app. */
   readonly serveCommand: string;
+  /** Command to run when testing the app, or `null` when no test command is configured. */
+  readonly testCommand: string | null;
   /**
    * Absolute path at which files specific to this environment are located. Will be merged in
    * with the files from the `projectTemplatePath` to get the final project structure.
@@ -82,6 +88,7 @@ export class LocalEnvironment extends BaseEnvironment {
     this.installCommand = `${packageManager} install --silent`;
     this.buildCommand = config.buildCommand || `${packageManager} run build`;
     this.serveCommand = config.serveCommand || this.getDefaultServeCommand(packageManager);
+    this.testCommand = config.testCommand ?? null;
     this.projectTemplatePath = projectTemplatePath;
     this.sourceDirectory = sourceDirectory;
     this.mcpServerOptions = config.mcpServers || [];
 
@@ -28,7 +28,6 @@ import {EvalID, Gateway} from './gateway.js';
  * @param abortSignal An AbortSignal to cancel the operation.
  * @param workerConcurrencyQueue The queue for managing worker concurrency.
  * @param attempts The current attempt number.
- * @param repairType The type of repair being performed.
  * @returns A promise that resolves to the new BuildResult.
  */
 export async function repairAndBuild(
@@ -49,7 +48,7 @@ export async function repairAndBuild(
 ): Promise<AttemptDetails> {
   const repairResponse = await repairCodeWithAI(
     evalID,
-    gateway,
+    gateway.repairBuild.bind(gateway),
     model,
     env,
     rootPromptDef,
 
@@ -88,7 +88,7 @@ export async function generateCodeWithAI(
  */
 export async function repairCodeWithAI(
   evalID: EvalID,
-  gateway: Gateway<Environment>,
+  repairer: Gateway<Environment>['repairBuild'] | Gateway<Environment>['repairTest'],
   model: string,
   env: Environment,
   promptDef: RootPromptDefinition,
@@ -123,7 +123,7 @@ export async function repairCodeWithAI(
 
   progress.log(promptDef, 'codegen', 'Repairing code with AI');
 
-  const response = await gateway.repairBuild(
+  const response = await repairer(
     evalID,
     context,
     model,
 
@@ -7,6 +7,7 @@ import {
   LlmResponse,
   LlmResponseFile,
   RootPromptDefinition,
+  TestResult,
 } from '../shared-interfaces.js';
 import {BuildResult} from '../workers/builder/builder-types.js';
 
@@ -35,6 +36,16 @@ export interface Gateway<Env extends Environment> {
     abortSignal: AbortSignal,
   ): Promise<LlmResponse>;
 
+  /**
+   * Requests a repaired version of the generated code after a test failure.
+   * NOTE(review): presumably mirrors `repairBuild`, since `repairCodeWithAI` accepts either
+   * method as its `repairer` callback — confirm the parameter lists stay in sync.
+   *
+   * @param id ID of the eval being repaired.
+   * @param requestCtx Context for the LLM file-generation request.
+   * @param model Name of the model to use.
+   * @param errorMessage Error message describing the failure (presumably the test output).
+   * @param appFiles Files of the app being repaired.
+   * @param contextFiles Context files to pass along with the request.
+   * @param abortSignal Signal used to cancel the operation.
+   * @returns A promise that resolves to the LLM response.
+   */
+  repairTest(
+    id: EvalID,
+    requestCtx: LlmGenerateFilesContext,
+    model: string,
+    errorMessage: string,
+    appFiles: LlmResponseFile[],
+    contextFiles: LlmContextFile[],
+    abortSignal: AbortSignal,
+  ): Promise<LlmResponse>;
+
   shouldRetryFailedBuilds(evalID: EvalID): boolean;
 
   tryBuild(
@@ -47,6 +58,18 @@ export interface Gateway<Env extends Environment> {
     progress: ProgressLogger,
   ): Promise<BuildResult>;
 
+  /**
+   * Runs the tests of the app located at `appDirectoryPath`.
+   * NOTE(review): how the tests are executed is up to the implementation — confirm per gateway.
+   *
+   * @param id ID of the eval being tested.
+   * @param env Environment the eval runs in.
+   * @param appDirectoryPath Directory of the app under test.
+   * @param rootPromptDef Definition of the root prompt the app was generated from.
+   * @param workerConcurrencyQueue Queue for managing worker concurrency.
+   * @param abortSignal Signal used to cancel the operation.
+   * @param progress Logger used to report progress.
+   * @returns A promise that resolves to the `TestResult`.
+   */
+  tryTest(
+    id: EvalID,
+    env: Env,
+    appDirectoryPath: string,
+    rootPromptDef: RootPromptDefinition,
+    workerConcurrencyQueue: PQueue,
+    abortSignal: AbortSignal,
+    progress: ProgressLogger,
+  ): Promise<TestResult>;
+
+  /**
+   * NOTE(review): presumably tells the caller whether a failed test run of the given eval
+   * should trigger a repair attempt — confirm against the call site.
+   */
+  shouldRetryFailedTests(evalID: EvalID): boolean;
+
   serveBuild<T>(
     id: EvalID,
     env: Env,
 
@@ -10,6 +10,7 @@ import {
   LlmContextFile,
   LlmResponse,
   LlmResponseFile,
+  TestResult,
 } from '../../shared-interfaces.js';
 import {generateCodeWithAI} from '../codegen.js';
 import {EvalID, Gateway} from '../gateway.js';
@@ -51,6 +52,18 @@ export class LocalGateway implements Gateway<LocalEnvironment> {
     return await generateCodeWithAI(this.llm, model, requestCtx, contextFiles, abortSignal);
   }
 
+  /**
+   * Implements `Gateway.repairTest` by forwarding the prepared request context to
+   * `generateCodeWithAI`.
+   *
+   * `_id`, `errorMessage` and `appFiles` are part of the gateway contract but are not read
+   * here; only `requestCtx`, `model`, `contextFiles` and `abortSignal` reach the LLM call.
+   *
+   * @returns A promise that resolves to the response produced by `generateCodeWithAI`.
+   */
+  async repairTest(
+    _id: EvalID,
+    requestCtx: LlmGenerateFilesContext,
+    model: string,
+    errorMessage: string,
+    appFiles: LlmResponseFile[],
+    contextFiles: LlmContextFile[],
+    abortSignal: AbortSignal,
+  ): Promise<LlmResponse> {
+    const repairResponse = await generateCodeWithAI(
+      this.llm,
+      model,
+      requestCtx,
+      contextFiles,
+      abortSignal,
+    );
+
+    return repairResponse;
+  }
+
   tryBuild(
     _id: EvalID,
     env: LocalEnvironment,
@@ -88,6 +101,43 @@ export class LocalGateway implements Gateway<LocalEnvironment> {
     );
   }
 
+  /**
+   * Runs the tests of a generated app in a forked worker process.
+   *
+   * The worker script (`workers/test/worker.js`) receives the app directory, the app name and
+   * the environment's `testCommand`, and is expected to answer with a message whose `payload`
+   * is the `TestResult`. The work is scheduled through `workerConcurrencyQueue`.
+   *
+   * `_id` and `progress` are part of the gateway contract but are not used here.
+   *
+   * @param env Environment providing the `testCommand` (may be `null`).
+   * @param appDirectoryPath Directory of the app under test.
+   * @param rootPromptDef Prompt definition; its `name` is passed to the worker as the app name.
+   * @param workerConcurrencyQueue Queue for managing worker concurrency.
+   * @param abortSignal Signal passed to `fork` to cancel the worker.
+   * @returns A promise that resolves to the `TestResult` reported by the worker. Rejects if the
+   *   worker emits an error or exits before reporting a result.
+   */
+  tryTest(
+    _id: EvalID,
+    env: LocalEnvironment,
+    appDirectoryPath: string,
+    rootPromptDef: RootPromptDefinition,
+    workerConcurrencyQueue: PQueue,
+    abortSignal: AbortSignal,
+    progress: ProgressLogger,
+  ): Promise<TestResult> {
+    const testParams = {
+      directory: appDirectoryPath,
+      appName: rootPromptDef.name,
+      testCommand: env.testCommand,
+    };
+
+    return workerConcurrencyQueue.add(
+      () =>
+        new Promise<TestResult>((resolve, reject) => {
+          const child: ChildProcess = fork(
+            path.resolve(import.meta.dirname, '../../workers/test/worker.js'),
+            {signal: abortSignal},
+          );
+          // Set once the worker has reported a result or an error, so that the `exit`
+          // caused by our own shutdown below is not mistaken for a crash.
+          let reported = false;
+
+          child.send(testParams);
+
+          child.on('message', async (message: unknown) => {
+            reported = true;
+            await killChildProcessGracefully(child);
+            resolve((message as {payload: TestResult}).payload);
+          });
+          child.on('error', async err => {
+            reported = true;
+            await killChildProcessGracefully(child);
+            reject(err);
+          });
+          // Without this, a worker that dies before answering would leave the promise
+          // pending forever and keep its slot in the concurrency queue.
+          child.once('exit', (code, signal) => {
+            if (!reported) {
+              reject(
+                new Error(
+                  `Test worker exited before reporting a result (code: ${code}, signal: ${signal}).`,
+                ),
+              );
+            }
+          });
+        }),
+      {throwOnTimeout: true},
+    );
+  }
+
   async serveBuild<T>(
     _id: EvalID,
     env: LocalEnvironment,
@@ -109,5 +159,9 @@ export class LocalGateway implements Gateway<LocalEnvironment> {
     return this.llm.hasBuiltInRepairLoop === false;
   }
 
+  /**
+   * Reports whether failed tests should be retried: `true` only when
+   * `this.llm.hasBuiltInRepairLoop` is exactly `false`.
+   */
+  shouldRetryFailedTests(): boolean {
+    const {hasBuiltInRepairLoop} = this.llm;
+    return hasBuiltInRepairLoop === false;
+  }
+
   async finalizeEval(_id: EvalID): Promise<void> {}
 }