Skip to content

Commit 477017d

Browse files
committed
measure eval-planner latency
1 parent cbbcd73 commit 477017d

File tree

3 files changed

+44
-20
lines changed

3 files changed

+44
-20
lines changed

.agents/read-only-commander.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ const readOnlyCommander: SecretAgentDefinition = {
77
model: 'anthropic/claude-sonnet-4.5',
88
displayName: 'ReadOnly Commander',
99
spawnerPrompt:
10-
'Can run read-only terminal commands and codebase string search to answer questions with good analysis. Feel free to spawn mulitple in parallel.',
10+
'Can run read-only terminal commands to answer questions with good analysis. Feel free to spawn mulitple in parallel.',
1111
inputSchema: {
1212
prompt: {
1313
type: 'string',

evals/subagents/eval-planner.ts

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ export const evalPlannerAgent = async (params: {
3333
initCommand,
3434
fileStates,
3535
} = params
36+
const plannerStartTime = Date.now()
3637
const result = await withTestRepo(
3738
{ repoUrl, commitSha, initCommand, checkoutPrevious: true },
3839
async (cwd) => {
@@ -49,6 +50,7 @@ export const evalPlannerAgent = async (params: {
4950
})
5051
},
5152
)
53+
const plannerLatencyMs = Date.now() - plannerStartTime
5254

5355
const { output } = result
5456

@@ -137,6 +139,7 @@ Evaluate how well the implementation plan matches the real commit changes. Consi
137139
overallScore: 0,
138140
},
139141
agentOutput: outputString,
142+
plannerLatencyMs,
140143
}
141144
}
142145
const { output: judgeOutput } = judgeResult
@@ -147,7 +150,7 @@ Evaluate how well the implementation plan matches the real commit changes. Consi
147150
overallScore: number
148151
}
149152

150-
return { judgingResults, agentOutput: outputString }
153+
return { judgingResults, agentOutput: outputString, plannerLatencyMs }
151154
}
152155

153156
const judgeAgent: AgentDefinition = {
@@ -245,6 +248,7 @@ async function main() {
245248
cons: string
246249
overallScore: number
247250
}
251+
plannerLatencyMs: number
248252
}>
249253

250254
// Track statistics
@@ -253,6 +257,7 @@ async function main() {
253257
completed: 0,
254258
failed: 0,
255259
scores: [] as number[],
260+
plannerLatencies: [] as number[],
256261
}
257262

258263
// Loop through each eval task
@@ -274,12 +279,13 @@ async function main() {
274279
fileStates,
275280
})
276281

277-
const { judgingResults, agentOutput } = result
282+
const { judgingResults, agentOutput, plannerLatencyMs } = result
278283
allResults.push({
279284
sha,
280285
spec,
281286
agentOutput,
282287
judgingResults,
288+
plannerLatencyMs,
283289
})
284290

285291
fs.writeFileSync(
@@ -310,10 +316,14 @@ async function main() {
310316
const emptyBar = '░'.repeat(10 - Math.floor(overallScore / 10))
311317
console.log(`${scoreBar}${emptyBar} ${overallScore}/100`)
312318

319+
console.log('\n⏱️ LATENCY:')
320+
console.log(` ${(plannerLatencyMs / 1000).toFixed(2)}s`)
321+
313322
console.log('\n' + '='.repeat(80) + '\n')
314323

315324
stats.completed++
316325
stats.scores.push(overallScore)
326+
stats.plannerLatencies.push(plannerLatencyMs)
317327
} catch (error) {
318328
console.log(`\n${'='.repeat(80)}`)
319329
console.error(`✗ Failed eval for commit ${sha}`)
@@ -360,6 +370,22 @@ async function main() {
360370
)
361371
}
362372

373+
if (stats.plannerLatencies.length > 0) {
374+
const avgPlannerLatency =
375+
stats.plannerLatencies.reduce((a, b) => a + b, 0) / stats.plannerLatencies.length
376+
const minPlannerLatency = Math.min(...stats.plannerLatencies)
377+
const maxPlannerLatency = Math.max(...stats.plannerLatencies)
378+
const medianPlannerLatency = stats.plannerLatencies.sort((a, b) => a - b)[
379+
Math.floor(stats.plannerLatencies.length / 2)
380+
]
381+
382+
console.log('Latency Statistics:')
383+
console.log(` Average: ${(avgPlannerLatency / 1000).toFixed(2)}s`)
384+
console.log(` Median: ${(medianPlannerLatency / 1000).toFixed(2)}s`)
385+
console.log(` Min: ${(minPlannerLatency / 1000).toFixed(2)}s`)
386+
console.log(` Max: ${(maxPlannerLatency / 1000).toFixed(2)}s\n`)
387+
}
388+
363389
console.log('='.repeat(80))
364390
}
365391

sdk/src/run.ts

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -318,24 +318,22 @@ async function handleToolCall({
318318
)
319319
}
320320
} catch (error) {
321-
return {
322-
output: [
323-
{
324-
type: 'json',
325-
value: {
326-
errorMessage:
327-
error &&
328-
typeof error === 'object' &&
329-
'message' in error &&
330-
typeof error.message === 'string'
331-
? error.message
332-
: typeof error === 'string'
333-
? error
334-
: 'Unknown error',
335-
},
321+
result = [
322+
{
323+
type: 'json',
324+
value: {
325+
errorMessage:
326+
error &&
327+
typeof error === 'object' &&
328+
'message' in error &&
329+
typeof error.message === 'string'
330+
? error.message
331+
: typeof error === 'string'
332+
? error
333+
: 'Unknown error',
336334
},
337-
],
338-
}
335+
},
336+
]
339337
}
340338
return {
341339
output: result,

0 commit comments

Comments
 (0)