Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
366 changes: 322 additions & 44 deletions .github/workflows/ci.yml

Large diffs are not rendered by default.

20 changes: 13 additions & 7 deletions .github/workflows/eval.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,27 @@ on:
jobs:
evaluate:
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: write
issues: write
steps:
- uses: actions/checkout@v4

- name: Setup pnpm
uses: pnpm/action-setup@v4

- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '22'
cache: 'npm'
cache-dependency-path: package-lock.json
cache: 'pnpm'

- name: Install dependencies
run: npm ci --legacy-peer-deps
run: pnpm install --frozen-lockfile

- name: Build
run: npm run build
run: pnpm build

- name: Download baseline results
if: github.event_name == 'pull_request'
Expand All @@ -40,23 +46,23 @@ jobs:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
run: |
mkdir -p results
npx agent-eval-harness eval \
node dist/cli.js eval \
trajectories/examples/*.jsonl \
--config trajectories/examples/config.yaml \
--output results/

- name: Run regression gates
if: github.event_name == 'pull_request' && hashFiles('baseline/') != ''
run: |
npx agent-eval-harness compare \
node dist/cli.js compare \
baseline/results.json \
results/results.json \
--format markdown \
--output results/comparison.md

- name: Check gates
run: |
npx agent-eval-harness gate \
node dist/cli.js gate \
results/results.json \
--preset standard \
--exit-code
Expand Down
77 changes: 52 additions & 25 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,75 +4,102 @@ on:
push:
tags:
- 'v*'
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: false

env:
NODE_VERSION: 22

jobs:
release:
name: Release
runs-on: ubuntu-latest
permissions:
contents: write
packages: write
id-token: write
steps:
- uses: actions/checkout@v4

- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Setup pnpm
uses: pnpm/action-setup@v4

- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '22'
cache: 'npm'
cache-dependency-path: package-lock.json
node-version: ${{ env.NODE_VERSION }}
cache: 'pnpm'
registry-url: 'https://registry.npmjs.org'

- name: Install dependencies
run: npm ci
run: pnpm install --frozen-lockfile

- name: Run tests
run: npm test
run: pnpm test

- name: Build
run: npm run build
run: pnpm build

- name: Publish to npm
run: npm publish --access public
run: pnpm publish --access public --no-git-checks
env:
NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}

NPM_CONFIG_PROVENANCE: 'true'

- name: Mirror to GitHub Packages
env:
NODE_AUTH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
cat > .npmrc <<EOF
@reaatech:registry=https://npm.pkg.github.com
//npm.pkg.github.com/:_authToken=${NODE_AUTH_TOKEN}
EOF
pnpm publish --registry=https://npm.pkg.github.com --no-git-checks

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Build and push Docker image
uses: docker/build-push-action@v5
uses: docker/build-push-action@v6
with:
context: .
push: true
tags: |
${{ github.repository }}:${{ github.ref_name }}
${{ github.repository }}:latest
cache-from: type=registry,ref=${{ github.repository }}:buildcache
cache-to: type=inline
cache-from: type=gha
cache-to: type=gha,mode=max

- name: Create GitHub Release
uses: softprops/action-gh-release@v1
uses: softprops/action-gh-release@v2
with:
generate_release_notes: true
files: |
dist/*.js
body: |
## Changes
See the [CHANGELOG](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details.

## Installation

### npm
```bash
npm install agent-eval-harness
npm install @reaatech/agent-eval-harness
```

### Docker
```bash
docker pull ${{ github.repository }}:${{ github.ref_name }}
```
13 changes: 2 additions & 11 deletions .lintstagedrc.json
Original file line number Diff line number Diff line change
@@ -1,13 +1,4 @@
{
"src/**/*.{ts,js}": [
"eslint --fix",
"prettier --write"
],
"tests/**/*.{ts,js}": [
"eslint --fix",
"prettier --write"
],
"*.{json,md,yaml,yml}": [
"prettier --write"
]
"*.{ts,js,json,jsonc}": ["biome check --write --no-errors-on-unmatched"],
"*.{md,yaml,yml}": ["biome format --write --no-errors-on-unmatched"]
}
2 changes: 2 additions & 0 deletions .npmrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
shamefully-hoist=false
strict-peer-dependencies=true
11 changes: 0 additions & 11 deletions .prettierrc

This file was deleted.

8 changes: 4 additions & 4 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ Golden trajectories serve as reference implementations for regression testing.
### Comparing Against Golden

```typescript
import { compareAgainstGolden } from 'agent-eval-harness';
import { compareAgainstGolden } from '@reaatech/agent-eval-harness';

const result = compareAgainstGolden(trajectory, goldenTrajectory, {
similarityThreshold: 0.85,
Expand Down Expand Up @@ -251,7 +251,7 @@ judge:
4. **Apply calibration** to future judge scores

```typescript
import { calibrate, applyCalibration } from 'agent-eval-harness';
import { calibrate, applyCalibration } from '@reaatech/agent-eval-harness';

await calibrate({
humanLabelsPath: 'calibration/human-labels.jsonl',
Expand Down Expand Up @@ -363,7 +363,7 @@ latency:
### Latency Monitoring

```typescript
import { monitorLatency } from 'agent-eval-harness';
import { monitorLatency } from '@reaatech/agent-eval-harness';

const budget = {
per_turn_p99: 5000,
Expand Down Expand Up @@ -405,7 +405,7 @@ tool_validation:
### Validation Example

```typescript
import { validateTrajectory, validateSchema } from 'agent-eval-harness';
import { validateTrajectory, validateSchema } from '@reaatech/agent-eval-harness';

const toolSchemas = {
send_reset_email: {
Expand Down
2 changes: 1 addition & 1 deletion CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ describe('MyEvaluator', () => {

```typescript
import { describe, it, expect } from 'vitest';
import { loadFromFile, evaluate } from 'agent-eval-harness';
import { loadFromFile, evaluate } from '@reaatech/agent-eval-harness';

describe('Integration: Load and Evaluate', () => {
it('should load and evaluate trajectory', () => {
Expand Down
19 changes: 9 additions & 10 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,28 +1,27 @@
# Stage 1: Build
FROM node:22-alpine AS builder

WORKDIR /app
RUN npm install -g pnpm@10

# Copy package files
COPY package.json package-lock.json ./
WORKDIR /app

# Install dependencies (full install for build)
RUN npm ci --legacy-peer-deps && npm cache clean --force
COPY package.json pnpm-lock.yaml ./
RUN pnpm install --frozen-lockfile

# Copy source
COPY tsconfig.json ./
COPY src ./src

# Build
RUN npm run build
RUN pnpm build

# Stage 2: Install production deps only
FROM node:22-alpine AS prod-deps

RUN npm install -g pnpm@10

WORKDIR /app

COPY package.json package-lock.json ./
RUN npm ci --legacy-peer-deps --only=production --ignore-scripts && npm cache clean --force
COPY package.json pnpm-lock.yaml ./
RUN pnpm install --prod --frozen-lockfile --ignore-scripts

# Stage 3: Runtime
FROM node:22-alpine AS runtime
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ End-to-end agent evaluation harness for full agent runs. Supports trajectory eva

```bash
# npm
npm install agent-eval-harness
npm install @reaatech/agent-eval-harness

# Or use without installing
npx agent-eval-harness eval trajectories/*.jsonl
Expand Down
Loading
Loading