Skip to content
This repository was archived by the owner on Jan 23, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
305 changes: 305 additions & 0 deletions cmd/eval.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,305 @@
package cmd

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"path/filepath"
	"strings"
	"syscall"

	"github.com/agentuity/cli/internal/errsystem"
	"github.com/agentuity/cli/internal/project"
	"github.com/agentuity/cli/internal/util"
	"github.com/agentuity/go-common/env"
	"github.com/agentuity/go-common/logger"
	"github.com/agentuity/go-common/tui"
	"github.com/charmbracelet/huh/spinner"
	"github.com/spf13/cobra"
)

// EvalObject is the evaluation record returned by the eval-create API
// endpoint (/cli/eval).
type EvalObject struct {
	ID          string `json:"id"`
	Name        string `json:"name"`
	Description string `json:"description"`
	ProjectID   string `json:"projectId"`
	OrgID       string `json:"orgId"`
}

// EvalPullObject is the payload returned when pulling an evaluation
// (/cli/eval/pull/{id}); Code holds the evaluation's source code that the
// CLI writes to disk.
type EvalPullObject struct {
	Code        string `json:"code"`
	ID          string `json:"id"`
	Name        string `json:"name"`
	Description string `json:"description"`
}

// EvalCreateResponse is the standard API response envelope around an EvalObject.
type EvalCreateResponse = project.Response[EvalObject]

// EvalPullResponse is the standard API response envelope around an EvalPullObject.
type EvalPullResponse = project.Response[EvalPullObject]

// CreateGenerativeEvaluation asks the backend to create a "generative"
// evaluation for the given project and returns the new evaluation's ID.
// It wraps transport errors with context and surfaces API-level failures
// (resp.Success == false) as errors carrying the server message.
func CreateGenerativeEvaluation(ctx context.Context, logger logger.Logger, baseUrl string, token string, projectId string) (string, error) {
	body := map[string]any{
		"type":      "generative",
		"projectId": projectId,
	}

	var resp EvalCreateResponse
	if err := util.NewAPIClient(ctx, logger, baseUrl, token).Do("POST", "/cli/eval", body, &resp); err != nil {
		return "", fmt.Errorf("error creating generative evaluation: %w", err)
	}
	if !resp.Success {
		return "", fmt.Errorf("failed to create generative evaluation: %s", resp.Message)
	}
	return resp.Data.ID, nil
}

// CreateTemplateEvaluation asks the backend to create a "template"
// evaluation with the given name and description for the project, returning
// the new evaluation's ID. Transport errors are wrapped; API-level failures
// (resp.Success == false) become errors carrying the server message.
func CreateTemplateEvaluation(ctx context.Context, logger logger.Logger, baseUrl string, token string, projectId string, name string, description string) (string, error) {
	body := map[string]any{
		"type":        "template",
		"projectId":   projectId,
		"name":        name,
		"description": description,
	}

	var resp EvalCreateResponse
	if err := util.NewAPIClient(ctx, logger, baseUrl, token).Do("POST", "/cli/eval", body, &resp); err != nil {
		return "", fmt.Errorf("error creating template evaluation: %w", err)
	}
	if !resp.Success {
		return "", fmt.Errorf("failed to create template evaluation: %s", resp.Message)
	}
	return resp.Data.ID, nil
}

// PullEvaluation fetches the code and metadata of the evaluation identified
// by evalId from the backend. Transport errors are wrapped; API-level
// failures (resp.Success == false) become errors carrying the server message.
func PullEvaluation(ctx context.Context, logger logger.Logger, baseUrl string, token string, evalId string) (*EvalPullObject, error) {
	client := util.NewAPIClient(ctx, logger, baseUrl, token)

	var resp EvalPullResponse
	err := client.Do("GET", fmt.Sprintf("/cli/eval/pull/%s", evalId), nil, &resp)
	if err != nil {
		return nil, fmt.Errorf("error pulling evaluation: %w", err)
	}
	if !resp.Success {
		return nil, fmt.Errorf("failed to pull evaluation: %s", resp.Message)
	}
	return &resp.Data, nil
}

// evalCmd is the parent "eval" command; it carries no behavior of its own
// and simply prints help, delegating real work to the create/pull subcommands.
var evalCmd = &cobra.Command{
	Use:   "eval",
	Short: "Evaluation related commands",
	Long: `Evaluation related commands for managing evaluations and test data.

Use the subcommands to create and pull evaluation data to/from the cloud.`,
	Run: func(cmd *cobra.Command, args []string) {
		cmd.Help()
	},
}

var evalCreateCmd = &cobra.Command{
Use: "create [name] [description]",
Short: "Create evaluation data in the cloud",
Long: `Create evaluation data in the cloud for your project.

Arguments:
[name] Optional name for the evaluation
[description] Optional description for the evaluation

Flags:
--force Don't prompt for confirmation

Examples:
agentuity eval create
agentuity eval create "My Eval" "Description of evaluation"
agentuity eval create --force "My Eval" "Description"`,
Run: func(cmd *cobra.Command, args []string) {
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGINT, syscall.SIGTERM)
defer cancel()

logger := env.NewLogger(cmd)
context := project.EnsureProject(ctx, cmd)
dir := context.Dir
apiUrl := context.APIURL
apiKey := context.Token
theproject := context.Project

force, _ := cmd.Flags().GetBool("force")

// First, get the evaluation type
var evalType string
if !tui.HasTTY {
// Default to template when no TTY
evalType = "template"
} else {
evalType = tui.Select(logger, "What type of evaluation would you like to create?", "Choose between template-based or generative evaluation", []tui.Option{
{Text: tui.PadRight("Template", 20, " ") + tui.Muted("Use a predefined regex evaluation template"), ID: "template"},
{Text: tui.PadRight("Generative", 20, " ") + tui.Muted("AI will generate custom evaluation code"), ID: "generative"},
})
}

var name, description string

// Get name and description only for template type
if evalType == "template" {
// Get name and description from args or prompt
if len(args) > 0 {
name = args[0]
}
if len(args) > 1 {
description = args[1]
}

// Interactive flow for name and description
if name == "" {
if !tui.HasTTY {
logger.Fatal("No TTY detected, please specify an evaluation name from the command line")
}
name = tui.InputWithValidation(logger, "What should we name this evaluation?", "The name helps identify the evaluation", 255, func(name string) error {
if name == "" {
return fmt.Errorf("evaluation name cannot be empty")
}
return nil
})
}

if description == "" {
description = tui.Input(logger, "How should we describe what this evaluation tests?", "The description is optional but helpful for understanding the purpose of the evaluation")
}
}

// Confirm create unless force flag is set
if !force {
var confirmMessage string
if evalType == "template" {
confirmMessage = fmt.Sprintf("Create template evaluation '%s' in the cloud?", name)
} else {
confirmMessage = "Create generative evaluation in the cloud?"
}

if !tui.Ask(logger, confirmMessage, false) {
tui.ShowWarning("cancelled")
return
}
}

var evalId string
var evalObj *EvalPullObject
action := func() {
var err error

// Call the appropriate function based on type
if evalType == "template" {
evalId, err = CreateTemplateEvaluation(ctx, logger, apiUrl, apiKey, theproject.ProjectId, name, description)
} else {
evalId, err = CreateGenerativeEvaluation(ctx, logger, apiUrl, apiKey, theproject.ProjectId)
}

if err != nil {
errsystem.New(errsystem.ErrApiRequest, err, errsystem.WithUserMessage("Failed to create evaluation")).ShowErrorAndExit()
}

// Automatically pull the evaluation data
evalObj, err = PullEvaluation(ctx, logger, apiUrl, apiKey, evalId)
if err != nil {
errsystem.New(errsystem.ErrApiRequest, err, errsystem.WithUserMessage("Failed to pull evaluation data")).ShowErrorAndExit()
}
}

spinner.New().Title("Creating evaluation...").Action(action).Run()

// Write code to file
filename := evalObj.Name + ".ts"
evalsDir := filepath.Join(dir, "src", "evals")

Comment on lines +212 to +215
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Sanitize file name to prevent path traversal and invalid chars.

evalObj.Name + ".ts" can escape src/evals or include illegal characters.

Apply this diff (imports added below):

- filename := evalObj.Name + ".ts"
- evalsDir := filepath.Join(dir, "src", "evals")
+ safeName := filepath.Base(evalObj.Name)
+ safeName = strings.TrimSpace(safeName)
+ if safeName == "" {
+   safeName = "evaluation"
+ }
+ // replace path separators and disallowed chars
+ safeName = strings.Map(func(r rune) rune {
+   switch r {
+   case '/', '\\', ':', '*', '?', '"', '<', '>', '|':
+     return '_'
+   default:
+     return r
+   }
+ }, safeName)
+ filename := safeName + ".ts"
+ evalsDir := filepath.Join(dir, "src", "evals")

Add import:

 import (
   "context"
   "fmt"
   "os"
   "os/signal"
   "path/filepath"
   "syscall"
+  "strings"
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
// Write code to file
filename := evalObj.Name + ".ts"
evalsDir := filepath.Join(dir, "src", "evals")
import (
"context"
"fmt"
"os"
"os/signal"
"path/filepath"
"syscall"
"strings"
)
Suggested change
// Write code to file
filename := evalObj.Name + ".ts"
evalsDir := filepath.Join(dir, "src", "evals")
// Write code to file
safeName := filepath.Base(evalObj.Name)
safeName = strings.TrimSpace(safeName)
if safeName == "" {
safeName = "evaluation"
}
// replace path separators and disallowed chars
safeName = strings.Map(func(r rune) rune {
switch r {
case '/', '\\', ':', '*', '?', '"', '<', '>', '|':
return '_'
default:
return r
}
}, safeName)
filename := safeName + ".ts"
evalsDir := filepath.Join(dir, "src", "evals")

// Create the evals directory if it doesn't exist
if err := os.MkdirAll(evalsDir, 0755); err != nil {
errsystem.New(errsystem.ErrCreateDirectory, err, errsystem.WithUserMessage("Failed to create evals directory")).ShowErrorAndExit()
}

filePath := filepath.Join(evalsDir, filename)
if err := os.WriteFile(filePath, []byte(evalObj.Code), 0644); err != nil {
errsystem.New(errsystem.ErrOpenFile, err, errsystem.WithUserMessage("Failed to write evaluation code to file")).ShowErrorAndExit()
}

if evalType == "template" {
tui.ShowSuccess("Template evaluation '%s' created successfully with ID: %s", name, evalId)
} else {
tui.ShowSuccess("Generative evaluation created successfully with ID: %s", evalId)
}

tui.ShowSuccess("Evaluation code written to: %s", filePath)
fmt.Println("\nEvaluation code:")
fmt.Println(evalObj.Code)
},
}

// evalPullCmd pulls an existing evaluation's code from the cloud by ID and
// writes it to src/evals/<name>.ts inside the project directory, then echoes
// the code to stdout.
//
// The file name comes from the server response, so it is sanitized before
// being joined into a path (prevents path traversal and invalid characters).
var evalPullCmd = &cobra.Command{
	Use:   "pull <id>",
	Short: "Pull evaluation data from the cloud by ID",
	Long: `Pull evaluation data from the cloud for your project using the evaluation ID.

Arguments:
  <id>    The evaluation ID to pull

Examples:
  agentuity eval pull abc123
  agentuity eval pull def456`,
	Args: cobra.ExactArgs(1),
	Run: func(cmd *cobra.Command, args []string) {
		ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGINT, syscall.SIGTERM)
		defer cancel()

		logger := env.NewLogger(cmd)
		// Named projectContext (not "context") to avoid shadowing the context package.
		projectContext := project.EnsureProject(ctx, cmd)
		dir := projectContext.Dir
		apiUrl := projectContext.APIURL
		apiKey := projectContext.Token

		evalId := args[0]

		var evalObj *EvalPullObject
		action := func() {
			var err error
			evalObj, err = PullEvaluation(ctx, logger, apiUrl, apiKey, evalId)
			if err != nil {
				errsystem.New(errsystem.ErrApiRequest, err, errsystem.WithUserMessage("Failed to pull evaluation")).ShowErrorAndExit()
			}
		}

		spinner.New().Title("Pulling evaluation...").Action(action).Run()

		// Sanitize the server-provided name so it cannot escape src/evals
		// (path traversal via "../" or absolute paths) or contain characters
		// that are invalid in filenames on common platforms.
		safeName := strings.TrimSpace(filepath.Base(evalObj.Name))
		safeName = strings.Map(func(r rune) rune {
			switch r {
			case '/', '\\', ':', '*', '?', '"', '<', '>', '|':
				return '_'
			default:
				return r
			}
		}, safeName)
		if safeName == "" || safeName == "." || safeName == ".." {
			safeName = "evaluation"
		}
		filename := safeName + ".ts"
		evalsDir := filepath.Join(dir, "src", "evals")

		// Create the evals directory if it doesn't exist.
		if err := os.MkdirAll(evalsDir, 0755); err != nil {
			errsystem.New(errsystem.ErrCreateDirectory, err, errsystem.WithUserMessage("Failed to create evals directory")).ShowErrorAndExit()
		}

		filePath := filepath.Join(evalsDir, filename)
		if err := os.WriteFile(filePath, []byte(evalObj.Code), 0644); err != nil {
			errsystem.New(errsystem.ErrOpenFile, err, errsystem.WithUserMessage("Failed to write evaluation code to file")).ShowErrorAndExit()
		}

		tui.ShowSuccess("Evaluation code written to: %s", filePath)

		// Echo the pulled code to stdout as well.
		fmt.Println(evalObj.Code)
	},
}

func init() {
rootCmd.AddCommand(evalCmd)

evalCreateCmd.Flags().Bool("force", !hasTTY, "Don't prompt for confirmation")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Fix compile error: undefined hasTTY.

Use the exported tui.HasTTY.

- evalCreateCmd.Flags().Bool("force", !hasTTY, "Don't prompt for confirmation")
+ evalCreateCmd.Flags().Bool("force", !tui.HasTTY, "Don't prompt for confirmation")
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
evalCreateCmd.Flags().Bool("force", !hasTTY, "Don't prompt for confirmation")
evalCreateCmd.Flags().Bool("force", !tui.HasTTY, "Don't prompt for confirmation")
🤖 Prompt for AI Agents
In cmd/eval.go around line 297 the code references an undefined identifier
`hasTTY`; replace it with the exported `tui.HasTTY` (i.e., use tui.HasTTY when
setting the default for the "force" flag) and ensure the package `tui` is
imported at the top of the file if not already present. Ensure the flag line
becomes evalCreateCmd.Flags().Bool("force", !tui.HasTTY, "Don't prompt for
confirmation") and run `go build` to confirm compilation.


evalCmd.AddCommand(evalCreateCmd)
evalCmd.AddCommand(evalPullCmd)

for _, cmd := range []*cobra.Command{evalCreateCmd, evalPullCmd} {
cmd.Flags().StringP("dir", "d", ".", "The directory to the project")
}
}
8 changes: 7 additions & 1 deletion internal/bundler/vercel_ai.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,13 @@ func createVercelAIProviderPatch(module string, createFn string, envkey string,
}

func init() {
var vercelTelemetryPatch = generateJSArgsPatch(0, `experimental_telemetry: { isEnabled: true }`)
var vercelTelemetryPatch = generateJSArgsPatch(0, ``) + fmt.Sprintf(`
const opts = {...(_args[0] ?? {}) };
const metadata = { promptId: opts.prompt.id };
opts.experimental_telemetry = { isEnabled: true , metadata: metadata };
opts.prompt = opts.prompt.toString();
_args[0] = opts;
`)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Telemetry Patch Fails Without Prompt Validation

The telemetry patch accesses opts.prompt.id and calls opts.prompt.toString() without checking if opts.prompt exists or has the expected structure. This can lead to runtime errors, especially for Vercel AI functions like embed that may not include a prompt object in their arguments.

Fix in Cursor Fix in Web

Comment on lines +34 to +40
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Guard against undefined opts.prompt and avoid breaking non-text APIs (embed/embedMany).

Directly accessing opts.prompt.id and calling .toString() will throw when prompt is absent (not provided for embed/embedMany) or not an object. Also, don’t clobber existing experimental_telemetry.

Apply this diff:

- var vercelTelemetryPatch = generateJSArgsPatch(0, ``) + fmt.Sprintf(`
- const opts = {...(_args[0] ?? {}) };
- const metadata = { promptId: opts.prompt.id };
- opts.experimental_telemetry = { isEnabled: true , metadata: metadata };
- opts.prompt = opts.prompt.toString();
- _args[0] = opts;
- `)
+ var vercelTelemetryPatch = generateJSArgsPatch(0, ``) + fmt.Sprintf(`
+ const opts = { ...(_args[0] ?? {}) };
+ const p = opts.prompt;
+ const metadata = {};
+ if (p && typeof p === 'object' && 'id' in p) {
+   metadata.promptId = p.id;
+ }
+ opts.experimental_telemetry = { ...(opts.experimental_telemetry ?? {}), isEnabled: true, metadata };
+ if (typeof p === 'string') {
+   // keep as-is
+ } else if (p && typeof p.toString === 'function') {
+   opts.prompt = p.toString();
+ }
+ _args[0] = opts;
+ `)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
var vercelTelemetryPatch = generateJSArgsPatch(0, ``) + fmt.Sprintf(`
const opts = {...(_args[0] ?? {}) };
const metadata = { promptId: opts.prompt.id };
opts.experimental_telemetry = { isEnabled: true , metadata: metadata };
opts.prompt = opts.prompt.toString();
_args[0] = opts;
`)
var vercelTelemetryPatch = generateJSArgsPatch(0, ``) + fmt.Sprintf(`
const opts = { ...(_args[0] ?? {}) };
const p = opts.prompt;
const metadata = {};
if (p && typeof p === 'object' && 'id' in p) {
metadata.promptId = p.id;
}
opts.experimental_telemetry = { ...(opts.experimental_telemetry ?? {}), isEnabled: true, metadata };
if (typeof p === 'string') {
// keep as-is
} else if (p && typeof p.toString === 'function') {
opts.prompt = p.toString();
}
_args[0] = opts;
`)
🤖 Prompt for AI Agents
In internal/bundler/vercel_ai.go around lines 34 to 40, the injected JS assumes
opts.prompt exists and is an object and unconditionally overwrites
opts.experimental_telemetry, which breaks embed/embedMany and non-object
prompts; fix by guarding access: check that opts.prompt is defined and typeof
opts.prompt === 'object' before reading prompt.id and calling toString(), only
set metadata when an id exists, and if prompt is a primitive use
String(opts.prompt) instead of .toString() on undefined; also merge into any
existing opts.experimental_telemetry (e.g., opts.experimental_telemetry = {
...(opts.experimental_telemetry||{}), isEnabled: true, metadata }) rather than
clobbering it so existing telemetry fields are preserved.

vercelAIPatches := patchModule{
Module: "ai",
Functions: map[string]patchAction{
Expand Down
Loading