Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 72 additions & 24 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,7 @@ command = "mkdir -p subdir && echo 'done' > subdir/file.txt"

[[checks]]
name = "check_name"
command = "skill-invoked"
skill = "skill-name"
command = { command = "skill-invoked", skill = "skill-name" }

[answers]
"question_key" = "answer_value"
Expand Down Expand Up @@ -180,49 +179,98 @@ name = "optional-descriptive-name"
command = "echo 'Hello' > greeting.txt && mkdir -p output"
```

## Assertion Reference
## Check Reference

Assertions use structured TOML format:
Run `skill-bench help <type>` for detailed help on any check type.

### Skill Verification
- `skill-loaded` - Skill was loaded
- `skill-invoked` - Skill was invoked
- `skill-not-invoked` - Skill was NOT invoked

- `skill-loaded` — Skill was loaded
- `skill-invoked` — Skill was invoked

```toml
[[checks]]
name = "check-name"
command = { command = "skill-invoked", skill = "my-skill" }
```

### MCP Verification
- `mcp-loaded` - MCP server was loaded
- `mcp-tool-invoked` - MCP tool was invoked
- `mcp-success` - MCP tool succeeded

- `mcp-loaded` — MCP server was loaded
- `mcp-tool-invoked` — MCP tool was invoked
- `mcp-success` — MCP tool succeeded

```toml
[[checks]]
name = "check-name"
command = { command = "mcp-loaded", server = "filesystem" }
```

### Tool Verification
- `tool-use` - Tool was used
- `param` - Parameter value verification

- `tool-use` — Tool was called (partial match)
- `tool-param` — Tool was called with a specific parameter

```toml
[[checks]]
name = "check-name"
command = { command = "tool-use", tool = "Read" }

[[checks]]
name = "check-param"
command = { command = "tool-param", tool = "Read", param = "file_path", value = "test.txt" }
```

### File Verification
- `file-content` - Verify file content
- `file-contains` - File contains string
- `workspace-file` - File exists
- `workspace-dir` - Directory exists

- `workspace-file` — File exists in workspace
- `workspace-dir` — Directory exists in workspace
- `file-contains` — File contains string

```toml
[[checks]]
name = "check-name"
command = { command = "workspace-file", path = "output.txt" }

[[checks]]
name = "check-content"
command = { command = "file-contains", file = "output.txt", contains = "expected text" }
```

### Log Verification
- `output-contains` - Output contains string
- `log-contains` - Log contains pattern
- `text-contains` - Text content search

- `log-contains` — Log contains regex pattern
- `message-contains` — Assistant output contains text

```toml
[[checks]]
name = "check-name"
command = { command = "log-contains", pattern = "error|failed" }

[[checks]]
name = "check-output"
command = { command = "message-contains", text = "expected output" }
```

### Database Verification
- `db-query` - SQL query result verification

- `db-query` — SQL query result verification
- Numeric comparisons: `">0"`, `">=5"`, `"=10"`, `"<3"`, `"<=2"`

```toml
[[checks]]
name = "check-name"
command = { command = "db-query", db = "patents.db", query = "SELECT COUNT(*) FROM patents", expected = ">0" }
```

### Negative Assertions

Use `deny = true` on any assertion for negative verification:
Use `deny = true` on any check to invert the assertion:

```toml
[[checks]]
name = "should-not-contain-error"
command = "file-contains"
file = "output.txt"
contains = "error"
command = { command = "file-contains", file = "output.txt", contains = "error" }
deny = true
```

Expand Down
6 changes: 6 additions & 0 deletions src/cli/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use clap::{Parser, Subcommand};
#[command(name = "skill-bench")]
#[command(about = "TOML-based test runner for skill testing", long_about = None)]
#[command(version)]
#[command(disable_help_subcommand = true)]
pub struct Cli {
#[command(subcommand)]
pub command: Commands,
Expand Down Expand Up @@ -57,6 +58,11 @@ pub enum Commands {
#[arg(short, long)]
verbose: bool,
},
/// Show reference for check types and setup
Help {
/// Check type or "setup" (e.g., skill-invoked, file-contains, db-query)
check_type: Option<String>,
},
}

use std::path::PathBuf;
4 changes: 4 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ mod assertions;
mod cli;
mod models;
mod output;
mod reference;
mod runtime;
mod state;
mod timeline;
Expand Down Expand Up @@ -68,6 +69,9 @@ async fn main() -> Result<()> {
Commands::Timeline { log_file, verbose } => {
timeline::display_timeline(&log_file, verbose)?;
}
Commands::Help { check_type } => {
reference::print_help(check_type.as_deref())?;
}
}

Ok(())
Expand Down
193 changes: 193 additions & 0 deletions src/reference.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
//! Reference documentation for check types and setup

use anyhow::Result;

/// Static reference entry describing one check type for `skill-bench help`.
///
/// Every field borrows `'static` data, so the type is trivially `Copy` and can
/// be compared or debug-printed without allocation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct CheckDoc {
    /// Check type identifier as given on the command line (e.g. `skill-invoked`).
    name: &'static str,
    /// One-line summary shown in the overview listing and the detail view.
    description: &'static str,
    /// Field names listed under "Required fields" in the detail view.
    required: &'static [&'static str],
    /// Field names listed under "Optional fields"; the section is omitted when empty.
    optional: &'static [&'static str],
    /// A single `command = { ... }` TOML line printed beneath a `[[checks]]` header.
    example: &'static str,
}

/// Builds the catalog of documented check types.
///
/// Entries are returned in the order they are listed by `skill-bench help`;
/// a fresh `Vec` is produced on every call.
fn check_docs() -> Vec<CheckDoc> {
    /// Shorthand for checks that have no optional fields.
    const NO_OPTIONAL: &[&str] = &[];

    /// Positional constructor so each catalog row fits on a few lines.
    fn entry(
        name: &'static str,
        description: &'static str,
        required: &'static [&'static str],
        optional: &'static [&'static str],
        example: &'static str,
    ) -> CheckDoc {
        CheckDoc {
            name,
            description,
            required,
            optional,
            example,
        }
    }

    vec![
        // Skill verification
        entry(
            "skill-loaded",
            "Verify a skill was loaded during initialization",
            &["skill"],
            NO_OPTIONAL,
            "command = { command = \"skill-loaded\", skill = \"my-skill\" }",
        ),
        entry(
            "skill-invoked",
            "Verify a skill was invoked during execution",
            &["skill"],
            NO_OPTIONAL,
            "command = { command = \"skill-invoked\", skill = \"my-skill\" }",
        ),
        // MCP verification
        entry(
            "mcp-loaded",
            "Verify an MCP server was loaded",
            &["server"],
            NO_OPTIONAL,
            "command = { command = \"mcp-loaded\", server = \"filesystem\" }",
        ),
        entry(
            "mcp-tool-invoked",
            "Verify an MCP tool was invoked",
            &["tool"],
            NO_OPTIONAL,
            "command = { command = \"mcp-tool-invoked\", tool = \"read_file\" }",
        ),
        entry(
            "mcp-success",
            "Verify MCP tool calls succeeded (no errors)",
            &["tool"],
            NO_OPTIONAL,
            "command = { command = \"mcp-success\", tool = \"read_file\" }",
        ),
        // Tool verification
        entry(
            "tool-use",
            "Verify a tool was called (partial match on tool name)",
            &["tool"],
            NO_OPTIONAL,
            "command = { command = \"tool-use\", tool = \"Read\" }",
        ),
        entry(
            "tool-param",
            "Verify a tool was called with a specific parameter value",
            &["tool", "param"],
            &["value"],
            "command = { command = \"tool-param\", tool = \"Read\", param = \"file_path\", value = \"test.txt\" }",
        ),
        // File verification
        entry(
            "workspace-file",
            "Verify a file exists in the workspace",
            &["path"],
            NO_OPTIONAL,
            "command = { command = \"workspace-file\", path = \"output.txt\" }",
        ),
        entry(
            "workspace-dir",
            "Verify a directory exists in the workspace",
            &["path"],
            NO_OPTIONAL,
            "command = { command = \"workspace-dir\", path = \"output\" }",
        ),
        entry(
            "file-contains",
            "Verify a file contains specific text",
            &["file", "contains"],
            NO_OPTIONAL,
            "command = { command = \"file-contains\", file = \"output.txt\", contains = \"expected text\" }",
        ),
        // Log verification
        entry(
            "log-contains",
            "Verify the log contains a regex pattern",
            &["pattern"],
            NO_OPTIONAL,
            "command = { command = \"log-contains\", pattern = \"error|failed\" }",
        ),
        entry(
            "message-contains",
            "Verify assistant output contains specific text",
            &["text"],
            NO_OPTIONAL,
            "command = { command = \"message-contains\", text = \"expected output\" }",
        ),
        // Database verification
        entry(
            "db-query",
            "Execute a SQL query and verify the result",
            &["query", "expected"],
            &["db"],
            "command = { command = \"db-query\", db = \"patents.db\", query = \"SELECT COUNT(*) FROM patents\", expected = \">0\" }",
        ),
    ]
}

/// Entry point for `skill-bench help [type]`.
///
/// * `None` prints the overview of all check types.
/// * `Some("setup")` prints the setup-step reference.
/// * Any other value is treated as a check type name and looked up.
///
/// # Errors
///
/// Propagates the error from the check lookup when the name is not a known
/// check type.
pub fn print_help(check_type: Option<&str>) -> Result<()> {
    if let Some(topic) = check_type {
        // "setup" is the only topic that is not a check type.
        if topic != "setup" {
            return print_check(topic);
        }
        print_setup();
    } else {
        print_all();
    }
    Ok(())
}

/// Prints the overview: a usage line, one row per check type (name padded to
/// 20 columns, followed by its description), and the extra `setup` topic.
fn print_all() {
    println!("Usage: skill-bench help <type>\n");
    println!("Check types:");
    check_docs()
        .iter()
        .for_each(|entry| println!(" {:<20} {}", entry.name, entry.description));
    println!("\nOther:");
    println!(" {:<20} Setup step documentation", "setup");
}

/// Prints the detailed reference for a single check type.
///
/// Output consists of the name, description, required fields, optional fields
/// (only when there are any), a TOML example, and the same example repeated
/// with `deny = true` appended.
///
/// # Errors
///
/// Returns an error when no catalog entry has a name exactly equal to `name`.
fn print_check(name: &str) -> Result<()> {
    let catalog = check_docs();
    let entry = match catalog.iter().find(|candidate| candidate.name == name) {
        Some(found) => found,
        None => {
            return Err(anyhow::anyhow!(
                "Unknown check type: '{}'\nRun 'skill-bench help' for available types",
                name
            ))
        }
    };

    // The example snippet is shown twice: once as-is and once negated.
    let print_snippet = || {
        println!(" [[checks]]");
        println!(" name = \"check-name\"");
        println!(" {}", entry.example);
    };

    println!("{}\n", entry.name);
    println!(" {}\n", entry.description);

    println!(" Required fields:");
    entry
        .required
        .iter()
        .for_each(|field| println!(" - {}", field));

    let has_optional = !entry.optional.is_empty();
    if has_optional {
        println!("\n Optional fields:");
        entry
            .optional
            .iter()
            .for_each(|field| println!(" - {}", field));
    }

    println!("\n Example:");
    print_snippet();

    println!("\n Negative assertion (deny = true inverts the check):");
    print_snippet();
    println!(" deny = true");

    Ok(())
}

/// Prints the reference text for `[[setup]]` steps: a short introduction, the
/// file-setup form with its required fields, and the script-setup form with
/// its required field.
fn print_setup() {
    // Each element is emitted with a trailing newline; an embedded trailing
    // "\n" therefore produces a blank separator line.
    const LINES: [&str; 17] = [
        "Setup\n",
        " Setup steps run in the test workspace before the test prompt.",
        " Steps are executed in order. Failure in any step fails the test.\n",
        " File setup (creates a file with content):",
        " [[setup]]",
        " name = \"optional-name\"",
        " path = \"file.txt\"",
        " content = \"File content\"\n",
        " Required fields:",
        " - path: File path in workspace",
        " - content: File content to write\n",
        " Script setup (executes a shell command via bash -c):",
        " [[setup]]",
        " name = \"optional-name\"",
        " command = \"echo 'Hello' > greeting.txt\"\n",
        " Required fields:",
        " - command: Shell command to execute",
    ];

    for line in LINES.iter() {
        println!("{}", line);
    }
}
Loading