From 5d220a766da84b4602f3a7b72502eeba813194df Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 4 Apr 2026 02:46:38 +0000 Subject: [PATCH] feat: add reference command and fix README check documentation - Add `skill-bench help <check-type>` subcommand for check/setup docs - Fix README TOML examples to use inline table format - Remove non-existent checks (file-content, output-contains, text-contains, skill-not-invoked) - Add missing checks (message-contains, tool-param with value) - Add proper TOML examples for each check category Co-Authored-By: Claude Opus 4.6 --- README.md | 96 +++++++++++++++++------ src/cli/args.rs | 6 ++ src/main.rs | 4 + src/reference.rs | 193 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 275 insertions(+), 24 deletions(-) create mode 100644 src/reference.rs diff --git a/README.md b/README.md index dada6b7..e9d1562 100644 --- a/README.md +++ b/README.md @@ -151,8 +151,7 @@ command = "mkdir -p subdir && echo 'done' > subdir/file.txt" [[checks]] name = "check_name" -command = "skill-invoked" -skill = "skill-name" +command = { command = "skill-invoked", skill = "skill-name" } [answers] "question_key" = "answer_value" @@ -180,49 +179,98 @@ name = "optional-descriptive-name" command = "echo 'Hello' > greeting.txt && mkdir -p output" ``` -## Assertion Reference +## Check Reference -Assertions use structured TOML format: +Run `skill-bench help <check-type>` for detailed help on any check type. 
### Skill Verification -- `skill-loaded` - Skill was loaded -- `skill-invoked` - Skill was invoked -- `skill-not-invoked` - Skill was NOT invoked + +- `skill-loaded` — Skill was loaded +- `skill-invoked` — Skill was invoked + +```toml +[[checks]] +name = "check-name" +command = { command = "skill-invoked", skill = "my-skill" } +``` ### MCP Verification -- `mcp-loaded` - MCP server was loaded -- `mcp-tool-invoked` - MCP tool was invoked -- `mcp-success` - MCP tool succeeded + +- `mcp-loaded` — MCP server was loaded +- `mcp-tool-invoked` — MCP tool was invoked +- `mcp-success` — MCP tool succeeded + +```toml +[[checks]] +name = "check-name" +command = { command = "mcp-loaded", server = "filesystem" } +``` ### Tool Verification -- `tool-use` - Tool was used -- `param` - Parameter value verification + +- `tool-use` — Tool was called (partial match) +- `tool-param` — Tool was called with a specific parameter + +```toml +[[checks]] +name = "check-name" +command = { command = "tool-use", tool = "Read" } + +[[checks]] +name = "check-param" +command = { command = "tool-param", tool = "Read", param = "file_path", value = "test.txt" } +``` ### File Verification -- `file-content` - Verify file content -- `file-contains` - File contains string -- `workspace-file` - File exists -- `workspace-dir` - Directory exists + +- `workspace-file` — File exists in workspace +- `workspace-dir` — Directory exists in workspace +- `file-contains` — File contains string + +```toml +[[checks]] +name = "check-name" +command = { command = "workspace-file", path = "output.txt" } + +[[checks]] +name = "check-content" +command = { command = "file-contains", file = "output.txt", contains = "expected text" } +``` ### Log Verification -- `output-contains` - Output contains string -- `log-contains` - Log contains pattern -- `text-contains` - Text content search + +- `log-contains` — Log contains regex pattern +- `message-contains` — Assistant output contains text + +```toml +[[checks]] +name = 
"check-name" +command = { command = "log-contains", pattern = "error|failed" } + +[[checks]] +name = "check-output" +command = { command = "message-contains", text = "expected output" } +``` ### Database Verification -- `db-query` - SQL query result verification + +- `db-query` — SQL query result verification - Numeric comparisons: `">0"`, `">=5"`, `"=10"`, `"<3"`, `"<=2"` +```toml +[[checks]] +name = "check-name" +command = { command = "db-query", db = "patents.db", query = "SELECT COUNT(*) FROM patents", expected = ">0" } +``` + ### Negative Assertions -Use `deny = true` on any assertion for negative verification: +Use `deny = true` on any check to invert the assertion: ```toml [[checks]] name = "should-not-contain-error" -command = "file-contains" -file = "output.txt" -contains = "error" +command = { command = "file-contains", file = "output.txt", contains = "error" } deny = true ``` diff --git a/src/cli/args.rs b/src/cli/args.rs index 73bf1e7..6785331 100644 --- a/src/cli/args.rs +++ b/src/cli/args.rs @@ -6,6 +6,7 @@ use clap::{Parser, Subcommand}; #[command(name = "skill-bench")] #[command(about = "TOML-based test runner for skill testing", long_about = None)] #[command(version)] +#[command(disable_help_subcommand = true)] pub struct Cli { #[command(subcommand)] pub command: Commands, @@ -57,6 +58,11 @@ pub enum Commands { #[arg(short, long)] verbose: bool, }, + /// Show reference for check types and setup + Help { + /// Check type or "setup" (e.g., skill-invoked, file-contains, db-query) + check_type: Option<String>, + }, } use std::path::PathBuf; diff --git a/src/main.rs b/src/main.rs index 20dedb7..9b72378 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,6 +2,7 @@ mod assertions; mod cli; mod models; mod output; +mod reference; mod runtime; mod state; mod timeline; @@ -68,6 +69,9 @@ async fn main() -> Result<()> { Commands::Timeline { log_file, verbose } => { timeline::display_timeline(&log_file, verbose)?; } + Commands::Help { check_type } => { + 
reference::print_help(check_type.as_deref())?; + } } Ok(())
diff --git a/src/reference.rs b/src/reference.rs
new file mode 100644
index 0000000..5323f9f
--- /dev/null
+++ b/src/reference.rs
@@ -0,0 +1,193 @@
+//! Reference documentation for check types and setup
+
+use anyhow::Result;
+
+struct CheckDoc {
+    name: &'static str,
+    description: &'static str,
+    required: &'static [&'static str],
+    optional: &'static [&'static str],
+    example: &'static str,
+}
+
+fn check_docs() -> Vec<CheckDoc> {
+    vec![
+        CheckDoc {
+            name: "skill-loaded",
+            description: "Verify a skill was loaded during initialization",
+            required: &["skill"],
+            optional: &[],
+            example: "command = { command = \"skill-loaded\", skill = \"my-skill\" }",
+        },
+        CheckDoc {
+            name: "skill-invoked",
+            description: "Verify a skill was invoked during execution",
+            required: &["skill"],
+            optional: &[],
+            example: "command = { command = \"skill-invoked\", skill = \"my-skill\" }",
+        },
+        CheckDoc {
+            name: "mcp-loaded",
+            description: "Verify an MCP server was loaded",
+            required: &["server"],
+            optional: &[],
+            example: "command = { command = \"mcp-loaded\", server = \"filesystem\" }",
+        },
+        CheckDoc {
+            name: "mcp-tool-invoked",
+            description: "Verify an MCP tool was invoked",
+            required: &["tool"],
+            optional: &[],
+            example: "command = { command = \"mcp-tool-invoked\", tool = \"read_file\" }",
+        },
+        CheckDoc {
+            name: "mcp-success",
+            description: "Verify MCP tool calls succeeded (no errors)",
+            required: &["tool"],
+            optional: &[],
+            example: "command = { command = \"mcp-success\", tool = \"read_file\" }",
+        },
+        CheckDoc {
+            name: "tool-use",
+            description: "Verify a tool was called (partial match on tool name)",
+            required: &["tool"],
+            optional: &[],
+            example: "command = { command = \"tool-use\", tool = \"Read\" }",
+        },
+        CheckDoc {
+            name: "tool-param",
+            description: "Verify a tool was called with a specific parameter value",
+            required: &["tool", "param"],
+            optional: &["value"],
+            example: "command = { command = \"tool-param\", tool = \"Read\", param = \"file_path\", value = \"test.txt\" }",
+        },
+        CheckDoc {
+            name: "workspace-file",
+            description: "Verify a file exists in the workspace",
+            required: &["path"],
+            optional: &[],
+            example: "command = { command = \"workspace-file\", path = \"output.txt\" }",
+        },
+        CheckDoc {
+            name: "workspace-dir",
+            description: "Verify a directory exists in the workspace",
+            required: &["path"],
+            optional: &[],
+            example: "command = { command = \"workspace-dir\", path = \"output\" }",
+        },
+        CheckDoc {
+            name: "file-contains",
+            description: "Verify a file contains specific text",
+            required: &["file", "contains"],
+            optional: &[],
+            example: "command = { command = \"file-contains\", file = \"output.txt\", contains = \"expected text\" }",
+        },
+        CheckDoc {
+            name: "log-contains",
+            description: "Verify the log contains a regex pattern",
+            required: &["pattern"],
+            optional: &[],
+            example: "command = { command = \"log-contains\", pattern = \"error|failed\" }",
+        },
+        CheckDoc {
+            name: "message-contains",
+            description: "Verify assistant output contains specific text",
+            required: &["text"],
+            optional: &[],
+            example: "command = { command = \"message-contains\", text = \"expected output\" }",
+        },
+        CheckDoc {
+            name: "db-query",
+            description: "Execute a SQL query and verify the result",
+            required: &["query", "expected"],
+            optional: &["db"],
+            example: "command = { command = \"db-query\", db = \"patents.db\", query = \"SELECT COUNT(*) FROM patents\", expected = \">0\" }",
+        },
+    ]
+}
+
+pub fn print_help(check_type: Option<&str>) -> Result<()> {
+    match check_type {
+        None => {
+            print_all();
+            Ok(())
+        }
+        Some("setup") => {
+            print_setup();
+            Ok(())
+        }
+        Some(name) => print_check(name),
+    }
+}
+
+fn print_all() {
+    println!("Usage: skill-bench help <check-type>\n");
+    println!("Check types:");
+    for doc in check_docs() {
+        println!(" {:<20} {}", doc.name, doc.description);
+    }
+    println!("\nOther:");
+    println!(" {:<20} Setup step documentation", "setup");
+}
+
+fn print_check(name: &str) -> Result<()> {
+    let docs = check_docs();
+    let doc = docs.iter().find(|d| d.name == name).ok_or_else(|| {
+        anyhow::anyhow!(
+            "Unknown check type: '{}'\nRun 'skill-bench help' for available types",
+            name
+        )
+    })?;
+
+    println!("{}\n", doc.name);
+    println!(" {}\n", doc.description);
+
+    println!(" Required fields:");
+    for field in doc.required {
+        println!(" - {}", field);
+    }
+
+    if !doc.optional.is_empty() {
+        println!("\n Optional fields:");
+        for field in doc.optional {
+            println!(" - {}", field);
+        }
+    }
+
+    println!("\n Example:");
+    println!(" [[checks]]");
+    println!(" name = \"check-name\"");
+    println!(" {}", doc.example);
+
+    println!("\n Negative assertion (deny = true inverts the check):");
+    println!(" [[checks]]");
+    println!(" name = \"check-name\"");
+    println!(" {}", doc.example);
+    println!(" deny = true");
+
+    Ok(())
+}
+
+fn print_setup() {
+    println!("Setup\n");
+    println!(" Setup steps run in the test workspace before the test prompt.");
+    println!(" Steps are executed in order. Failure in any step fails the test.\n");
+
+    println!(" File setup (creates a file with content):");
+    println!(" [[setup]]");
+    println!(" name = \"optional-name\"");
+    println!(" path = \"file.txt\"");
+    println!(" content = \"File content\"\n");
+
+    println!(" Required fields:");
+    println!(" - path: File path in workspace");
+    println!(" - content: File content to write\n");
+
+    println!(" Script setup (executes a shell command via bash -c):");
+    println!(" [[setup]]");
+    println!(" name = \"optional-name\"");
+    println!(" command = \"echo 'Hello' > greeting.txt\"\n");
+
+    println!(" Required fields:");
+    println!(" - command: Shell command to execute");
+}