diff --git a/.devcontainer/post-create.sh b/.devcontainer/post-create.sh
index bbe8e53..59340b1 100644
--- a/.devcontainer/post-create.sh
+++ b/.devcontainer/post-create.sh
@@ -73,6 +73,20 @@ browser_path = "/usr/bin/chromium"
 chrome_args = ["--no-sandbox", "--disable-gpu"]
 EOF
 
+    # Install skill-bench
+    if ! command -v skill-bench >/dev/null 2>&1; then
+        echo "[Devcontainer Setup] Installing skill-bench..."
+        # Download the installer completely before running it: with `curl | sh` a failed or
+        # truncated download is hidden, because the pipeline reports the exit status of sh.
+        if skill_bench_setup="$(curl -fsSL https://raw.githubusercontent.com/sonesuke/skill-bench/main/scripts/setup.sh)"; then
+            printf '%s\n' "$skill_bench_setup" | sh
+        else
+            echo "[Devcontainer Setup] WARNING: failed to download skill-bench installer; skipping" >&2
+        fi
+    else
+        echo "[Devcontainer Setup] skill-bench already installed: $(skill-bench --version 2>/dev/null || echo 'unknown')"
+    fi
+
     echo "[Devcontainer Setup] Complete!"
 else
     echo "Running in CI environment, skipping development setup..."
diff --git a/AGENTS.md b/AGENTS.md
index 16f93f5..3abfbdb 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -49,62 +49,32 @@ mise.toml               # Task definitions (fmt, clippy, test, pre-commit)
 | `mise run test` | Run tests with `cargo test` |
 | `mise run pre-commit` | Run all of the above |
 | `mise run coverage` | Measure code coverage (including subprocesses) |
+| `mise run skill-test` | Run all skill-bench tests |
 
 ## Skill-Bench Testing Framework
 
-Located in `agents/skill-bench/`, this framework tests the Claude Code Plugin skills.
+Test cases are in `tests/`.
 
-### Structure
-
-```
-agents/skill-bench/
-  runner.sh           # Test runner
-  cases/              # Test case definitions (TOML format)
-    arxiv-search/
-      triggering.toml
-      functional.toml
-      functional-with-limit.toml
-    arxiv-fetch/
-      triggering.toml
-      functional.toml
-  tools/              # Check scripts
-    check-mcp-loaded.sh
-    check-mcp-success.sh
-    check-skill-invoked.sh
-    check-skill-loaded.sh
-    check-param.sh
-    check-workspace.sh
-```
-
-### Test Cases
-
-Each test case is defined in TOML format:
+Requires [skill-bench](https://github.com/sonesuke/skill-bench) (set up via post-create script).
 
 ```toml
+name = "test-name"
 description = "Test description"
-check = "check-script-name"
+timeout = 120
 
-[test_prompt]
-text = "The prompt that should trigger the skill"
+test_prompt = """
+English prompt that should trigger the skill
+"""
 
-[[tool_calls]]
-name = "tool_name"
-arguments = { param = "value" }
-```
-
-### Running Tests
-
-```bash
-# Run all tests
-cd agents/skill-bench
-./runner.sh
+[[checks]]
+name = "check-name"
+command = { command = "mcp-success", tool = "tool_name" }
 
-# Run specific skill tests
-./runner.sh "arxiv-search"
-./runner.sh "arxiv-fetch"
-
-# Run multiple trials
-./runner.sh "*" trials=3
+[[checks]]
+name = "param-check"
+command = { command = "tool-param", tool = "tool_name", param = "limit", value = "20" }
 ```
 
+Available check types: `mcp-success`, `mcp-tool-invoked`, `mcp-loaded`, `tool-use`, `tool-param`, `skill-invoked`, `skill-loaded`, `workspace-file`, `workspace-dir`, `file-contains`, `log-contains`, `message-contains`, `db-query`.
+
 **Note:** Test prompts must be in English to ensure consistent skill triggering.
diff --git a/agents/skill-bench/cases/arxiv-fetch/functional.toml b/agents/skill-bench/cases/arxiv-fetch/functional.toml
deleted file mode 100644
index 90d4234..0000000
--- a/agents/skill-bench/cases/arxiv-fetch/functional.toml
+++ /dev/null
@@ -1,9 +0,0 @@
-description = "Test basic arxiv-fetch functionality"
-check = "check-mcp-success.sh"
-
-[test_prompt]
-text = "Fetch the paper with arXiv ID 2301.00001"
-
-[[tool_calls]]
-name = "fetch_paper"
-arguments = { id = "2301.00001" }
diff --git a/agents/skill-bench/cases/arxiv-fetch/triggering.toml b/agents/skill-bench/cases/arxiv-fetch/triggering.toml
deleted file mode 100644
index 1f0345a..0000000
--- a/agents/skill-bench/cases/arxiv-fetch/triggering.toml
+++ /dev/null
@@ -1,9 +0,0 @@
-description = "Verify arxiv-fetch skill is triggered when fetching a paper"
-check = "check-skill-invoked.sh"
-
-[test_prompt]
-text = "Use arxiv-fetch to get paper 2301.00001"
-
-[[tool_calls]]
-name = "arxiv-fetch"
-arguments = { arxiv_id = "2301.00001" }
diff --git a/agents/skill-bench/cases/arxiv-search/functional-with-limit.toml b/agents/skill-bench/cases/arxiv-search/functional-with-limit.toml
deleted file mode 100644
index 632e1b2..0000000
--- a/agents/skill-bench/cases/arxiv-search/functional-with-limit.toml
+++ /dev/null
@@ -1,9 +0,0 @@
-description = "Test arxiv-search with custom limit parameter"
-check = "check-mcp-success.sh"
-
-[test_prompt]
-text = "Use arxiv-search to find 20 papers about machine learning"
-
-[[tool_calls]]
-name = "search_papers"
-arguments = { query = "machine learning", limit = 20 }
diff --git a/agents/skill-bench/cases/arxiv-search/functional.toml b/agents/skill-bench/cases/arxiv-search/functional.toml
deleted file mode 100644
index 5cdbee5..0000000
--- a/agents/skill-bench/cases/arxiv-search/functional.toml
+++ /dev/null
@@ -1,9 +0,0 @@
-description = "Test basic arxiv-search functionality with query and limit"
-check = "check-mcp-success.sh"
-
-[test_prompt]
-text = "Search arXiv for papers about quantum computing, limit to 5 results"
-
-[[tool_calls]]
-name = "search_papers"
-arguments = { query = "quantum computing", limit = 5 }
diff --git a/agents/skill-bench/cases/arxiv-search/triggering.toml b/agents/skill-bench/cases/arxiv-search/triggering.toml
deleted file mode 100644
index a762d9b..0000000
--- a/agents/skill-bench/cases/arxiv-search/triggering.toml
+++ /dev/null
@@ -1,9 +0,0 @@
-description = "Verify arxiv-search skill is triggered when searching for papers"
-check = "check-skill-invoked.sh"
-
-[test_prompt]
-text = "Use arxiv-search to find papers about LLM"
-
-[[tool_calls]]
-name = "arxiv-search"
-arguments = { query = "LLM" }
diff --git a/agents/skill-bench/runner.sh b/agents/skill-bench/runner.sh
deleted file mode 100755
index d2d96fc..0000000
--- a/agents/skill-bench/runner.sh
+++ /dev/null
@@ -1,199 +0,0 @@
-#!/usr/bin/env bash
-# Skill-Bench Test Runner
-# Executes test cases and evaluates results
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-CASES_DIR="$SCRIPT_DIR/cases"
-TOOLS_DIR="$SCRIPT_DIR/tools"
-
-# Colors for output
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-NC='\033[0m' # No Color
-
-# Test results
-PASSED=0
-FAILED=0
-SKIPPED=0
-
-# Usage
-usage() {
-    echo "Usage: $0 [<case-pattern>] [trials=<n>]"
-    echo ""
-    echo "Arguments:"
-    echo "  case-pattern  - Glob pattern for test cases (default: \"*\")"
-    echo "  trials=n      - Number of trials to run (default: 1)"
-    echo ""
-    echo "Examples:"
-    echo "  $0                           # Run all test cases once"
-    echo "  $0 \"arxiv-search\"           # Run arxiv-search test cases"
-    echo "  $0 \"*\" trials=3             # Run all test cases 3 times"
-}
-
-# Parse arguments
-CASE_PATTERN="*"
-TRIALS=1
-
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        *=*)
-            if [[ $1 == trials=* ]]; then
-                TRIALS="${1#trials=}"
-            else
-                echo "Unknown parameter: $1" >&2
-                usage
-                exit 1
-            fi
-            ;;
-        -*)
-            echo "Unknown option: $1" >&2
-            usage
-            exit 1
-            ;;
-        *)
-            CASE_PATTERN="$1"
-            ;;
-    esac
-    shift
-done
-
-# Load test case from TOML file
-load_case() {
-    local case_file="$1"
-    bash -c '
-import toml
-import sys
-data = toml.load(sys.argv[1])
-print("test_prompt=" + data.get("test_prompt", ""))
-print("tool_calls=" + str(len(data.get("tool_calls", []))))
-print("check=" + data.get("check", ""))
-print("description=" + data.get("description", ""))
-for i, tc in enumerate(data.get("tool_calls", [])):
-    print("tool_" + str(i) + "_name=" + tc.get("name", ""))
-    print("tool_" + str(i) + "_arguments=" + str(tc.get("arguments", {})))
-' python3 "$case_file"
-}
-
-# Extract value from loaded case
-get_value() {
-    local -n ref=$1
-    echo "${ref}" | grep "^$2=" | cut -d'=' -f2-
-}
-
-# Run single trial
-run_trial() {
-    local case_file="$1"
-    local trial_num="$2"
-
-    # Load test case
-    local loaded_data
-    loaded_data=$(load_case "$case_file")
-
-    local test_prompt
-    local tool_calls_count
-    local check_script
-    local description
-    test_prompt=$(get_value loaded_data "test_prompt")
-    tool_calls_count=$(get_value loaded_data "tool_calls")
-    check_script=$(get_value loaded_data "check")
-    description=$(get_value loaded_data "description")
-
-    # Parse tool calls
-    declare -a tool_names
-    declare -a tool_args
-    for ((i=0; i<tool_calls_count; i++)); do
-        tool_names[$i]=$(get_value loaded_data "tool_${i}_name")
-        tool_args[$i]=$(get_value loaded_data "tool_${i}_arguments")
-    done
-
-    local case_name
-    case_name=$(basename "$(dirname "$case_file")")
-
-    echo -e "\n${YELLOW}Running: $case_name${NC}"
-    echo "Description: $description"
-    echo "Trial: $trial_num/$TRIALS"
-    echo "Test prompt: $test_prompt"
-
-    # Execute check script
-    local check_script_path="$TOOLS_DIR/$check_script"
-    if [[ ! -f "$check_script_path" ]]; then
-        echo -e "${RED}FAIL: Check script not found: $check_script${NC}"
-        ((FAILED++))
-        return 1
-    fi
-
-    # Run check with test prompt and expected tool calls
-    local check_output
-    check_output=$("$check_script_path" "$test_prompt" "${tool_names[@]}" "${tool_args[@]}" 2>&1)
-    local check_exit_code=$?
-
-    if [[ $check_exit_code -eq 0 ]]; then
-        echo -e "${GREEN}PASS${NC}"
-        ((PASSED++))
-        return 0
-    else
-        echo -e "${RED}FAIL${NC}"
-        echo "$check_output"
-        ((FAILED++))
-        return 1
-    fi
-}
-
-# Run test case
-run_case() {
-    local case_file="$1"
-
-    for ((trial=1; trial<=TRIALS; trial++)); do
-        run_trial "$case_file" "$trial"
-    done
-}
-
-# Find all test cases
-find_cases() {
-    find "$CASES_DIR" -name "*.toml" -path "*/$CASE_PATTERN/*"
-}
-
-# Main
-echo "======================================"
-echo "Skill-Bench Test Runner"
-echo "======================================"
-echo "Case pattern: $CASE_PATTERN"
-echo "Trials: $TRIALS"
-echo ""
-
-# Find and run test cases
-local cases
-cases=()
-while IFS= read -r -d '' case; do
-    cases+=("$case")
-done < <(find "$CASES_DIR" -name "*.toml" -path "*/$CASE_PATTERN/*" -print0)
-
-if [[ ${#cases[@]} -eq 0 ]]; then
-    echo "No test cases found matching pattern: $CASE_PATTERN"
-    exit 1
-fi
-
-for case in "${cases[@]}"; do
-    run_case "$case"
-done
-
-# Summary
-echo ""
-echo "======================================"
-echo "Summary"
-echo "======================================"
-echo "Passed: $PASSED"
-echo "Failed: $FAILED"
-echo "Skipped: $SKIPPED"
-echo ""
-
-if [[ $FAILED -gt 0 ]]; then
-    echo -e "${RED}Some tests failed${NC}"
-    exit 1
-else
-    echo -e "${GREEN}All tests passed${NC}"
-    exit 0
-fi
diff --git a/agents/skill-bench/tools/check-mcp-loaded.sh b/agents/skill-bench/tools/check-mcp-loaded.sh
deleted file mode 100755
index 8c55fc9..0000000
--- a/agents/skill-bench/tools/check-mcp-loaded.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env bash
-# Check if MCP server is loaded
-
-set -euo pipefail
-
-TEST_PROMPT="$1"
-shift
-
-echo "Checking MCP server loaded..."
-
-# This would check if the MCP server is properly loaded
-# For now, we assume it's always loaded in the test environment
-echo "MCP server check: OK"
diff --git a/agents/skill-bench/tools/check-mcp-success.sh b/agents/skill-bench/tools/check-mcp-success.sh
deleted file mode 100755
index 614276a..0000000
--- a/agents/skill-bench/tools/check-mcp-success.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env bash
-# Check if MCP tool call was successful
-
-set -euo pipefail
-
-TEST_PROMPT="$1"
-shift
-
-EXPECTED_TOOL="$1"
-shift
-
-echo "Checking MCP success: $EXPECTED_TOOL"
-
-# This would verify the MCP tool call was successful
-# For now, we assume it's always successful in the test environment
-echo "MCP success check: OK ($EXPECTED_TOOL)"
diff --git a/agents/skill-bench/tools/check-param.sh b/agents/skill-bench/tools/check-param.sh
deleted file mode 100755
index db2d7f0..0000000
--- a/agents/skill-bench/tools/check-param.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env bash
-# Check if the expected parameter was passed to the MCP tool
-
-set -euo pipefail
-
-TEST_PROMPT="$1"
-shift
-
-EXPECTED_TOOL="$1"
-shift
-
-echo "Checking parameter: $EXPECTED_TOOL"
-
-# Parse remaining arguments as key=value pairs
-while [[ $# -gt 0 ]]; do
-    PARAM="$1"
-    # Remove quotes and evaluate as JSON
-    PARAM_VALUE=$(echo "$PARAM" | jq -r '.' 2>/dev/null || echo "$PARAM")
-
-    # Check if parameter contains expected value
-    if [[ -n "$PARAM_VALUE" ]]; then
-        echo "Parameter check: OK ($PARAM_VALUE)"
-    fi
-
-    shift
-done
-
-echo "Parameter check: OK"
diff --git a/agents/skill-bench/tools/check-skill-invoked.sh b/agents/skill-bench/tools/check-skill-invoked.sh
deleted file mode 100755
index 77b67c5..0000000
--- a/agents/skill-bench/tools/check-skill-invoked.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env bash
-# Check if the skill was invoked with correct parameters
-
-set -euo pipefail
-
-TEST_PROMPT="$1"
-shift
-
-# Get expected skill name
-if [[ "$TEST_PROMPT" =~ ([a-z]+-[a-z]+) ]]; then
-    EXPECTED_SKILL="${BASH_REMATCH[1]}"
-else
-    echo "Error: Could not extract skill name from test prompt"
-    exit 1
-fi
-
-echo "Checking skill invocation: $EXPECTED_SKILL"
-
-# This would verify the skill was invoked
-# For now, we assume it's always invoked in the test environment
-echo "Skill invocation check: OK ($EXPECTED_SKILL)"
diff --git a/agents/skill-bench/tools/check-skill-loaded.sh b/agents/skill-bench/tools/check-skill-loaded.sh
deleted file mode 100755
index 5d5ae72..0000000
--- a/agents/skill-bench/tools/check-skill-loaded.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/env bash
-# Check if the expected skill was loaded
-
-set -euo pipefail
-
-TEST_PROMPT="$1"
-shift
-
-# Parse expected skill name from test prompt
-# Example: "Use arxiv-search skill to find papers" -> "arxiv-search"
-if [[ "$TEST_PROMPT" =~ ([a-z]+-[a-z]+) ]]; then
-    EXPECTED_SKILL="${BASH_REMATCH[1]}"
-else
-    echo "Error: Could not extract skill name from test prompt"
-    exit 1
-fi
-
-echo "Checking skill loaded: $EXPECTED_SKILL"
-
-# This would verify the skill is loaded
-# For now, we assume it's always loaded in the test environment
-echo "Skill loaded check: OK ($EXPECTED_SKILL)"
diff --git a/agents/skill-bench/tools/check-workspace.sh b/agents/skill-bench/tools/check-workspace.sh
deleted file mode 100755
index 5ce6933..0000000
--- a/agents/skill-bench/tools/check-workspace.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env bash
-# Check if workspace was used correctly
-
-set -euo pipefail
-
-TEST_PROMPT="$1"
-shift
-
-echo "Checking workspace..."
-
-# This would verify the workspace was used correctly
-# For now, we assume it's always correct in the test environment
-echo "Workspace check: OK"
diff --git a/claude-plugin/skills/arxiv-fetch/SKILL.md b/claude-plugin/skills/arxiv-fetch/SKILL.md
index 2206cab..e9433e3 100644
--- a/claude-plugin/skills/arxiv-fetch/SKILL.md
+++ b/claude-plugin/skills/arxiv-fetch/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: arxiv-fetch
-description: "Get complete paper details including title, authors, summary, abstract, and publication date from arXiv. Use when the user provides an arXiv ID and needs full paper information."
+description: "Get complete paper details including title, authors, summary, and publication date from arXiv. Use when the user provides an arXiv ID and needs full paper information. Always use this skill for any paper fetch request — even when specific parameters are provided."
 metadata:
   author: sonesuke
   version: 1.0.0
@@ -10,27 +10,88 @@ agent: general-purpose
 
 # ArXiv Fetch
 
-Fetch detailed information about a specific paper from arXiv by its ID.
+Fetch detailed information about a specific paper from arXiv using the arxiv-cli MCP server.
+
+## Purpose
+
+Retrieve paper metadata, abstract, and optionally download the raw PDF.
+
+## MCP Tool
+
+Uses `fetch_paper` MCP tool provided by arxiv-cli.
 
 ## Usage
 
+Fetch a paper, then use the returned `dataset` name to query with Cypher:
+
 ```
-arxiv-fetch <arxiv_id>
+fetch_paper({
+  id: "2301.00001"
+})
+# Returns dataset name like "fetch_abc123"
+# Then query with execute_cypher:
+execute_cypher({
+  dataset: "fetch_abc123",
+  query: "MATCH (p:papers) RETURN p.title, p.authors, p.summary"
+})
 ```
 
-## Arguments
+**CRITICAL**: After fetching, always use `execute_cypher` to retrieve results.
+Do NOT read the output JSON file directly. The JSON file is an internal
+artifact — all data is available through cypher queries.
 
-- `arxiv_id` (required): The arXiv ID of the paper (e.g., "2301.00001", "cs.AI/2301.00001")
+### Result Retrieval Patterns
 
-## Examples
+Use these cypher patterns to retrieve paper details:
+
+**Full paper details**:
+```cypher
+MATCH (p:papers) RETURN p.id, p.title, p.authors, p.summary, p.published_date, p.url, p.pdf_url
+```
+
+**Title and authors only**:
+```cypher
+MATCH (p:papers) RETURN p.title, p.authors
+```
 
+**Abstract/summary only**:
+```cypher
+MATCH (p:papers) RETURN p.summary
+```
+
+**Description paragraphs**:
+```cypher
+MATCH (p:papers) RETURN p.description_paragraphs
+```
+
+### Available Paper Node Fields
+
+| Field | Description |
+| --- | --- |
+| `id` | arXiv ID (e.g., "2301.00001") |
+| `title` | Paper title |
+| `authors` | Array of author names |
+| `summary` | Abstract/summary text |
+| `published_date` | Publication date |
+| `url` | arXiv URL |
+| `pdf_url` | Direct PDF link |
+| `description_paragraphs` | Array of paragraph objects (number, id, text) |
+
+### PDF Download
+
+To download the raw PDF:
+```
+fetch_paper({
+  id: "2512.04518",
+  raw: true
+})
 ```
-arxiv-fetch "2301.00001"
-arxiv-fetch "cs.AI/2301.00001"
+Then retrieve the PDF path with:
+```cypher
+MATCH (p:results) RETURN p.id, p.pdf_path
 ```
 
-## Notes
+## Parameters
 
-- The paper details are automatically cached (up to 100 recent fetches)
-- Same arxiv_id will return cached results instantly
-- Returns full metadata including title, authors, summary, and description paragraphs
+- `id` (string, required): arXiv ID of the paper (e.g., "2301.00001", "2512.04518")
+- `raw` (boolean, optional): If true, downloads the raw PDF to a local temporary file and returns its path
diff --git a/claude-plugin/skills/arxiv-search/SKILL.md b/claude-plugin/skills/arxiv-search/SKILL.md
index d626d11..623748d 100644
--- a/claude-plugin/skills/arxiv-search/SKILL.md
+++ b/claude-plugin/skills/arxiv-search/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: arxiv-search
-description: "Search arXiv for academic papers by query, with optional limit and date filters. Results are cached and can be analyzed with Cypher queries."
+description: "Search arXiv for academic papers by query, with optional limit, date, and category filters. Always use this skill for any paper search request — even when specific parameters are provided."
 metadata:
   author: sonesuke
   version: 1.0.0
@@ -10,29 +10,100 @@ agent: general-purpose
 
 # ArXiv Search
 
-Search arXiv for academic papers matching your query. Results are cached for efficient repeated queries.
+Search for papers from arXiv using the arxiv-cli MCP server.
+
+## Purpose
+
+Execute paper searches with various filters including query, category, date range, and result limits.
+
+## MCP Tool
+
+Uses `search_papers` MCP tool provided by arxiv-cli.
 
 ## Usage
 
+Search for papers, then use the returned `dataset` name to query with Cypher:
+
 ```
-arxiv-search <query> [limit]
+search_papers({
+  query: "machine learning",
+  limit: 20
+})
+# Returns dataset name like "search_abc123"
+# Then query with execute_cypher:
+execute_cypher({
+  dataset: "search_abc123",
+  query: "MATCH (p:papers) RETURN p.id, p.title, p.authors LIMIT 5"
+})
 ```
 
-## Arguments
+**CRITICAL**:
+1. When the user specifies a number of papers, ALWAYS pass it as the `limit` parameter to `search_papers`. Do NOT rely on cypher LIMIT — `limit` controls how many papers are fetched from the arXiv API.
+2. After searching, always use `execute_cypher` to retrieve results. Do NOT read the output JSON file directly.
 
-- `query` (required): The search query (e.g., "LLM", "quantum computing")
-- `limit` (optional): Maximum number of results to return (default: 10)
+### Result Retrieval Patterns
+
+Use these cypher patterns to retrieve search results:
+
+**Total count**:
+```cypher
+MATCH (p:papers) RETURN COUNT(*) AS count
+```
+
+**Top 20 papers for overview**:
+```cypher
+MATCH (p:papers) RETURN p.id, p.title, p.authors, p.published_date LIMIT 20
+```
 
-## Examples
+**Papers by specific author**:
+```cypher
+MATCH (p:papers) WHERE "Smith, John" IN p.authors RETURN p.id, p.title
+```
+
+**Date range summary**:
+```cypher
+MATCH (p:papers) RETURN p.published_date, p.title LIMIT 10
+```
+
+### Available Paper Node Fields
+
+| Field | Description |
+| --- | --- |
+| `id` | arXiv ID (e.g., "2301.00001") |
+| `title` | Paper title |
+| `authors` | Array of author names |
+| `summary` | Abstract/summary text |
+| `published_date` | Publication date |
+| `url` | arXiv URL |
+| `pdf_url` | Direct PDF link |
+| `description_paragraphs` | Array of paragraph objects (number, id, text) |
+
+### Filter Examples
+
+Search with category and date filters:
+```
+search_papers({
+  query: "quantum computing",
+  category: "cs.AI",
+  after: "2025-01-01",
+  limit: 10
+})
+```
 
+Search with date range:
 ```
-arxiv-search "LLM" 10
-arxiv-search "quantum computing" 5
-arxiv-search "neural networks"
+search_papers({
+  query: "neural network",
+  after: "2024-01-01",
+  before: "2024-12-31",
+  limit: 20
+})
 ```
 
-## Notes
+## Parameters
 
-- The search results are automatically cached (up to 100 recent queries)
-- Same query parameters will return cached results instantly
-- Use the returned dataset name with Cypher queries for filtering
+- `query` (string, required): Free-text search query
+- `limit` (number, optional): Maximum number of results (default: 10). Pass the number the user specifies.
+- `category` (string, optional): Filter by arXiv category (e.g., "cs.AI", "physics.quant-ph", "math.NA")
+- `after` (string, optional): Only include papers published after this date (YYYY-MM-DD)
+- `before` (string, optional): Only include papers published before this date (YYYY-MM-DD)
diff --git a/mise.toml b/mise.toml
index c7de692..0c4a63c 100644
--- a/mise.toml
+++ b/mise.toml
@@ -17,3 +17,6 @@ cargo llvm-cov clean --workspace
 cargo test
 cargo llvm-cov report --summary-only
 """
+
+[tasks.skill-test]
+run = "cargo install --path . && skill-bench run tests --plugin-dir claude-plugin"
diff --git a/src/mcp/mod.rs b/src/mcp/mod.rs
index 38b9d46..ad3eec7 100644
--- a/src/mcp/mod.rs
+++ b/src/mcp/mod.rs
@@ -186,6 +186,18 @@ impl ArxivHandler {
                 ErrorData::internal_error(format!("Failed to search arXiv: {}", e), None)
             })?;
 
+        // Return early if no results — CypherEngine cannot build from empty array
+        if papers.is_empty() {
+            let result = serde_json::json!({
+                "dataset": dataset,
+                "count": 0,
+                "message": "No papers found matching the query"
+            });
+            return serde_json::to_string_pretty(&result).map_err(|e| {
+                ErrorData::internal_error(format!("Failed to serialize result: {}", e), None)
+            });
+        }
+
         // Create CypherEngine with auto-detection
         // Wrap in object so cypher-rs can detect a named node path
         let json_value = serde_json::json!({
diff --git a/agents/skill-bench/.gitignore b/tests/.gitignore
similarity index 68%
rename from agents/skill-bench/.gitignore
rename to tests/.gitignore
index 0acc00d..0e9b581 100644
--- a/agents/skill-bench/.gitignore
+++ b/tests/.gitignore
@@ -1,3 +1,4 @@
 # Test results
 *.log
 results/
+.skill-bench/
diff --git a/tests/arxiv-fetch/functional-with-raw.toml b/tests/arxiv-fetch/functional-with-raw.toml
new file mode 100644
index 0000000..2cfa99f
--- /dev/null
+++ b/tests/arxiv-fetch/functional-with-raw.toml
@@ -0,0 +1,27 @@
+name = "arxiv-fetch-functional-with-raw"
+description = "Test arxiv-fetch with raw PDF download"
+timeout = 120
+
+test_prompt = """
+Use arxiv-fetch to download the PDF for paper 2301.00001
+"""
+
+[[checks]]
+name = "skill-invoked"
+command = { command = "skill-invoked", skill = "arxiv-fetch" }
+
+[[checks]]
+name = "mcp-success"
+command = { command = "mcp-success", tool = "fetch_paper" }
+
+[[checks]]
+name = "id-param"
+command = { command = "tool-param", tool = "fetch_paper", param = "id", value = "2301.00001" }
+
+[[checks]]
+name = "cypher-invoked"
+command = { command = "mcp-tool-invoked", tool = "execute_cypher" }
+
+[[checks]]
+name = "cypher-success"
+command = { command = "mcp-success", tool = "execute_cypher" }
diff --git a/tests/arxiv-fetch/functional.toml b/tests/arxiv-fetch/functional.toml
new file mode 100644
index 0000000..8211f7e
--- /dev/null
+++ b/tests/arxiv-fetch/functional.toml
@@ -0,0 +1,27 @@
+name = "arxiv-fetch-functional"
+description = "Test basic arxiv-fetch functionality"
+timeout = 120
+
+test_prompt = """
+Use arxiv-fetch to get paper 2301.00001
+"""
+
+[[checks]]
+name = "skill-invoked"
+command = { command = "skill-invoked", skill = "arxiv-fetch" }
+
+[[checks]]
+name = "mcp-success"
+command = { command = "mcp-success", tool = "fetch_paper" }
+
+[[checks]]
+name = "id-param"
+command = { command = "tool-param", tool = "fetch_paper", param = "id", value = "2301.00001" }
+
+[[checks]]
+name = "cypher-invoked"
+command = { command = "mcp-tool-invoked", tool = "execute_cypher" }
+
+[[checks]]
+name = "cypher-success"
+command = { command = "mcp-success", tool = "execute_cypher" }
diff --git a/tests/arxiv-fetch/triggering.toml b/tests/arxiv-fetch/triggering.toml
new file mode 100644
index 0000000..0c54efc
--- /dev/null
+++ b/tests/arxiv-fetch/triggering.toml
@@ -0,0 +1,11 @@
+name = "arxiv-fetch-triggering"
+description = "Verify arxiv-fetch skill is triggered when fetching a paper"
+timeout = 120
+
+test_prompt = """
+Use arxiv-fetch to get paper 2301.00001
+"""
+
+[[checks]]
+name = "skill-invoked"
+command = { command = "skill-invoked", skill = "arxiv-fetch" }
diff --git a/tests/arxiv-search/functional-with-category-and-date.toml b/tests/arxiv-search/functional-with-category-and-date.toml
new file mode 100644
index 0000000..5ccc282
--- /dev/null
+++ b/tests/arxiv-search/functional-with-category-and-date.toml
@@ -0,0 +1,35 @@
+name = "arxiv-search-functional-with-category-and-date"
+description = "Test arxiv-search with combined category and date filters"
+timeout = 120
+
+test_prompt = """
+Use arxiv-search to find papers about reinforcement learning in cs.LG published after 2024-06-01
+"""
+
+[[checks]]
+name = "skill-invoked"
+command = { command = "skill-invoked", skill = "arxiv-search" }
+
+[[checks]]
+name = "mcp-success"
+command = { command = "mcp-success", tool = "search_papers" }
+
+[[checks]]
+name = "query-param"
+command = { command = "tool-param", tool = "search_papers", param = "query", value = "reinforcement learning" }
+
+[[checks]]
+name = "category-param"
+command = { command = "tool-param", tool = "search_papers", param = "category", value = "cs.LG" }
+
+[[checks]]
+name = "after-param"
+command = { command = "tool-param", tool = "search_papers", param = "after", value = "2024-06-01" }
+
+[[checks]]
+name = "cypher-invoked"
+command = { command = "mcp-tool-invoked", tool = "execute_cypher" }
+
+[[checks]]
+name = "cypher-success"
+command = { command = "mcp-success", tool = "execute_cypher" }
diff --git a/tests/arxiv-search/functional-with-category.toml b/tests/arxiv-search/functional-with-category.toml
new file mode 100644
index 0000000..38d0360
--- /dev/null
+++ b/tests/arxiv-search/functional-with-category.toml
@@ -0,0 +1,31 @@
+name = "arxiv-search-functional-with-category"
+description = "Test arxiv-search with category filter"
+timeout = 120
+
+test_prompt = """
+Use arxiv-search to find papers about transformers in cs.AI
+"""
+
+[[checks]]
+name = "skill-invoked"
+command = { command = "skill-invoked", skill = "arxiv-search" }
+
+[[checks]]
+name = "mcp-success"
+command = { command = "mcp-success", tool = "search_papers" }
+
+[[checks]]
+name = "query-param"
+command = { command = "tool-param", tool = "search_papers", param = "query", value = "transformers" }
+
+[[checks]]
+name = "category-param"
+command = { command = "tool-param", tool = "search_papers", param = "category", value = "cs.AI" }
+
+[[checks]]
+name = "cypher-invoked"
+command = { command = "mcp-tool-invoked", tool = "execute_cypher" }
+
+[[checks]]
+name = "cypher-success"
+command = { command = "mcp-success", tool = "execute_cypher" }
diff --git a/tests/arxiv-search/functional-with-date.toml b/tests/arxiv-search/functional-with-date.toml
new file mode 100644
index 0000000..7e0b326
--- /dev/null
+++ b/tests/arxiv-search/functional-with-date.toml
@@ -0,0 +1,31 @@
+name = "arxiv-search-functional-with-date"
+description = "Test arxiv-search with date filter"
+timeout = 120
+
+test_prompt = """
+Use arxiv-search to find papers about large language models published after 2025-01-01
+"""
+
+[[checks]]
+name = "skill-invoked"
+command = { command = "skill-invoked", skill = "arxiv-search" }
+
+[[checks]]
+name = "mcp-success"
+command = { command = "mcp-success", tool = "search_papers" }
+
+[[checks]]
+name = "query-param"
+command = { command = "tool-param", tool = "search_papers", param = "query", value = "large language models" }
+
+[[checks]]
+name = "after-param"
+command = { command = "tool-param", tool = "search_papers", param = "after", value = "2025-01-01" }
+
+[[checks]]
+name = "cypher-invoked"
+command = { command = "mcp-tool-invoked", tool = "execute_cypher" }
+
+[[checks]]
+name = "cypher-success"
+command = { command = "mcp-success", tool = "execute_cypher" }
diff --git a/tests/arxiv-search/functional-with-limit.toml b/tests/arxiv-search/functional-with-limit.toml
new file mode 100644
index 0000000..281d6d9
--- /dev/null
+++ b/tests/arxiv-search/functional-with-limit.toml
@@ -0,0 +1,31 @@
+name = "arxiv-search-functional-with-limit"
+description = "Test arxiv-search with custom limit parameter"
+timeout = 120
+
+test_prompt = """
+Use arxiv-search to find 20 papers about machine learning
+"""
+
+[[checks]]
+name = "skill-invoked"
+command = { command = "skill-invoked", skill = "arxiv-search" }
+
+[[checks]]
+name = "mcp-success"
+command = { command = "mcp-success", tool = "search_papers" }
+
+[[checks]]
+name = "query-param"
+command = { command = "tool-param", tool = "search_papers", param = "query", value = "machine learning" }
+
+[[checks]]
+name = "limit-param"
+command = { command = "tool-param", tool = "search_papers", param = "limit", value = "20" }
+
+[[checks]]
+name = "cypher-invoked"
+command = { command = "mcp-tool-invoked", tool = "execute_cypher" }
+
+[[checks]]
+name = "cypher-success"
+command = { command = "mcp-success", tool = "execute_cypher" }
diff --git a/tests/arxiv-search/functional.toml b/tests/arxiv-search/functional.toml
new file mode 100644
index 0000000..9babea6
--- /dev/null
+++ b/tests/arxiv-search/functional.toml
@@ -0,0 +1,27 @@
+name = "arxiv-search-functional"
+description = "Test basic arxiv-search functionality"
+timeout = 120
+
+test_prompt = """
+Use arxiv-search to find papers about quantum computing
+"""
+
+[[checks]]
+name = "skill-invoked"
+command = { command = "skill-invoked", skill = "arxiv-search" }
+
+[[checks]]
+name = "mcp-success"
+command = { command = "mcp-success", tool = "search_papers" }
+
+[[checks]]
+name = "query-param"
+command = { command = "tool-param", tool = "search_papers", param = "query", value = "quantum computing" }
+
+[[checks]]
+name = "cypher-invoked"
+command = { command = "mcp-tool-invoked", tool = "execute_cypher" }
+
+[[checks]]
+name = "cypher-success"
+command = { command = "mcp-success", tool = "execute_cypher" }
diff --git a/tests/arxiv-search/triggering.toml b/tests/arxiv-search/triggering.toml
new file mode 100644
index 0000000..b50ee77
--- /dev/null
+++ b/tests/arxiv-search/triggering.toml
@@ -0,0 +1,11 @@
+name = "arxiv-search-triggering"
+description = "Verify arxiv-search skill is triggered when searching for papers"
+timeout = 120
+
+test_prompt = """
+Use arxiv-search to find papers about LLM
+"""
+
+[[checks]]
+name = "skill-invoked"
+command = { command = "skill-invoked", skill = "arxiv-search" }