diff --git a/CHANGELOG.md b/CHANGELOG.md index e3db0de..7e4613d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,30 @@ The format is inspired by Keep a Changelog and this project follows Semantic Ver - (none yet) +## 0.13.0 - 2026-04-05 + +### Added + +- Discrete confidence label model (`FindingConfidence` enum: Informational, Possible, Likely, Confirmed) derived from continuous confidence float using calibrated thresholds (#85). +- `confidence_label` field on all findings, auto-derived from confidence float via `confidence_to_label()`. +- `Finding::new()` constructor that auto-populates confidence label; `with_derived_label()` backfill method for existing findings. +- Declarative investigation templates replacing hardcoded keyword matching in `investigation_plan()` (#84). +- Six built-in investigation templates: broad-host-triage, ssh-key-investigation, persistence-analysis, network-exposure-audit, privilege-escalation-check, file-integrity-check. +- `resolve_investigation_template()` scores templates by keyword match count and falls back to broad-host-triage. +- Investigation templates shown in `--list-task-templates` output with tool lists. +- Task scope validation: detects out-of-scope tasks (cloud/AWS/Azure/GCP/Kubernetes/etc.) and emits an info finding instead of misleading host data (#83). +- `FindingRelevance` enum (Primary/Supplementary) for tagging finding relevance to the user's task (#86). +- `relevance` field on all findings; template-primary tools produce Primary findings, others produce Supplementary. +- `supplementary_findings` array in run report for findings separated in compact mode. +- Compact output mode now moves supplementary findings to `supplementary_findings` array; full mode shows all with relevance tags. +- 22 new tests covering confidence labels, template resolution, scope validation, and finding filtering. 
+ +### Changed + +- `investigation_plan()` replaced by template-based `resolve_investigation_template()` — tool selection is now declarative and extensible. +- Run report JSON schema updated with `confidence_label`, `relevance`, and `supplementary_findings` fields. +- Integration tests updated to find tool turns by tool name rather than assuming fixed order. + ## 0.12.0 - 2026-04-05 ### Added diff --git a/cli/src/main.rs b/cli/src/main.rs index 98ba383..f4869ca 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -478,9 +478,9 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{Parser, ValueEnum}; use core_engine::agent::Agent; use core_engine::{ - classify_capability, CoverageBaseline, EvidencePointer, Finding, FindingSeverity, - LiveFailureReasonCount, LiveFallbackDecision, LiveRunMetrics, ModelCapabilityReport, - ModelCapabilityTier, RunReport, + builtin_investigation_templates, classify_capability, CoverageBaseline, EvidencePointer, + Finding, FindingSeverity, LiveFailureReasonCount, LiveFallbackDecision, LiveRunMetrics, + ModelCapabilityReport, ModelCapabilityTier, RunReport, }; use cyber_tools::{ToolRegistry, ToolSpec}; use inference_bridge::onnx_vitis::{inspect_runtime_compatibility, RuntimeCompatibilitySeverity}; @@ -2083,7 +2083,7 @@ fn render_task_template_list() -> String { let templates = task_template_descriptors(); let _ = writeln!(output, "WraithRun Task Templates"); - for descriptor in templates { + for descriptor in &templates { let _ = writeln!(output, "- {}: {}", descriptor.name, descriptor.prompt); if descriptor.supports_template_target && descriptor.supports_template_lines { let default_target = descriptor.default_target.unwrap_or("(none)"); @@ -2104,6 +2104,13 @@ fn render_task_template_list() -> String { } } + let _ = writeln!(output); + let _ = writeln!(output, "Investigation Templates"); + for template in builtin_investigation_templates() { + let _ = writeln!(output, "- {}: {}", template.name, template.description); + let _ = 
writeln!(output, " tools: {}", template.tools.join(", ")); + } + output.trim_end().to_string() } @@ -4377,6 +4384,34 @@ fn render_json_compact(report: &RunReport) -> Result { object.remove("turns"); + // In compact mode, move supplementary findings to supplementary_findings (#86). + if let Some(findings_val) = object.remove("findings") { + if let Value::Array(all_findings) = findings_val { + let mut primary = Vec::new(); + let mut supplementary = Vec::new(); + + for f in all_findings { + let is_supplementary = f + .get("relevance") + .and_then(Value::as_str) + == Some("supplementary"); + if is_supplementary { + supplementary.push(f); + } else { + primary.push(f); + } + } + + object.insert("findings".to_string(), Value::Array(primary)); + if !supplementary.is_empty() { + object.insert( + "supplementary_findings".to_string(), + Value::Array(supplementary), + ); + } + } + } + object.insert( "contract_version".to_string(), Value::String(JSON_CONTRACT_VERSION.to_string()), @@ -5428,21 +5463,21 @@ fn append_live_fallback_finding(report: &mut RunReport, decision: &LiveFallbackD return; } - report.findings.push(Finding { - title: "Live mode fallback applied after inference failure".to_string(), - severity: FindingSeverity::Info, - confidence: 1.0, - evidence_pointer: EvidencePointer { + report.findings.push(Finding::new( + "Live mode fallback applied after inference failure".to_string(), + FindingSeverity::Info, + 1.0, + EvidencePointer { turn: None, tool: None, field: "live_fallback_decision.live_error".to_string(), }, - recommended_action: format!( + format!( "Review live inference error details and model-pack readiness, then rerun live mode after fixing root cause. 
Fallback reason: {} (code: {}).", decision.reason, decision.reason_code ), - }); + )); } fn classify_live_error_reason_code(live_error: &str) -> &'static str { @@ -5542,7 +5577,7 @@ mod tests { use std::time::{SystemTime, UNIX_EPOCH}; use std::{env, fs}; - use serde_json::json; + use serde_json::{json, Value}; use core_engine::{ AgentTurn, EvidencePointer, Finding, FindingSeverity, LiveFailureReasonCount, @@ -5639,18 +5674,19 @@ mod tests { observation: Some(json!({ "listener_count": 3, "listeners": [] })), }], final_answer: "Dry-run cycle complete.".to_string(), - findings: vec![Finding { - title: "Active listening sockets observed (3)".to_string(), - severity: FindingSeverity::Medium, - confidence: 0.68, - evidence_pointer: EvidencePointer { + findings: vec![Finding::new( + "Active listening sockets observed (3)".to_string(), + FindingSeverity::Medium, + 0.68, + EvidencePointer { turn: Some(1), tool: Some("scan_network".to_string()), field: "observation.listener_count".to_string(), }, - recommended_action: "Correlate listener PIDs and ports with expected services." + "Correlate listener PIDs and ports with expected services." 
.to_string(), - }], + )], + supplementary_findings: Vec::new(), } } @@ -5676,6 +5712,74 @@ mod tests { assert!(!rendered.contains("\"turns\"")); } + #[test] + fn compact_mode_separates_supplementary_findings() { + use core_engine::FindingRelevance; + + let mut report = sample_report(); + let mut supp_finding = Finding::new( + "Generic persistence noise".to_string(), + FindingSeverity::Low, + 0.50, + EvidencePointer { + turn: Some(1), + tool: Some("inspect_persistence_locations".to_string()), + field: "observation.suspicious_entry_count".to_string(), + }, + "Review entries.".to_string(), + ); + supp_finding.relevance = FindingRelevance::Supplementary; + report.findings.push(supp_finding); + + let rendered = render_report(&report, OutputFormat::Json, OutputMode::Compact, None) + .expect("compact render should work"); + + let json: Value = serde_json::from_str(&rendered).unwrap(); + let findings = json["findings"].as_array().unwrap(); + let supplementary = json["supplementary_findings"].as_array().unwrap(); + + assert_eq!(findings.len(), 1, "only primary finding in findings"); + assert_eq!( + supplementary.len(), + 1, + "supplementary finding moved to supplementary_findings" + ); + assert!(supplementary[0]["title"] + .as_str() + .unwrap() + .contains("persistence")); + } + + #[test] + fn full_mode_keeps_all_findings_with_relevance_tags() { + use core_engine::FindingRelevance; + + let mut report = sample_report(); + let mut supp_finding = Finding::new( + "Generic persistence noise".to_string(), + FindingSeverity::Low, + 0.50, + EvidencePointer { + turn: Some(1), + tool: Some("inspect_persistence_locations".to_string()), + field: "observation.suspicious_entry_count".to_string(), + }, + "Review entries.".to_string(), + ); + supp_finding.relevance = FindingRelevance::Supplementary; + report.findings.push(supp_finding); + + let rendered = render_report(&report, OutputFormat::Json, OutputMode::Full, None) + .expect("full render should work"); + + let json: Value = 
serde_json::from_str(&rendered).unwrap(); + let findings = json["findings"].as_array().unwrap(); + + // Full mode keeps all findings in the main array with relevance tags. + assert_eq!(findings.len(), 2); + assert!(rendered.contains("\"relevance\"")); + } + #[test] fn renders_json_output_with_live_metrics() { let mut report = sample_report(); diff --git a/cli/tests/stdin_integration.rs b/cli/tests/stdin_integration.rs index 7342979..cca2567 100644 --- a/cli/tests/stdin_integration.rs +++ b/cli/tests/stdin_integration.rs @@ -1048,20 +1048,26 @@ fn baseline_bundle_import_populates_drift_tool_arguments() { .get("turns") .and_then(Value::as_array) .expect("turns should be an array"); - let first_args = turns - .first() + let audit_turn_args = turns + .iter() + .find(|turn| { + turn.get("tool_call") + .and_then(|call| call.get("tool")) + .and_then(Value::as_str) + == Some("audit_account_changes") + }) .and_then(|turn| turn.get("tool_call")) .and_then(|call| call.get("args")) .and_then(Value::as_object) - .expect("first tool call args should be an object"); + .expect("audit_account_changes tool call args should be present"); - let baseline_accounts = first_args + let baseline_accounts = audit_turn_args .get("baseline_privileged_accounts") .and_then(Value::as_array) .expect("baseline_privileged_accounts should be present"); assert!(baseline_accounts.iter().any(|entry| entry == "svc-admin")); - let approved_accounts = first_args + let approved_accounts = audit_turn_args .get("approved_privileged_accounts") .and_then(Value::as_array) .expect("approved_privileged_accounts should be present"); @@ -1121,14 +1127,20 @@ fn baseline_bundle_import_accepts_raw_file_path_with_spaces() { .get("turns") .and_then(Value::as_array) .expect("turns should be an array"); - let first_args = turns - .first() + let audit_turn_args = turns + .iter() + .find(|turn| { + turn.get("tool_call") + .and_then(|call| call.get("tool")) + .and_then(Value::as_str) + == Some("audit_account_changes") + }) 
.and_then(|turn| turn.get("tool_call")) .and_then(|call| call.get("args")) .and_then(Value::as_object) - .expect("first tool call args should be an object"); + .expect("audit_account_changes tool call args should be present"); - let baseline_accounts = first_args + let baseline_accounts = audit_turn_args .get("baseline_privileged_accounts") .and_then(Value::as_array) .expect("baseline_privileged_accounts should be present"); diff --git a/core_engine/src/agent.rs b/core_engine/src/agent.rs index a38886d..13a186a 100644 --- a/core_engine/src/agent.rs +++ b/core_engine/src/agent.rs @@ -7,8 +7,9 @@ use cyber_tools::ToolRegistry; use inference_bridge::InferenceEngine; use crate::{ - basic_tier_summary, deduplicate_findings, derive_findings, extract_tag, max_severity, - quality_checked_final_answer, sort_findings, AgentTurn, CoverageBaseline, + basic_tier_summary, builtin_investigation_templates, deduplicate_findings, derive_findings, + extract_tag, max_severity, quality_checked_final_answer, sort_findings, AgentTurn, + CoverageBaseline, EvidencePointer, Finding, FindingSeverity, InvestigationTemplate, ModelCapabilityReport, ModelCapabilityTier, RunReport, RunTimingMetrics, ToolCall, }; @@ -109,11 +110,29 @@ impl Agent { pub async fn run(&self, task: &str) -> Result { let run_started_at = Instant::now(); + // Scope validation (#83): detect out-of-scope tasks before tool execution. + if let Some(scope_finding) = check_task_scope(task) { + info!("task is out of scope for host-local tools"); + return Ok(RunReport { + task: task.to_string(), + case_id: None, + max_severity: Some(FindingSeverity::Info), + model_capability: self.model_capability_report.clone(), + live_fallback_decision: None, + run_timing: Some(build_run_timing_metrics(run_started_at, None)), + live_run_metrics: None, + turns: Vec::new(), + final_answer: "Task is outside the scope of available host-local investigation tools. 
No tools were executed.".to_string(), + findings: vec![scope_finding], + supplementary_findings: Vec::new(), + }); + } + // Phase 1: deterministic tool execution — gather evidence. - let tool_plan = investigation_plan(task); + let template = resolve_investigation_template(task); let mut turns = Vec::new(); - for tool_name in tool_plan.iter().take(self.max_steps) { + for tool_name in template.tools.iter().take(self.max_steps) { // #74: skip tools with known-failing preconditions. if !self.check_tool_precondition(tool_name) { debug!(tool = %tool_name, "skipping tool: precondition not met"); @@ -145,6 +164,17 @@ impl Agent { let mut findings = deduplicate_findings(raw_findings); sort_findings(&mut findings); + // Tag finding relevance based on template's primary tool set (#86). + let primary_tools: std::collections::HashSet<&str> = + template.tools.iter().copied().collect(); + for finding in &mut findings { + if let Some(tool) = finding.evidence_pointer.tool.as_deref() { + if !primary_tools.contains(tool) { + finding.relevance = crate::FindingRelevance::Supplementary; + } + } + } + let (final_answer, first_token_latency_ms) = match self.capability_tier { ModelCapabilityTier::Basic => { // Skip LLM entirely; build deterministic summary from findings. @@ -192,6 +222,7 @@ impl Agent { turns, final_answer, findings, + supplementary_findings: Vec::new(), }) } @@ -224,48 +255,109 @@ fn elapsed_ms_since(started_at: Instant) -> u64 { .unwrap_or(u64::MAX) } -/// Select tools to run based on the task description. -/// All tools are cheap local introspection, so we run a broad set by default. -fn investigation_plan(task: &str) -> Vec<&'static str> { +/// Keywords that indicate a task is within scope of host-local investigation tools. 
+const IN_SCOPE_KEYWORDS: &[&str] = &[ + "host", + "account", + "persistence", + "network", + "process", + "privilege", + "ssh", + "listener", + "port", + "autorun", + "hash", + "integrity", + "log", +]; + +/// Keywords that indicate a task targets capabilities outside the local toolset. +const OUT_OF_SCOPE_INDICATORS: &[&str] = &[ + "cloud", "aws", "azure", "gcp", "s3", "iam", "kubernetes", "container", "api", "email", + "phishing", "siem", +]; + +/// Check whether a task is within the scope of available host-local tools. +/// +/// Returns `Some(finding)` if the task is out of scope, `None` if in scope. +pub fn check_task_scope(task: &str) -> Option { let lower = task.to_lowercase(); - let mut plan = Vec::new(); - - // Always start with broad evidence gathering. - plan.push("audit_account_changes"); - plan.push("inspect_persistence_locations"); - plan.push("read_syslog"); - - // Network-related tasks. - if lower.contains("network") - || lower.contains("connection") - || has_word(&lower, "port") - || lower.contains("listen") - || lower.contains("ssh") - || lower.contains("lateral") - || lower.contains("beacon") - { - plan.push("scan_network"); - plan.push("correlate_process_network"); - } - - // Privilege / escalation tasks. - if lower.contains("privilege") - || lower.contains("escalat") - || lower.contains("admin") - || lower.contains("root") - || lower.contains("sudo") - || lower.contains("unauthori") - { - plan.push("check_privilege_escalation_vectors"); - } - - // If the plan is still small (generic task), add more tools. 
- if plan.len() <= 3 { - plan.push("scan_network"); - plan.push("check_privilege_escalation_vectors"); - } - - plan + + let has_in_scope = IN_SCOPE_KEYWORDS + .iter() + .any(|kw| has_word(&lower, kw)); + + if has_in_scope { + return None; + } + + let matched_domains: Vec<&&str> = OUT_OF_SCOPE_INDICATORS + .iter() + .filter(|kw| has_word(&lower, kw)) + .collect(); + + if matched_domains.is_empty() { + return None; + } + + let domain_hint = matched_domains + .iter() + .map(|kw| **kw) + .collect::>() + .join(", "); + + Some(Finding::new( + "Task is outside the scope of available host-local investigation tools".to_string(), + FindingSeverity::Info, + 1.0, + EvidencePointer { + turn: None, + tool: None, + field: "scope_check".to_string(), + }, + format!( + "This task requires capabilities not present in the current toolset. Consider tools for: {domain_hint}." + ), + )) +} + +/// Resolve the best-matching investigation template for a task. +/// +/// Scores each built-in template by counting keyword matches in the task +/// description. Returns the highest scoring template, or the `broad-host-triage` +/// fallback if no template matches. 
+pub fn resolve_investigation_template(task: &str) -> &'static InvestigationTemplate { + let lower = task.to_lowercase(); + let templates = builtin_investigation_templates(); + + let mut best: Option<(&InvestigationTemplate, usize)> = None; + + for template in templates { + if template.match_keywords.is_empty() { + continue; // fallback template, skip scoring + } + + let score = template + .match_keywords + .iter() + .filter(|kw| has_word(&lower, kw)) + .count(); + + if score > 0 { + if let Some((_, best_score)) = best { + if score > best_score { + best = Some((template, score)); + } + } else { + best = Some((template, score)); + } + } + } + + let selected = best.map(|(t, _)| t).unwrap_or(&templates[0]); + info!(template = %selected.name, "investigation template selected"); + selected } /// Check whether `word` appears as a standalone word in `text` (not as a substring of another word). @@ -550,4 +642,119 @@ mod tests { "expected at least one finding from tool observations" ); } + + // ── Investigation template resolution tests (#84) ── + + use super::resolve_investigation_template; + + #[test] + fn resolves_ssh_template_for_ssh_task() { + let template = resolve_investigation_template("Investigate unauthorized SSH keys"); + assert_eq!(template.name, "ssh-key-investigation"); + } + + #[test] + fn resolves_network_template_for_listener_task() { + let template = resolve_investigation_template("Check suspicious listener ports"); + assert_eq!(template.name, "network-exposure-audit"); + } + + #[test] + fn resolves_persistence_template() { + let template = resolve_investigation_template("Analyze autorun persistence entries"); + assert_eq!(template.name, "persistence-analysis"); + } + + #[test] + fn resolves_privilege_escalation_template() { + let template = + resolve_investigation_template("Review local privilege escalation indicators"); + assert_eq!(template.name, "privilege-escalation-check"); + } + + #[test] + fn resolves_file_integrity_template() { + let template = 
resolve_investigation_template("Verify hash integrity of system binaries"); + assert_eq!(template.name, "file-integrity-check"); + } + + #[test] + fn falls_back_to_broad_triage_for_generic_task() { + let template = resolve_investigation_template("Perform a quick triage of this host"); + assert_eq!(template.name, "broad-host-triage"); + } + + #[test] + fn template_resolution_is_case_insensitive() { + let template = resolve_investigation_template("CHECK SSH ACCESS NOW"); + assert_eq!(template.name, "ssh-key-investigation"); + } + + #[test] + fn higher_keyword_count_wins() { + // "network lateral" matches network-exposure-audit with 2 keywords + let template = + resolve_investigation_template("Investigate network lateral movement indicators"); + assert_eq!(template.name, "network-exposure-audit"); + } + + // ── Scope validation tests (#83) ── + + use super::check_task_scope; + + #[test] + fn out_of_scope_cloud_task_returns_finding() { + let finding = check_task_scope("Check if my AWS S3 buckets are misconfigured"); + assert!(finding.is_some()); + let f = finding.unwrap(); + assert_eq!(f.severity, crate::FindingSeverity::Info); + assert!(f.title.contains("outside the scope")); + assert!(f.recommended_action.contains("aws")); + } + + #[test] + fn out_of_scope_kubernetes_task_returns_finding() { + let finding = check_task_scope("Analyze Kubernetes pod security policies"); + assert!(finding.is_some()); + } + + #[test] + fn in_scope_host_task_returns_none() { + let finding = check_task_scope("Investigate unauthorized SSH keys on this host"); + assert!(finding.is_none()); + } + + #[test] + fn in_scope_network_task_returns_none() { + let finding = check_task_scope("Check network listener ports for suspicious activity"); + assert!(finding.is_none()); + } + + #[test] + fn mixed_scope_with_in_scope_keyword_returns_none() { + // Has both "cloud" (out-of-scope) and "host" (in-scope) — in-scope wins + let finding = check_task_scope("Check host logs for cloud credential leaks"); + 
assert!(finding.is_none()); + } + + #[test] + fn generic_task_without_scope_keywords_returns_none() { + // No in-scope AND no out-of-scope keywords → proceed normally + let finding = check_task_scope("Perform a general security assessment"); + assert!(finding.is_none()); + } + + #[tokio::test] + async fn out_of_scope_task_skips_tool_execution() { + let engine = MockEngine::new(vec![]); + let agent = Agent::new(engine, ToolRegistry::with_default_tools()); + let report = agent + .run("Check if my AWS S3 buckets are misconfigured") + .await + .expect("agent run should succeed"); + + assert!(report.turns.is_empty(), "no tools should be executed"); + assert_eq!(report.findings.len(), 1); + assert!(report.findings[0].title.contains("outside the scope")); + } } diff --git a/core_engine/src/lib.rs b/core_engine/src/lib.rs index add0ac8..9aaf25d 100644 --- a/core_engine/src/lib.rs +++ b/core_engine/src/lib.rs @@ -6,6 +6,88 @@ use inference_bridge::ModelCapabilityProbe; use serde::{Deserialize, Serialize, Serializer}; use serde_json::Value; +// ── Investigation templates (#84) ── + +/// A declarative investigation template that maps task keywords to tool sets. +#[derive(Debug, Clone, Serialize)] +pub struct InvestigationTemplate { + pub name: &'static str, + pub description: &'static str, + #[serde(skip)] + pub match_keywords: &'static [&'static str], + pub tools: &'static [&'static str], +} + +/// All built-in investigation templates. 
+pub fn builtin_investigation_templates() -> &'static [InvestigationTemplate] { + &BUILTIN_TEMPLATES +} + +static BUILTIN_TEMPLATES: [InvestigationTemplate; 6] = [ + InvestigationTemplate { + name: "broad-host-triage", + description: "General-purpose host investigation covering persistence, accounts, network, and privilege vectors", + match_keywords: &[], + tools: &[ + "audit_account_changes", + "inspect_persistence_locations", + "read_syslog", + "scan_network", + "check_privilege_escalation_vectors", + ], + }, + InvestigationTemplate { + name: "ssh-key-investigation", + description: "Investigate unauthorized SSH keys and related access", + match_keywords: &["ssh", "authorized_keys", "key"], + tools: &[ + "audit_account_changes", + "inspect_persistence_locations", + "check_privilege_escalation_vectors", + "scan_network", + ], + }, + InvestigationTemplate { + name: "persistence-analysis", + description: "Analyze persistence mechanisms including autoruns and scheduled tasks", + match_keywords: &["persistence", "autorun", "startup", "cron", "scheduled"], + tools: &[ + "inspect_persistence_locations", + "audit_account_changes", + "read_syslog", + ], + }, + InvestigationTemplate { + name: "network-exposure-audit", + description: "Audit network listeners, exposed services, and lateral movement indicators", + match_keywords: &["network", "connection", "port", "listen", "listener", "lateral", "beacon", "socket"], + tools: &[ + "scan_network", + "correlate_process_network", + "audit_account_changes", + ], + }, + InvestigationTemplate { + name: "privilege-escalation-check", + description: "Review local privilege escalation vectors and unauthorized account grants", + match_keywords: &["privilege", "escalat", "admin", "root", "sudo", "unauthori"], + tools: &[ + "check_privilege_escalation_vectors", + "audit_account_changes", + "inspect_persistence_locations", + ], + }, + InvestigationTemplate { + name: "file-integrity-check", + description: "Verify file hashes and detect 
tampering of critical binaries", + match_keywords: &["hash", "integrity", "checksum", "binary", "tamper"], + tools: &[ + "audit_account_changes", + "inspect_persistence_locations", + ], + }, +]; + // ── Capability tiering thresholds (const, easy to tune) ── /// Models below this parameter count (billions) are classified as Basic. @@ -140,16 +222,100 @@ fn serialize_confidence(value: &f32, serializer: S) -> Result &'static str { + match self { + Self::Informational => "informational", + Self::Possible => "possible", + Self::Likely => "likely", + Self::Confirmed => "confirmed", + } + } +} + +impl std::fmt::Display for FindingConfidence { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.token()) + } +} + +/// Map a continuous confidence float to a discrete label. +pub fn confidence_to_label(confidence: f32) -> FindingConfidence { + if confidence >= 0.90 { + FindingConfidence::Confirmed + } else if confidence >= 0.72 { + FindingConfidence::Likely + } else if confidence >= 0.55 { + FindingConfidence::Possible + } else { + FindingConfidence::Informational + } +} + +/// Relevance of a finding relative to the user's task (#86). +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "lowercase")] +pub enum FindingRelevance { + #[default] + Primary, + Supplementary, +} + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct Finding { pub title: String, pub severity: FindingSeverity, #[serde(serialize_with = "serialize_confidence")] pub confidence: f32, + #[serde(default = "default_confidence_label")] + pub confidence_label: FindingConfidence, + #[serde(default)] + pub relevance: FindingRelevance, pub evidence_pointer: EvidencePointer, pub recommended_action: String, } +fn default_confidence_label() -> FindingConfidence { + FindingConfidence::Possible +} + +impl Finding { + /// Create a new finding, auto-deriving confidence_label from the float value. 
+ pub fn new( + title: String, + severity: FindingSeverity, + confidence: f32, + evidence_pointer: EvidencePointer, + recommended_action: String, + ) -> Self { + Self { + title, + severity, + confidence_label: confidence_to_label(confidence), + relevance: FindingRelevance::Primary, + confidence, + evidence_pointer, + recommended_action, + } + } + + /// Backfill confidence_label from the confidence float. + pub fn with_derived_label(mut self) -> Self { + self.confidence_label = confidence_to_label(self.confidence); + self + } +} + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct LiveFallbackDecision { pub policy: String, @@ -206,6 +372,8 @@ pub struct RunReport { pub final_answer: String, #[serde(default)] pub findings: Vec, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub supplementary_findings: Vec, } pub fn derive_findings(turns: &[AgentTurn], final_answer: &str) -> Vec { @@ -220,19 +388,19 @@ pub fn derive_findings(turns: &[AgentTurn], final_answer: &str) -> Vec if let Some(error) = observation.get("error").and_then(Value::as_str) { let tool_label = tool_name.as_deref().unwrap_or("unknown_tool"); - findings.push(Finding { - title: format!("Tool execution failed for {tool_label}"), - severity: FindingSeverity::High, - confidence: 0.95, - evidence_pointer: EvidencePointer { + findings.push(Finding::new( + format!("Tool execution failed for {tool_label}"), + FindingSeverity::High, + 0.95, + EvidencePointer { turn: Some(idx + 1), tool: tool_name.clone(), field: "observation.error".to_string(), }, - recommended_action: format!( + format!( "Review tool arguments and host access policy, then rerun {tool_label}. 
Error sample: {error}" ), - }); + )); continue; } @@ -244,19 +412,19 @@ pub fn derive_findings(turns: &[AgentTurn], final_answer: &str) -> Vec FindingSeverity::Medium }; - findings.push(Finding { - title: format!( + findings.push(Finding::new( + format!( "Privilege escalation indicators detected ({indicator_count})" ), severity, - confidence: confidence_from_count(0.68, indicator_count, 0.06, 0.96), - evidence_pointer: EvidencePointer { + confidence_from_count(0.68, indicator_count, 0.06, 0.96), + EvidencePointer { turn: Some(idx + 1), tool: tool_name.clone(), field: "observation.indicator_count".to_string(), }, - recommended_action: "Review potential_vectors and verify whether elevated rights are expected; revoke or constrain unexpected grants.".to_string(), - }); + "Review potential_vectors and verify whether elevated rights are expected; revoke or constrain unexpected grants.".to_string(), + )); } } @@ -270,17 +438,17 @@ pub fn derive_findings(turns: &[AgentTurn], final_answer: &str) -> Vec FindingSeverity::Low }; - findings.push(Finding { - title: format!("Active listening sockets observed ({listener_count})"), + findings.push(Finding::new( + format!("Active listening sockets observed ({listener_count})"), severity, - confidence: confidence_from_count(0.62, listener_count, 0.02, 0.92), - evidence_pointer: EvidencePointer { + confidence_from_count(0.62, listener_count, 0.02, 0.92), + EvidencePointer { turn: Some(idx + 1), tool: tool_name.clone(), field: "observation.listener_count".to_string(), }, - recommended_action: "Correlate listener PIDs and ports with expected services; investigate unknown listeners and expose only required interfaces.".to_string(), - }); + "Correlate listener PIDs and ports with expected services; investigate unknown listeners and expose only required interfaces.".to_string(), + )); } } @@ -299,21 +467,21 @@ pub fn derive_findings(turns: &[AgentTurn], final_answer: &str) -> Vec .and_then(Value::as_u64) .unwrap_or(0); - 
findings.push(Finding { - title: format!( + findings.push(Finding::new( + format!( "Coverage baseline captured ({baseline_entries_count} persistence entries, {baseline_privileged_account_count} privileged accounts, {baseline_exposed_binding_count} exposed bindings)" ), - severity: FindingSeverity::Info, - confidence: 0.9, - evidence_pointer: EvidencePointer { + FindingSeverity::Info, + 0.9, + EvidencePointer { turn: Some(idx + 1), tool: tool_name.clone(), field: "observation.baseline_version".to_string(), }, - recommended_action: format!( + format!( "Store baseline arrays from this {baseline_version} snapshot and supply them to coverage tools in subsequent runs to detect drift." ), - }); + )); } let actionable_persistence_count = observation @@ -353,17 +521,17 @@ pub fn derive_findings(turns: &[AgentTurn], final_answer: &str) -> Vec ) }; - findings.push(Finding { + findings.push(Finding::new( title, severity, - confidence: confidence_from_count(0.7, persistence_count, 0.05, 0.95), - evidence_pointer: EvidencePointer { + confidence_from_count(0.7, persistence_count, 0.05, 0.95), + EvidencePointer { turn: Some(idx + 1), tool: tool_name.clone(), field: evidence_field.to_string(), }, - recommended_action: "Review persistence entries for unauthorized startup references, remove unapproved autoruns, and preserve forensic artifacts before cleanup.".to_string(), - }); + "Review persistence entries for unauthorized startup references, remove unapproved autoruns, and preserve forensic artifacts before cleanup.".to_string(), + )); } if let Some(baseline_new_count) = observation @@ -377,19 +545,19 @@ pub fn derive_findings(turns: &[AgentTurn], final_answer: &str) -> Vec FindingSeverity::Medium }; - findings.push(Finding { - title: format!( + findings.push(Finding::new( + format!( "Persistence baseline drift detected ({baseline_new_count} new entries)" ), severity, - confidence: confidence_from_count(0.69, baseline_new_count, 0.04, 0.94), - evidence_pointer: EvidencePointer { + 
confidence_from_count(0.69, baseline_new_count, 0.04, 0.94), + EvidencePointer { turn: Some(idx + 1), tool: tool_name.clone(), field: "observation.baseline_new_count".to_string(), }, - recommended_action: "Compare baseline_new_entries against approved software changes and investigate unexpected startup additions for persistence abuse.".to_string(), - }); + "Compare baseline_new_entries against approved software changes and investigate unexpected startup additions for persistence abuse.".to_string(), + )); } } @@ -398,24 +566,24 @@ pub fn derive_findings(turns: &[AgentTurn], final_answer: &str) -> Vec .and_then(Value::as_u64) { if non_default_privileged_account_count > 0 { - findings.push(Finding { - title: format!( + findings.push(Finding::new( + format!( "Non-default privileged accounts observed ({non_default_privileged_account_count})" ), - severity: FindingSeverity::High, - confidence: confidence_from_count( + FindingSeverity::High, + confidence_from_count( 0.74, non_default_privileged_account_count, 0.04, 0.96, ), - evidence_pointer: EvidencePointer { + EvidencePointer { turn: Some(idx + 1), tool: tool_name.clone(), field: "observation.non_default_privileged_account_count".to_string(), }, - recommended_action: "Validate each non-default privileged account against approved access records; revoke unauthorized role grants and rotate exposed credentials.".to_string(), - }); + "Validate each non-default privileged account against approved access records; revoke unauthorized role grants and rotate exposed credentials.".to_string(), + )); } } @@ -424,28 +592,28 @@ pub fn derive_findings(turns: &[AgentTurn], final_answer: &str) -> Vec .and_then(Value::as_u64) { if newly_privileged_account_count > 0 { - findings.push(Finding { - title: format!( + findings.push(Finding::new( + format!( "Privileged account baseline drift detected ({newly_privileged_account_count} new account(s))" ), - severity: if newly_privileged_account_count >= 3 { + if newly_privileged_account_count 
>= 3 { FindingSeverity::Critical } else { FindingSeverity::High }, - confidence: confidence_from_count( + confidence_from_count( 0.78, newly_privileged_account_count, 0.04, 0.97, ), - evidence_pointer: EvidencePointer { + EvidencePointer { turn: Some(idx + 1), tool: tool_name.clone(), field: "observation.newly_privileged_account_count".to_string(), }, - recommended_action: "Validate each newly privileged account against approved access changes, disable unauthorized grants, and rotate impacted credentials.".to_string(), - }); + "Validate each newly privileged account against approved access changes, disable unauthorized grants, and rotate impacted credentials.".to_string(), + )); } } @@ -454,24 +622,24 @@ pub fn derive_findings(turns: &[AgentTurn], final_answer: &str) -> Vec .and_then(Value::as_u64) { if unapproved_privileged_account_count > 0 { - findings.push(Finding { - title: format!( + findings.push(Finding::new( + format!( "Unapproved privileged accounts detected ({unapproved_privileged_account_count})" ), - severity: FindingSeverity::Critical, - confidence: confidence_from_count( + FindingSeverity::Critical, + confidence_from_count( 0.8, unapproved_privileged_account_count, 0.04, 0.98, ), - evidence_pointer: EvidencePointer { + EvidencePointer { turn: Some(idx + 1), tool: tool_name.clone(), field: "observation.unapproved_privileged_account_count".to_string(), }, - recommended_action: "Escalate immediately: remove unapproved privileged memberships, confirm identity ownership, and collect IAM audit evidence.".to_string(), - }); + "Escalate immediately: remove unapproved privileged memberships, confirm identity ownership, and collect IAM audit evidence.".to_string(), + )); } } @@ -486,19 +654,19 @@ pub fn derive_findings(turns: &[AgentTurn], final_answer: &str) -> Vec FindingSeverity::Medium }; - findings.push(Finding { - title: format!( + findings.push(Finding::new( + format!( "Externally exposed listening endpoints observed ({externally_exposed_count})" ), 
severity, - confidence: confidence_from_count(0.66, externally_exposed_count, 0.03, 0.93), - evidence_pointer: EvidencePointer { + confidence_from_count(0.66, externally_exposed_count, 0.03, 0.93), + EvidencePointer { turn: Some(idx + 1), tool: tool_name.clone(), field: "observation.externally_exposed_count".to_string(), }, - recommended_action: "Confirm process ownership and necessity of exposed listeners; close or firewall unnecessary bindings and monitor for reappearance.".to_string(), - }); + "Confirm process ownership and necessity of exposed listeners; close or firewall unnecessary bindings and monitor for reappearance.".to_string(), + )); } } @@ -507,23 +675,23 @@ pub fn derive_findings(turns: &[AgentTurn], final_answer: &str) -> Vec .and_then(Value::as_u64) { if high_risk_exposed_count > 0 { - findings.push(Finding { - title: format!( + findings.push(Finding::new( + format!( "High-risk process listeners exposed externally ({high_risk_exposed_count})" ), - severity: if high_risk_exposed_count >= 2 { + if high_risk_exposed_count >= 2 { FindingSeverity::Critical } else { FindingSeverity::High }, - confidence: confidence_from_count(0.76, high_risk_exposed_count, 0.04, 0.97), - evidence_pointer: EvidencePointer { + confidence_from_count(0.76, high_risk_exposed_count, 0.04, 0.97), + EvidencePointer { turn: Some(idx + 1), tool: tool_name.clone(), field: "observation.high_risk_exposed_count".to_string(), }, - recommended_action: "Prioritize containment for high-risk exposed processes, validate command-line lineage, and restrict inbound access immediately.".to_string(), - }); + "Prioritize containment for high-risk exposed processes, validate command-line lineage, and restrict inbound access immediately.".to_string(), + )); } } @@ -532,24 +700,24 @@ pub fn derive_findings(turns: &[AgentTurn], final_answer: &str) -> Vec .and_then(Value::as_u64) { if unknown_exposed_process_count > 0 { - findings.push(Finding { - title: format!( + findings.push(Finding::new( + 
format!( "Unexpected exposed processes relative to expected allowlist ({unknown_exposed_process_count})" ), - severity: FindingSeverity::High, - confidence: confidence_from_count( + FindingSeverity::High, + confidence_from_count( 0.72, unknown_exposed_process_count, 0.04, 0.95, ), - evidence_pointer: EvidencePointer { + EvidencePointer { turn: Some(idx + 1), tool: tool_name.clone(), field: "observation.unknown_exposed_process_count".to_string(), }, - recommended_action: "Reconcile unknown exposed processes against approved service inventory and close unapproved listeners through host firewall or service disablement.".to_string(), - }); + "Reconcile unknown exposed processes against approved service inventory and close unapproved listeners through host firewall or service disablement.".to_string(), + )); } } @@ -564,19 +732,19 @@ pub fn derive_findings(turns: &[AgentTurn], final_answer: &str) -> Vec FindingSeverity::High }; - findings.push(Finding { - title: format!( + findings.push(Finding::new( + format!( "Network exposure risk score exceeded threshold ({network_risk_score})" ), severity, - confidence: confidence_from_count(0.7, network_risk_score, 0.002, 0.96), - evidence_pointer: EvidencePointer { + confidence_from_count(0.7, network_risk_score, 0.002, 0.96), + EvidencePointer { turn: Some(idx + 1), tool: tool_name.clone(), field: "observation.network_risk_score".to_string(), }, - recommended_action: "Escalate to incident triage: prioritize exposed services with highest risk contribution and verify baseline drift across process-network bindings.".to_string(), - }); + "Escalate to incident triage: prioritize exposed services with highest risk contribution and verify baseline drift across process-network bindings.".to_string(), + )); } } @@ -584,17 +752,17 @@ pub fn derive_findings(turns: &[AgentTurn], final_answer: &str) -> Vec observation.get("path").and_then(Value::as_str), observation.get("sha256").and_then(Value::as_str), ) { - findings.push(Finding { - 
title: format!("File hash captured for {path}"), - severity: FindingSeverity::Info, - confidence: 0.90, - evidence_pointer: EvidencePointer { + findings.push(Finding::new( + format!("File hash captured for {path}"), + FindingSeverity::Info, + 0.90, + EvidencePointer { turn: Some(idx + 1), tool: tool_name.clone(), field: "observation.sha256".to_string(), }, - recommended_action: "Compare the hash against trusted baseline or threat-intel sources before taking containment action.".to_string(), - }); + "Compare the hash against trusted baseline or threat-intel sources before taking containment action.".to_string(), + )); } if let Some(lines) = observation.get("lines").and_then(Value::as_array) { @@ -611,43 +779,44 @@ pub fn derive_findings(turns: &[AgentTurn], final_answer: &str) -> Vec FindingSeverity::Medium }; - findings.push(Finding { - title: format!( + findings.push(Finding::new( + format!( "Suspicious log keywords observed in {suspicious_hits} line(s)" ), severity, - confidence: confidence_from_count(0.64, suspicious_hits as u64, 0.05, 0.93), - evidence_pointer: EvidencePointer { + confidence_from_count(0.64, suspicious_hits as u64, 0.05, 0.93), + EvidencePointer { turn: Some(idx + 1), tool: tool_name.clone(), field: "observation.lines".to_string(), }, - recommended_action: "Inspect matching log lines for account abuse or execution anomalies, then pivot to host and identity telemetry.".to_string(), - }); + "Inspect matching log lines for account abuse or execution anomalies, then pivot to host and identity telemetry.".to_string(), + )); } } } if findings.is_empty() { - findings.push(Finding { - title: "No high-confidence host findings derived from collected evidence".to_string(), - severity: FindingSeverity::Info, - confidence: 0.55, - evidence_pointer: EvidencePointer { + findings.push(Finding::new( + "No high-confidence host findings derived from collected evidence".to_string(), + FindingSeverity::Info, + 0.55, + EvidencePointer { turn: None, tool: None, 
field: "final_answer".to_string(), }, - recommended_action: if final_answer.trim().is_empty() { + if final_answer.trim().is_empty() { "Review raw observations and rerun targeted task templates for deeper coverage." .to_string() } else { "Review the final answer and raw observations; rerun targeted task templates if analyst confidence is low.".to_string() }, - }); + )); } - findings + // Backfill confidence labels from float values (#85). + findings.into_iter().map(|f| f.with_derived_label()).collect() } /// Tool authority ranking — higher means this tool's findings should be preferred @@ -1074,17 +1243,17 @@ mod tests { tool: &str, field: &str, ) -> Finding { - Finding { - title: title.to_string(), + Finding::new( + title.to_string(), severity, confidence, - evidence_pointer: EvidencePointer { + EvidencePointer { turn: Some(1), tool: Some(tool.to_string()), field: field.to_string(), }, - recommended_action: "Investigate further.".to_string(), - } + "Investigate further.".to_string(), + ) } #[test] @@ -1390,6 +1559,97 @@ mod tests { assert_eq!(a, b); } + // ── Discrete confidence label tests (#85) ── + + use super::{confidence_to_label, FindingConfidence, FindingRelevance}; + + #[test] + fn confidence_label_confirmed_threshold() { + assert_eq!(confidence_to_label(0.90), FindingConfidence::Confirmed); + assert_eq!(confidence_to_label(0.99), FindingConfidence::Confirmed); + assert_eq!(confidence_to_label(1.0), FindingConfidence::Confirmed); + } + + #[test] + fn confidence_label_likely_threshold() { + assert_eq!(confidence_to_label(0.72), FindingConfidence::Likely); + assert_eq!(confidence_to_label(0.89), FindingConfidence::Likely); + } + + #[test] + fn confidence_label_possible_threshold() { + assert_eq!(confidence_to_label(0.55), FindingConfidence::Possible); + assert_eq!(confidence_to_label(0.71), FindingConfidence::Possible); + } + + #[test] + fn confidence_label_informational_threshold() { + assert_eq!(confidence_to_label(0.54), FindingConfidence::Informational); 
+ assert_eq!(confidence_to_label(0.10), FindingConfidence::Informational); + assert_eq!(confidence_to_label(0.0), FindingConfidence::Informational); + } + + #[test] + fn finding_new_auto_derives_confidence_label() { + let finding = make_finding("Test", FindingSeverity::High, 0.92, "scan_network", "a"); + assert_eq!(finding.confidence_label, FindingConfidence::Confirmed); + + let finding = make_finding("Test", FindingSeverity::Low, 0.60, "scan_network", "b"); + assert_eq!(finding.confidence_label, FindingConfidence::Possible); + } + + #[test] + fn finding_confidence_label_ordering() { + assert!(FindingConfidence::Confirmed > FindingConfidence::Likely); + assert!(FindingConfidence::Likely > FindingConfidence::Possible); + assert!(FindingConfidence::Possible > FindingConfidence::Informational); + } + + #[test] + fn finding_confidence_serializes_lowercase() { + let json = serde_json::to_string(&FindingConfidence::Confirmed).unwrap(); + assert_eq!(json, "\"confirmed\""); + let json = serde_json::to_string(&FindingConfidence::Informational).unwrap(); + assert_eq!(json, "\"informational\""); + } + + #[test] + fn finding_relevance_default_is_primary() { + assert_eq!(FindingRelevance::default(), FindingRelevance::Primary); + } + + #[test] + fn finding_confidence_label_in_json_output() { + let finding = make_finding("Test", FindingSeverity::High, 0.95, "scan_network", "a"); + let json = serde_json::to_string(&finding).unwrap(); + assert!(json.contains("\"confidence_label\":\"confirmed\"")); + assert!(json.contains("\"relevance\":\"primary\"")); + } + + #[test] + fn derive_findings_backfills_confidence_labels() { + let turns = vec![AgentTurn { + thought: "{...}".to_string(), + tool_call: Some(ToolCall { + tool: "check_privilege_escalation_vectors".to_string(), + args: json!({}), + }), + observation: Some(json!({ + "indicator_count": 2, + })), + }]; + + let findings = derive_findings(&turns, ""); + for finding in &findings { + assert_eq!( + finding.confidence_label, + 
confidence_to_label(finding.confidence), + "confidence_label should match float for '{}'", + finding.title + ); + } + } + // ── ModelCapabilityReport tests (#80) ── #[test] diff --git a/docs/USAGE_EXAMPLES.md b/docs/USAGE_EXAMPLES.md index 76299c7..bd8909d 100644 --- a/docs/USAGE_EXAMPLES.md +++ b/docs/USAGE_EXAMPLES.md @@ -335,6 +335,39 @@ export WRAITHRUN_COMMAND_DENYLIST="bash,sh,python,curl,wget" - `Review local privilege escalation indicators` - `Read and summarize last 200 lines from C:/Logs/agent.log` +## Investigation Templates and Scope Validation + +The agent resolves a declarative investigation template based on task keywords. Templates determine tool selection and execution order. + +List investigation templates: + +```powershell +.\wraithrun.exe --list-task-templates +``` + +Tasks outside supported scope (cloud, Kubernetes, email, SIEM) return an informational scoping finding: + +```powershell +.\wraithrun.exe --task "Check our AWS S3 bucket permissions" +# Returns informational finding: task is outside host-level investigation scope +``` + +## Finding Confidence Labels and Relevance + +Findings include a discrete `confidence_label` derived from the numeric score: + +```powershell +.\wraithrun.exe --task "Investigate unauthorized SSH keys" --output-mode full +# Each finding includes: "confidence_label": "confirmed", "relevance": "primary" +``` + +In compact mode (default), supplementary findings from non-primary tools are separated: + +```powershell +.\wraithrun.exe --task "Investigate unauthorized SSH keys" +# JSON contains "findings": [...] and "supplementary_findings": [...] 
+``` + ## Troubleshooting Quick Checks - `Vitis inference is disabled`: diff --git a/docs/automation-contracts.md b/docs/automation-contracts.md index 3bf55e7..bd44f1f 100644 --- a/docs/automation-contracts.md +++ b/docs/automation-contracts.md @@ -16,6 +16,8 @@ Live-mode note: - Run report includes optional `run_timing` and `live_run_metrics` for latency and reliability telemetry. - Findings adapter summary includes optional `live_run_metrics` for downstream alerting and scoring. - Doctor introspection checks now include an optional `remediation` field with actionable fix guidance for each `reason_code`. +- Run report findings now include `confidence_label` (discrete tier) and `relevance` (primary/supplementary) fields. +- Run report includes an optional `supplementary_findings` array for lower-relevance findings (compact output mode). ## Schema Files diff --git a/docs/automation-workflows.md b/docs/automation-workflows.md index 32ca16d..58d5a3a 100644 --- a/docs/automation-workflows.md +++ b/docs/automation-workflows.md @@ -63,6 +63,8 @@ Example adapter payload fields for forwarding: - `summary.highest_severity` - `findings[].finding_id` - `findings[].severity` +- `findings[].confidence_label` (discrete tier: `informational`, `possible`, `likely`, `confirmed`) +- `findings[].relevance` (`primary` or `supplementary`) - `findings[].recommended_action` - `findings[].evidence_pointer` - `summary.live_fallback_decision` (when fallback is triggered) diff --git a/docs/cli-reference.md b/docs/cli-reference.md index 0841f57..aaa20f2 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -423,9 +423,13 @@ When `--baseline-bundle` is set, the runtime imports the latest `capture_coverag - `title`: concise finding summary. - `severity`: one of `info`, `low`, `medium`, `high`, `critical`. - `confidence`: numeric confidence score (`0.00` to `1.00`). +- `confidence_label`: discrete confidence tier derived from the numeric score. 
One of `informational` (< 0.55), `possible` (≥ 0.55 and < 0.72), `likely` (≥ 0.72 and < 0.90), `confirmed` (≥ 0.90). +- `relevance`: finding relevance to the resolved investigation template. One of `primary` (from template-selected tools) or `supplementary` (from non-primary tools). Default: `primary`. - `evidence_pointer`: pointer back to supporting evidence. - `recommended_action`: analyst-facing next action. +In compact output mode, supplementary findings are separated into a `supplementary_findings` array. In full mode, all findings remain in the main `findings` array with their `relevance` tag. + `evidence_pointer` fields: - `turn`: 1-based turn index when evidence comes from a tool observation (`null` when sourced from `final_answer`). @@ -448,6 +452,25 @@ Template parameter support: - `syslog-summary`: supports `--template-target` and `--template-lines`. - `ssh-keys`, `listener-risk`, `priv-esc-review`: no template parameters. +## Investigation Templates + +When a free-text `--task` is provided, the agent resolves a declarative investigation template by scoring keywords in the task description. The matched template determines which tools run and in what order. + +Built-in investigation templates: + +- **broad-host-triage**: default fallback. Runs all host-level tools. +- **ssh-key-investigation**: SSH key and account audit focus. +- **persistence-analysis**: autorun and persistence mechanism checks. +- **network-exposure-audit**: listener and network binding analysis. +- **privilege-escalation-check**: privilege escalation indicator checks. +- **file-integrity-check**: hash verification and file integrity analysis. + +List investigation templates via `--list-task-templates`. + +## Task Scope Validation + +The agent validates that the task description falls within its supported scope (host-level cyber investigation).
Tasks that reference out-of-scope domains (cloud infrastructure, container orchestration, email/phishing, SIEM) return an informational finding explaining the scope boundary instead of running the investigation. + ## Built-In Profiles - `local-lab`: dry-run, compact step/token budget, summary output. diff --git a/docs/getting-started.md b/docs/getting-started.md index 4df23c2..21e5b8f 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -155,7 +155,8 @@ WraithRun prints a JSON report with: - task: your original request. - max_severity: highest severity level across all findings (when findings are present). - model_capability: capability tier, estimated parameters, execution provider, latency, and vocab size (live mode). -- findings: normalized actionable findings (deduplicated, sorted by severity). +- findings: normalized actionable findings (deduplicated, sorted by severity). Each finding includes a discrete `confidence_label` and `relevance` tag. +- supplementary_findings: lower-relevance findings from non-primary tools (compact mode only). - run_timing: optional latency fields (`first_token_latency_ms`, `total_run_duration_ms`). - live_run_metrics: optional live reliability/latency fields for live-mode runs. - turns: intermediate reasoning and tool observations (included when `--output-mode full` is used). 
diff --git a/docs/schemas/examples/run-report.example.json b/docs/schemas/examples/run-report.example.json index fb82f1d..7c43514 100644 --- a/docs/schemas/examples/run-report.example.json +++ b/docs/schemas/examples/run-report.example.json @@ -56,6 +56,8 @@ "title": "Privilege escalation indicators detected (1)", "severity": "medium", "confidence": 0.74, + "confidence_label": "likely", + "relevance": "primary", "evidence_pointer": { "turn": 1, "tool": "check_privilege_escalation_vectors", @@ -67,6 +69,8 @@ "title": "Live mode fallback applied after inference failure", "severity": "info", "confidence": 1.0, + "confidence_label": "confirmed", + "relevance": "primary", "evidence_pointer": { "turn": null, "tool": null, diff --git a/docs/schemas/run-report.schema.json b/docs/schemas/run-report.schema.json index f751e0a..ed1cc88 100644 --- a/docs/schemas/run-report.schema.json +++ b/docs/schemas/run-report.schema.json @@ -231,6 +231,15 @@ "minimum": 0.0, "maximum": 1.0 }, + "confidence_label": { + "type": "string", + "enum": ["informational", "possible", "likely", "confirmed"] + }, + "relevance": { + "type": "string", + "enum": ["primary", "supplementary"], + "default": "primary" + }, "evidence_pointer": { "type": "object", "required": ["turn", "tool", "field"], @@ -254,6 +263,12 @@ }, "additionalProperties": false } + }, + "supplementary_findings": { + "type": "array", + "items": { + "$ref": "#/properties/findings/items" + } } }, "additionalProperties": false diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index b02d52d..71115c6 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -178,4 +178,26 @@ Symptom: Fix: - This happens when the model is classified as Basic tier (deterministic summary) or when LLM output quality is detected as low. 
+ +## Task returned a scope-boundary finding instead of running + +Symptom: + +- The agent returns a single informational finding about the task being outside host-level investigation scope, without executing any tools. + +Fix: + +- WraithRun validates that tasks fall within its supported domain (host-level cyber investigation). Tasks referencing cloud infrastructure (AWS, Azure, GCP), container orchestration (Kubernetes), email/phishing, or SIEM are rejected. +- Rephrase your task to focus on host-level analysis: accounts, processes, persistence, network listeners, file integrity, or logs. + +## Some findings appear in supplementary_findings instead of findings + +Symptom: + +- In compact JSON output, some findings are in a `supplementary_findings` array instead of the main `findings` array. + +Fix: + +- This is expected. Since v0.13.0, the agent tags findings by relevance to the resolved investigation template. Findings from non-primary tools are classified as `supplementary` and separated in compact mode. +- Use `--output-mode full` to keep all findings in the main `findings` array with their `relevance` tag. - If your model is capable, use `--capability-override moderate` or `--capability-override strong` to force LLM synthesis. diff --git a/docs/upgrades.md b/docs/upgrades.md index 83ce7da..8a35a37 100644 --- a/docs/upgrades.md +++ b/docs/upgrades.md @@ -1,5 +1,45 @@ # Upgrade Notes +## v0.13.0 + +### Breaking/visible changes + +- Findings now include a `confidence_label` field (one of `informational`, `possible`, `likely`, `confirmed`) derived from the numeric `confidence` score. Existing `confidence` field is unchanged. +- Findings now include a `relevance` field (`primary` or `supplementary`) indicating whether the finding came from a template-selected tool. Default: `primary`. +- In compact output mode, supplementary findings are separated into a new `supplementary_findings` array. Full mode keeps all findings in the main array with relevance tags. 
+- Free-text tasks are now matched against declarative investigation templates that determine tool selection order. Previously, tool selection used a hardcoded keyword mapping. +- Tasks referencing out-of-scope domains (cloud, Kubernetes, email/phishing, SIEM) now return an informational scope-boundary finding instead of running the investigation. + +### Migration examples + +The `confidence_label` and `relevance` fields are additive — existing parsers that ignore unknown fields are unaffected: + +```json +{ + "title": "Unauthorized SSH key detected", + "severity": "high", + "confidence": 0.92, + "confidence_label": "confirmed", + "relevance": "primary" +} +``` + +If your pipeline consumes compact JSON and filters on `findings[]`, check for a new `supplementary_findings` array containing lower-relevance findings: + +```json +{ + "findings": [ ... ], + "supplementary_findings": [ ... ] +} +``` + +Confidence label thresholds: + +- `confirmed`: score ≥ 0.90 +- `likely`: 0.72 ≤ score < 0.90 +- `possible`: 0.55 ≤ score < 0.72 +- `informational`: score < 0.55 + ## v0.12.0 ### Breaking/visible changes