From b7ceb232b542896b845e9a78631cd17e32bacb4c Mon Sep 17 00:00:00 2001 From: Steven Enamakel Date: Sun, 12 Apr 2026 14:06:15 -0700 Subject: [PATCH 1/3] feat: add native computer control tools for keyboard and mouse - Introduced `ComputerControlConfig` to manage mouse and keyboard tool activation. - Implemented `KeyboardTool` and `MouseTool` for native input control using platform-native APIs. - Updated configuration schema to include computer control settings. - Enhanced tool registration logic to conditionally include mouse and keyboard tools based on user configuration. - Added comprehensive documentation and tests for new functionalities, ensuring robust integration and usability. --- src/openhuman/config/schema/mod.rs | 5 +- src/openhuman/config/schema/tools.rs | 16 + src/openhuman/config/schema/types.rs | 4 + src/openhuman/tools/impl/computer/keyboard.rs | 475 ++++++++++++++++++ src/openhuman/tools/impl/computer/mod.rs | 5 + src/openhuman/tools/impl/computer/mouse.rs | 401 +++++++++++++++ src/openhuman/tools/impl/mod.rs | 2 + src/openhuman/tools/ops.rs | 7 + 8 files changed, 913 insertions(+), 2 deletions(-) create mode 100644 src/openhuman/tools/impl/computer/keyboard.rs create mode 100644 src/openhuman/tools/impl/computer/mod.rs create mode 100644 src/openhuman/tools/impl/computer/mouse.rs diff --git a/src/openhuman/config/schema/mod.rs b/src/openhuman/config/schema/mod.rs index 624966e47..66610f133 100644 --- a/src/openhuman/config/schema/mod.rs +++ b/src/openhuman/config/schema/mod.rs @@ -59,8 +59,9 @@ pub use storage_memory::{ MemoryConfig, StorageConfig, StorageProviderConfig, StorageProviderSection, }; pub use tools::{ - BrowserComputerUseConfig, BrowserConfig, ComposioConfig, HttpRequestConfig, IntegrationToggle, - IntegrationsConfig, MultimodalConfig, SecretsConfig, WebSearchConfig, + BrowserComputerUseConfig, BrowserConfig, ComposioConfig, ComputerControlConfig, + HttpRequestConfig, IntegrationToggle, IntegrationsConfig, MultimodalConfig, 
SecretsConfig, + WebSearchConfig, }; pub use update::UpdateConfig; mod voice_server; diff --git a/src/openhuman/config/schema/tools.rs b/src/openhuman/config/schema/tools.rs index 34139c94c..30c156669 100644 --- a/src/openhuman/config/schema/tools.rs +++ b/src/openhuman/config/schema/tools.rs @@ -237,6 +237,22 @@ impl Default for SecretsConfig { } } +// ── Native computer control (mouse + keyboard) ───────────────────── + +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +pub struct ComputerControlConfig { + /// Master toggle for mouse and keyboard tools. Disabled by default — + /// the user must explicitly opt in. + #[serde(default)] + pub enabled: bool, +} + +impl Default for ComputerControlConfig { + fn default() -> Self { + Self { enabled: false } + } +} + // ── Agent integration tools (backend-proxied) ─────────────────────── /// Per-integration on/off toggle. diff --git a/src/openhuman/config/schema/types.rs b/src/openhuman/config/schema/types.rs index e2309a675..dd0ed8e69 100644 --- a/src/openhuman/config/schema/types.rs +++ b/src/openhuman/config/schema/types.rs @@ -112,6 +112,9 @@ pub struct Config { #[serde(default)] pub cost: CostConfig, + #[serde(default)] + pub computer_control: ComputerControlConfig, + #[serde(default)] pub peripherals: PeripheralsConfig, @@ -181,6 +184,7 @@ impl Default for Config { web_search: WebSearchConfig::default(), proxy: ProxyConfig::default(), cost: CostConfig::default(), + computer_control: ComputerControlConfig::default(), peripherals: PeripheralsConfig::default(), agents: HashMap::new(), local_ai: LocalAiConfig::default(), diff --git a/src/openhuman/tools/impl/computer/keyboard.rs b/src/openhuman/tools/impl/computer/keyboard.rs new file mode 100644 index 000000000..373e3e508 --- /dev/null +++ b/src/openhuman/tools/impl/computer/keyboard.rs @@ -0,0 +1,475 @@ +//! Native keyboard control tool using enigo. +//! +//! Provides text typing, individual key presses, and hotkey combinations +//! 
via platform-native APIs (Core Graphics on macOS, SendInput on Windows, +//! X11/libxdo on Linux). + +use crate::openhuman::security::SecurityPolicy; +use crate::openhuman::tools::traits::{PermissionLevel, Tool, ToolResult}; +use async_trait::async_trait; +use enigo::{Direction, Enigo, Key, Keyboard, Settings}; +use serde_json::{json, Value}; +use std::sync::Arc; +use std::time::Duration; +use tracing::{debug, info}; + +/// Small delay between key events in a hotkey sequence so the OS +/// registers each modifier correctly. +const HOTKEY_INTER_KEY_DELAY: Duration = Duration::from_millis(20); + +/// Maximum text length for the `type` action to prevent accidental floods. +const MAX_TYPE_LENGTH: usize = 10_000; + +pub struct KeyboardTool { + security: Arc<SecurityPolicy>, +} + +impl KeyboardTool { + pub fn new(security: Arc<SecurityPolicy>) -> Self { + Self { security } + } +} + +/// Parse a human-readable key name into an enigo `Key`. +/// +/// Accepts common names (case-insensitive) plus single characters. +fn parse_key(name: &str) -> Option<Key> { + let lower = name.to_ascii_lowercase(); + match lower.as_str() { + // Modifiers + "ctrl" | "control" => Some(Key::Control), + "shift" => Some(Key::Shift), + "alt" | "option" => Some(Key::Alt), + "cmd" | "command" | "meta" | "super" | "win" | "windows" => Some(Key::Meta), + + // Navigation + "enter" | "return" => Some(Key::Return), + "tab" => Some(Key::Tab), + "escape" | "esc" => Some(Key::Escape), + "backspace" => Some(Key::Backspace), + "delete" | "del" => Some(Key::Delete), + "space" => Some(Key::Space), + + // Arrow keys + "up" | "arrowup" => Some(Key::UpArrow), + "down" | "arrowdown" => Some(Key::DownArrow), + "left" | "arrowleft" => Some(Key::LeftArrow), + "right" | "arrowright" => Some(Key::RightArrow), + + // Home / End / Page + "home" => Some(Key::Home), + "end" => Some(Key::End), + "pageup" | "page_up" => Some(Key::PageUp), + "pagedown" | "page_down" => Some(Key::PageDown), + + // Function keys + "f1" => Some(Key::F1), + "f2" => Some(Key::F2), + 
"f3" => Some(Key::F3), + "f4" => Some(Key::F4), + "f5" => Some(Key::F5), + "f6" => Some(Key::F6), + "f7" => Some(Key::F7), + "f8" => Some(Key::F8), + "f9" => Some(Key::F9), + "f10" => Some(Key::F10), + "f11" => Some(Key::F11), + "f12" => Some(Key::F12), + + // Caps Lock + "capslock" | "caps_lock" => Some(Key::CapsLock), + + // Single character — letters, digits, punctuation + _ => { + let chars: Vec<char> = name.chars().collect(); + if chars.len() == 1 { + Some(Key::Unicode(chars[0])) + } else { + None + } + } + } +} + +/// Returns true if the key is a modifier (Ctrl, Shift, Alt, Meta). +fn is_modifier(key: &Key) -> bool { + matches!( + key, + Key::Control | Key::Shift | Key::Alt | Key::Meta + ) +} + +#[async_trait] +impl Tool for KeyboardTool { + fn name(&self) -> &str { + "keyboard" + } + + fn description(&self) -> &str { + concat!( + "Simulate keyboard input natively. Actions: type (enter a text string), ", + "press (tap a single key like Enter or Tab), hotkey (key combination like ", + "Ctrl+C or Cmd+Shift+S). Key names are case-insensitive." + ) + } + + fn permission_level(&self) -> PermissionLevel { + PermissionLevel::Dangerous + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "action": { + "type": "string", + "enum": ["type", "press", "hotkey"], + "description": "Keyboard action to perform" + }, + "text": { + "type": "string", + "description": "Text to type. Required for 'type' action. Max 10,000 chars." + }, + "key": { + "type": "string", + "description": "Key name (e.g. 'Enter', 'Tab', 'Escape', 'a', 'F5'). Required for 'press' action." + }, + "keys": { + "type": "array", + "items": { "type": "string" }, + "description": "Key combination as ordered array. Modifiers first, then the final key (e.g. ['Ctrl', 'C'] or ['Cmd', 'Shift', 'S']). Required for 'hotkey' action." 
+ } + }, + "required": ["action"] + }) + } + + async fn execute(&self, args: Value) -> anyhow::Result<ToolResult> { + if !self.security.can_act() { + return Ok(ToolResult::error("Action blocked: autonomy is read-only")); + } + if !self.security.record_action() { + return Ok(ToolResult::error("Action blocked: rate limit exceeded")); + } + + let action = args + .get("action") + .and_then(Value::as_str) + .ok_or_else(|| anyhow::anyhow!("Missing 'action' parameter"))?; + + debug!(tool = "keyboard", action = action, "[computer] keyboard action requested"); + + match action { + "type" => { + let text = args + .get("text") + .and_then(Value::as_str) + .ok_or_else(|| anyhow::anyhow!("Missing 'text' for type action"))? + .to_string(); + + if text.is_empty() { + return Ok(ToolResult::error("'text' cannot be empty")); + } + if text.len() > MAX_TYPE_LENGTH { + return Ok(ToolResult::error(format!( + "Text too long ({} chars). Maximum is {MAX_TYPE_LENGTH}.", + text.len() + ))); + } + + let len = text.len(); + tokio::task::spawn_blocking(move || { + let mut enigo = Enigo::new(&Settings::default()) + .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?; + enigo + .text(&text) + .map_err(|e| anyhow::anyhow!("text typing failed: {e}"))?; + info!( + tool = "keyboard", action = "type", chars = len, + "[computer] typed text" + ); + Ok(ToolResult::success(format!("Typed {len} characters"))) + }) + .await? + } + + "press" => { + let key_name = args + .get("key") + .and_then(Value::as_str) + .ok_or_else(|| anyhow::anyhow!("Missing 'key' for press action"))? + .to_string(); + + let key = parse_key(&key_name).ok_or_else(|| { + anyhow::anyhow!("Unknown key '{key_name}'. 
Use names like Enter, Tab, Escape, F1-F12, a-z, 0-9, Space, etc.") + })?; + + tokio::task::spawn_blocking(move || { + let mut enigo = Enigo::new(&Settings::default()) + .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?; + enigo + .key(key, Direction::Click) + .map_err(|e| anyhow::anyhow!("key press failed: {e}"))?; + info!( + tool = "keyboard", action = "press", key = key_name.as_str(), + "[computer] pressed key" + ); + Ok(ToolResult::success(format!("Pressed key '{key_name}'"))) + }) + .await? + } + + "hotkey" => { + let key_names: Vec<String> = args + .get("keys") + .and_then(Value::as_array) + .ok_or_else(|| anyhow::anyhow!("Missing 'keys' array for hotkey action"))? + .iter() + .filter_map(|v| v.as_str().map(String::from)) + .collect(); + + if key_names.is_empty() { + return Ok(ToolResult::error("'keys' array cannot be empty")); + } + if key_names.len() > 6 { + return Ok(ToolResult::error( + "Too many keys in hotkey combination (max 6)", + )); + } + + let mut keys: Vec<Key> = Vec::with_capacity(key_names.len()); + for name in &key_names { + let key = parse_key(name).ok_or_else(|| { + anyhow::anyhow!("Unknown key '{name}' in hotkey combination") + })?; + keys.push(key); + } + + let combo_desc = key_names.join("+"); + tokio::task::spawn_blocking(move || { + let mut enigo = Enigo::new(&Settings::default()) + .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?; + + // Press all keys in order (modifiers first, then the final key) + for key in &keys { + enigo + .key(*key, Direction::Press) + .map_err(|e| anyhow::anyhow!("key press failed: {e}"))?; + std::thread::sleep(HOTKEY_INTER_KEY_DELAY); + } + + // Release in reverse order + for key in keys.iter().rev() { + enigo + .key(*key, Direction::Release) + .map_err(|e| anyhow::anyhow!("key release failed: {e}"))?; + } + + info!( + tool = "keyboard", action = "hotkey", combo = combo_desc.as_str(), + "[computer] hotkey executed" + ); + Ok(ToolResult::success(format!("Executed hotkey: 
{combo_desc}"))) + }) + .await? + } + + other => Ok(ToolResult::error(format!( + "Unknown keyboard action '{other}'. Use: type, press, hotkey" + ))), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_tool() -> KeyboardTool { + KeyboardTool::new(Arc::new(SecurityPolicy::default())) + } + + #[test] + fn schema_has_required_action() { + let tool = make_tool(); + let schema = tool.parameters_schema(); + assert_eq!(schema["required"], json!(["action"])); + } + + #[test] + fn schema_enumerates_actions() { + let tool = make_tool(); + let schema = tool.parameters_schema(); + let actions = schema["properties"]["action"]["enum"].as_array().unwrap(); + let names: Vec<&str> = actions.iter().map(|v| v.as_str().unwrap()).collect(); + assert!(names.contains(&"type")); + assert!(names.contains(&"press")); + assert!(names.contains(&"hotkey")); + } + + #[test] + fn permission_is_dangerous() { + assert_eq!(make_tool().permission_level(), PermissionLevel::Dangerous); + } + + #[test] + fn name_is_keyboard() { + assert_eq!(make_tool().name(), "keyboard"); + } + + // ── parse_key tests ────────────────────────────────────────── + + #[test] + fn parse_key_modifiers() { + assert_eq!(parse_key("Ctrl"), Some(Key::Control)); + assert_eq!(parse_key("control"), Some(Key::Control)); + assert_eq!(parse_key("Shift"), Some(Key::Shift)); + assert_eq!(parse_key("Alt"), Some(Key::Alt)); + assert_eq!(parse_key("Option"), Some(Key::Alt)); + assert_eq!(parse_key("Cmd"), Some(Key::Meta)); + assert_eq!(parse_key("Command"), Some(Key::Meta)); + assert_eq!(parse_key("Meta"), Some(Key::Meta)); + assert_eq!(parse_key("Super"), Some(Key::Meta)); + assert_eq!(parse_key("Win"), Some(Key::Meta)); + } + + #[test] + fn parse_key_navigation() { + assert_eq!(parse_key("Enter"), Some(Key::Return)); + assert_eq!(parse_key("Return"), Some(Key::Return)); + assert_eq!(parse_key("Tab"), Some(Key::Tab)); + assert_eq!(parse_key("Escape"), Some(Key::Escape)); + assert_eq!(parse_key("Esc"), Some(Key::Escape)); 
+ assert_eq!(parse_key("Backspace"), Some(Key::Backspace)); + assert_eq!(parse_key("Delete"), Some(Key::Delete)); + assert_eq!(parse_key("Space"), Some(Key::Space)); + } + + #[test] + fn parse_key_arrows() { + assert_eq!(parse_key("Up"), Some(Key::UpArrow)); + assert_eq!(parse_key("Down"), Some(Key::DownArrow)); + assert_eq!(parse_key("Left"), Some(Key::LeftArrow)); + assert_eq!(parse_key("Right"), Some(Key::RightArrow)); + } + + #[test] + fn parse_key_function_keys() { + assert_eq!(parse_key("F1"), Some(Key::F1)); + assert_eq!(parse_key("f5"), Some(Key::F5)); + assert_eq!(parse_key("F12"), Some(Key::F12)); + } + + #[test] + fn parse_key_single_chars() { + assert_eq!(parse_key("a"), Some(Key::Unicode('a'))); + assert_eq!(parse_key("A"), Some(Key::Unicode('A'))); + assert_eq!(parse_key("5"), Some(Key::Unicode('5'))); + assert_eq!(parse_key("/"), Some(Key::Unicode('/'))); + } + + #[test] + fn parse_key_unknown_returns_none() { + assert_eq!(parse_key("FooBar"), None); + assert_eq!(parse_key(""), None); + } + + #[test] + fn modifier_detection() { + assert!(is_modifier(&Key::Control)); + assert!(is_modifier(&Key::Shift)); + assert!(is_modifier(&Key::Alt)); + assert!(is_modifier(&Key::Meta)); + assert!(!is_modifier(&Key::Return)); + assert!(!is_modifier(&Key::Unicode('a'))); + } + + // ── execute validation tests ───────────────────────────────── + + #[tokio::test] + async fn missing_action_returns_error() { + let tool = make_tool(); + let result = tool.execute(json!({})).await; + assert!(result.is_err() || result.unwrap().is_error); + } + + #[tokio::test] + async fn unknown_action_returns_error() { + let tool = make_tool(); + let result = tool.execute(json!({"action": "smash"})).await.unwrap(); + assert!(result.is_error); + assert!(result.output().contains("Unknown keyboard action")); + } + + #[tokio::test] + async fn type_missing_text_returns_error() { + let tool = make_tool(); + let result = tool.execute(json!({"action": "type"})).await; + assert!(result.is_err() || 
result.unwrap().is_error); + } + + #[tokio::test] + async fn type_empty_text_returns_error() { + let tool = make_tool(); + let result = tool + .execute(json!({"action": "type", "text": ""})) + .await + .unwrap(); + assert!(result.is_error); + } + + #[tokio::test] + async fn press_missing_key_returns_error() { + let tool = make_tool(); + let result = tool.execute(json!({"action": "press"})).await; + assert!(result.is_err() || result.unwrap().is_error); + } + + #[tokio::test] + async fn press_unknown_key_returns_error() { + let tool = make_tool(); + let result = tool + .execute(json!({"action": "press", "key": "FooBarBaz"})) + .await; + assert!(result.is_err() || result.unwrap().is_error); + } + + #[tokio::test] + async fn hotkey_missing_keys_returns_error() { + let tool = make_tool(); + let result = tool.execute(json!({"action": "hotkey"})).await; + assert!(result.is_err() || result.unwrap().is_error); + } + + #[tokio::test] + async fn hotkey_empty_array_returns_error() { + let tool = make_tool(); + let result = tool + .execute(json!({"action": "hotkey", "keys": []})) + .await + .unwrap(); + assert!(result.is_error); + } + + #[tokio::test] + async fn hotkey_too_many_keys_returns_error() { + let tool = make_tool(); + let result = tool + .execute(json!({"action": "hotkey", "keys": ["a","b","c","d","e","f","g"]})) + .await + .unwrap(); + assert!(result.is_error); + } + + #[tokio::test] + async fn type_too_long_returns_error() { + let tool = make_tool(); + let long_text = "x".repeat(MAX_TYPE_LENGTH + 1); + let result = tool + .execute(json!({"action": "type", "text": long_text})) + .await + .unwrap(); + assert!(result.is_error); + assert!(result.output().contains("too long")); + } +} diff --git a/src/openhuman/tools/impl/computer/mod.rs b/src/openhuman/tools/impl/computer/mod.rs new file mode 100644 index 000000000..d9136c81f --- /dev/null +++ b/src/openhuman/tools/impl/computer/mod.rs @@ -0,0 +1,5 @@ +mod keyboard; +mod mouse; + +pub use keyboard::KeyboardTool; +pub 
use mouse::MouseTool; diff --git a/src/openhuman/tools/impl/computer/mouse.rs b/src/openhuman/tools/impl/computer/mouse.rs new file mode 100644 index 000000000..b6ac3a7cb --- /dev/null +++ b/src/openhuman/tools/impl/computer/mouse.rs @@ -0,0 +1,401 @@ +//! Native mouse control tool using enigo. +//! +//! Provides absolute-coordinate mouse movement, clicking, double-clicking, +//! dragging, and scrolling via platform-native APIs (Core Graphics on macOS, +//! SendInput on Windows, X11/libxdo on Linux). + +use crate::openhuman::security::SecurityPolicy; +use crate::openhuman::tools::traits::{PermissionLevel, Tool, ToolResult}; +use async_trait::async_trait; +use enigo::{Button, Coordinate, Direction, Enigo, Mouse, Settings}; +use serde_json::{json, Value}; +use std::sync::Arc; +use tracing::{debug, info}; + +/// Coordinate safety bound — reject values outside this range. +const MAX_COORD: i64 = 32768; + +pub struct MouseTool { + security: Arc<SecurityPolicy>, +} + +impl MouseTool { + pub fn new(security: Arc<SecurityPolicy>) -> Self { + Self { security } + } +} + +fn parse_button(args: &Value) -> Button { + match args.get("button").and_then(Value::as_str) { + Some("right") => Button::Right, + Some("middle") => Button::Middle, + _ => Button::Left, + } +} + +fn require_xy(args: &Value) -> anyhow::Result<(i32, i32)> { + let x = args + .get("x") + .and_then(Value::as_i64) + .ok_or_else(|| anyhow::anyhow!("Missing required 'x' parameter"))?; + let y = args + .get("y") + .and_then(Value::as_i64) + .ok_or_else(|| anyhow::anyhow!("Missing required 'y' parameter"))?; + validate_coord("x", x)?; + validate_coord("y", y)?; + Ok((x as i32, y as i32)) +} + +fn validate_coord(name: &str, value: i64) -> anyhow::Result<()> { + if value < 0 || value > MAX_COORD { + anyhow::bail!("'{name}' coordinate {value} is out of range (0..{MAX_COORD})"); + } + Ok(()) +} + +#[async_trait] +impl Tool for MouseTool { + fn name(&self) -> &str { + "mouse" + } + + fn description(&self) -> &str { + concat!( + "Control the mouse cursor 
natively. Actions: move (reposition cursor), ", + "click (move + click), double_click, drag (press at start, release at end), ", + "scroll (vertical/horizontal). All coordinates are absolute screen pixels." + ) + } + + fn permission_level(&self) -> PermissionLevel { + PermissionLevel::Dangerous + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "action": { + "type": "string", + "enum": ["move", "click", "double_click", "drag", "scroll"], + "description": "Mouse action to perform" + }, + "x": { + "type": "integer", + "description": "Target X coordinate (absolute screen pixels). Required for move, click, double_click." + }, + "y": { + "type": "integer", + "description": "Target Y coordinate (absolute screen pixels). Required for move, click, double_click." + }, + "button": { + "type": "string", + "enum": ["left", "right", "middle"], + "description": "Mouse button for click/double_click/drag. Default: left." + }, + "start_x": { + "type": "integer", + "description": "Drag start X coordinate (absolute). Required for drag." + }, + "start_y": { + "type": "integer", + "description": "Drag start Y coordinate (absolute). Required for drag." + }, + "scroll_x": { + "type": "integer", + "description": "Horizontal scroll amount (positive = right, negative = left). For scroll action." + }, + "scroll_y": { + "type": "integer", + "description": "Vertical scroll amount (positive = down, negative = up). For scroll action." 
+ } + }, + "required": ["action"] + }) + } + + async fn execute(&self, args: Value) -> anyhow::Result<ToolResult> { + if !self.security.can_act() { + return Ok(ToolResult::error("Action blocked: autonomy is read-only")); + } + if !self.security.record_action() { + return Ok(ToolResult::error("Action blocked: rate limit exceeded")); + } + + let action = args + .get("action") + .and_then(Value::as_str) + .ok_or_else(|| anyhow::anyhow!("Missing 'action' parameter"))?; + + debug!(tool = "mouse", action = action, "[computer] mouse action requested"); + + match action { + "move" => { + let (x, y) = require_xy(&args)?; + tokio::task::spawn_blocking(move || { + let mut enigo = Enigo::new(&Settings::default()) + .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?; + enigo + .move_mouse(x, y, Coordinate::Abs) + .map_err(|e| anyhow::anyhow!("move_mouse failed: {e}"))?; + info!(tool = "mouse", action = "move", x = x, y = y, "[computer] cursor moved"); + Ok(ToolResult::success(format!("Moved cursor to ({x}, {y})"))) + }) + .await? + } + + "click" => { + let (x, y) = require_xy(&args)?; + let button = parse_button(&args); + tokio::task::spawn_blocking(move || { + let mut enigo = Enigo::new(&Settings::default()) + .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?; + enigo + .move_mouse(x, y, Coordinate::Abs) + .map_err(|e| anyhow::anyhow!("move_mouse failed: {e}"))?; + enigo + .button(button, Direction::Click) + .map_err(|e| anyhow::anyhow!("button click failed: {e}"))?; + info!( + tool = "mouse", action = "click", + x = x, y = y, button = ?button, + "[computer] clicked" + ); + Ok(ToolResult::success(format!( + "Clicked {button:?} at ({x}, {y})" + ))) + }) + .await? 
+ } + + "double_click" => { + let (x, y) = require_xy(&args)?; + let button = parse_button(&args); + tokio::task::spawn_blocking(move || { + let mut enigo = Enigo::new(&Settings::default()) + .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?; + enigo + .move_mouse(x, y, Coordinate::Abs) + .map_err(|e| anyhow::anyhow!("move_mouse failed: {e}"))?; + enigo + .button(button, Direction::Click) + .map_err(|e| anyhow::anyhow!("button click failed: {e}"))?; + enigo + .button(button, Direction::Click) + .map_err(|e| anyhow::anyhow!("button click failed: {e}"))?; + info!( + tool = "mouse", action = "double_click", + x = x, y = y, button = ?button, + "[computer] double-clicked" + ); + Ok(ToolResult::success(format!( + "Double-clicked {button:?} at ({x}, {y})" + ))) + }) + .await? + } + + "drag" => { + let start_x = args + .get("start_x") + .and_then(Value::as_i64) + .ok_or_else(|| anyhow::anyhow!("Missing 'start_x' for drag"))?; + let start_y = args + .get("start_y") + .and_then(Value::as_i64) + .ok_or_else(|| anyhow::anyhow!("Missing 'start_y' for drag"))?; + validate_coord("start_x", start_x)?; + validate_coord("start_y", start_y)?; + let (end_x, end_y) = require_xy(&args)?; + let button = parse_button(&args); + let sx = start_x as i32; + let sy = start_y as i32; + + tokio::task::spawn_blocking(move || { + let mut enigo = Enigo::new(&Settings::default()) + .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?; + enigo + .move_mouse(sx, sy, Coordinate::Abs) + .map_err(|e| anyhow::anyhow!("move_mouse (start) failed: {e}"))?; + enigo + .button(button, Direction::Press) + .map_err(|e| anyhow::anyhow!("button press failed: {e}"))?; + enigo + .move_mouse(end_x, end_y, Coordinate::Abs) + .map_err(|e| anyhow::anyhow!("move_mouse (end) failed: {e}"))?; + enigo + .button(button, Direction::Release) + .map_err(|e| anyhow::anyhow!("button release failed: {e}"))?; + info!( + tool = "mouse", action = "drag", + start_x = sx, start_y = sy, + end_x = 
end_x, end_y = end_y, button = ?button, + "[computer] dragged" + ); + Ok(ToolResult::success(format!( + "Dragged {button:?} from ({sx}, {sy}) to ({end_x}, {end_y})" + ))) + }) + .await? + } + + "scroll" => { + let scroll_x = args.get("scroll_x").and_then(Value::as_i64).unwrap_or(0) as i32; + let scroll_y = args.get("scroll_y").and_then(Value::as_i64).unwrap_or(0) as i32; + + if scroll_x == 0 && scroll_y == 0 { + return Ok(ToolResult::error( + "At least one of 'scroll_x' or 'scroll_y' must be non-zero", + )); + } + + tokio::task::spawn_blocking(move || { + let mut enigo = Enigo::new(&Settings::default()) + .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?; + if scroll_y != 0 { + enigo + .scroll(scroll_y, enigo::Axis::Vertical) + .map_err(|e| anyhow::anyhow!("vertical scroll failed: {e}"))?; + } + if scroll_x != 0 { + enigo + .scroll(scroll_x, enigo::Axis::Horizontal) + .map_err(|e| anyhow::anyhow!("horizontal scroll failed: {e}"))?; + } + info!( + tool = "mouse", action = "scroll", + scroll_x = scroll_x, scroll_y = scroll_y, + "[computer] scrolled" + ); + Ok(ToolResult::success(format!( + "Scrolled (x={scroll_x}, y={scroll_y})" + ))) + }) + .await? + } + + other => Ok(ToolResult::error(format!( + "Unknown mouse action '{other}'. 
Use: move, click, double_click, drag, scroll" + ))), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_tool() -> MouseTool { + MouseTool::new(Arc::new(SecurityPolicy::default())) + } + + #[test] + fn schema_has_required_action() { + let tool = make_tool(); + let schema = tool.parameters_schema(); + assert_eq!(schema["required"], json!(["action"])); + } + + #[test] + fn schema_enumerates_actions() { + let tool = make_tool(); + let schema = tool.parameters_schema(); + let actions = schema["properties"]["action"]["enum"].as_array().unwrap(); + let names: Vec<&str> = actions.iter().map(|v| v.as_str().unwrap()).collect(); + assert!(names.contains(&"move")); + assert!(names.contains(&"click")); + assert!(names.contains(&"double_click")); + assert!(names.contains(&"drag")); + assert!(names.contains(&"scroll")); + } + + #[test] + fn permission_is_dangerous() { + let tool = make_tool(); + assert_eq!(tool.permission_level(), PermissionLevel::Dangerous); + } + + #[test] + fn name_is_mouse() { + assert_eq!(make_tool().name(), "mouse"); + } + + #[test] + fn coord_validation_rejects_negative() { + assert!(validate_coord("x", -1).is_err()); + } + + #[test] + fn coord_validation_rejects_overflow() { + assert!(validate_coord("x", MAX_COORD + 1).is_err()); + } + + #[test] + fn coord_validation_accepts_zero() { + assert!(validate_coord("x", 0).is_ok()); + } + + #[test] + fn coord_validation_accepts_max() { + assert!(validate_coord("x", MAX_COORD).is_ok()); + } + + #[test] + fn parse_button_defaults_to_left() { + assert_eq!(parse_button(&json!({})), Button::Left); + assert_eq!(parse_button(&json!({"button": "left"})), Button::Left); + } + + #[test] + fn parse_button_right() { + assert_eq!(parse_button(&json!({"button": "right"})), Button::Right); + } + + #[test] + fn parse_button_middle() { + assert_eq!(parse_button(&json!({"button": "middle"})), Button::Middle); + } + + #[tokio::test] + async fn missing_action_returns_error() { + let tool = make_tool(); + let result 
= tool.execute(json!({})).await; + assert!(result.is_err() || result.unwrap().is_error); + } + + #[tokio::test] + async fn unknown_action_returns_error() { + let tool = make_tool(); + let result = tool.execute(json!({"action": "teleport"})).await.unwrap(); + assert!(result.is_error); + assert!(result.output().contains("Unknown mouse action")); + } + + #[tokio::test] + async fn click_missing_coords_returns_error() { + let tool = make_tool(); + let result = tool.execute(json!({"action": "click"})).await; + // Should fail with missing x/y + assert!(result.is_err() || result.unwrap().is_error); + } + + #[tokio::test] + async fn scroll_zero_both_returns_error() { + let tool = make_tool(); + let result = tool + .execute(json!({"action": "scroll", "scroll_x": 0, "scroll_y": 0})) + .await + .unwrap(); + assert!(result.is_error); + } + + #[tokio::test] + async fn drag_missing_start_returns_error() { + let tool = make_tool(); + let result = tool + .execute(json!({"action": "drag", "x": 100, "y": 100})) + .await; + assert!(result.is_err() || result.unwrap().is_error); + } +} diff --git a/src/openhuman/tools/impl/mod.rs b/src/openhuman/tools/impl/mod.rs index 003275b84..e15740596 100644 --- a/src/openhuman/tools/impl/mod.rs +++ b/src/openhuman/tools/impl/mod.rs @@ -1,5 +1,6 @@ pub mod agent; pub mod browser; +pub mod computer; pub mod cron; pub mod filesystem; pub mod memory; @@ -8,6 +9,7 @@ pub mod system; pub use agent::*; pub use browser::*; +pub use computer::*; pub use cron::*; pub use filesystem::*; pub use memory::*; diff --git a/src/openhuman/tools/ops.rs b/src/openhuman/tools/ops.rs index 56a2c7ad3..ef7b7f0dc 100644 --- a/src/openhuman/tools/ops.rs +++ b/src/openhuman/tools/ops.rs @@ -154,6 +154,13 @@ pub fn all_tools_with_runtime( tools.push(Box::new(ScreenshotTool::new(security.clone()))); tools.push(Box::new(ImageInfoTool::new(security.clone()))); + // Native mouse + keyboard control (disabled by default) + if root_config.computer_control.enabled { + 
tools.push(Box::new(MouseTool::new(security.clone()))); + tools.push(Box::new(KeyboardTool::new(security.clone()))); + tracing::debug!("[computer] mouse and keyboard tools registered"); + } + if let Some(key) = composio_key { if !key.is_empty() { tools.push(Box::new(ComposioTool::new( From 3cd7180d51dc27ecb4174bb98b906298c5cc215f Mon Sep 17 00:00:00 2001 From: Steven Enamakel Date: Sun, 12 Apr 2026 14:40:38 -0700 Subject: [PATCH 2/3] style: apply cargo fmt to computer control tools --- src/openhuman/tools/impl/computer/keyboard.rs | 27 ++++++++++++------- src/openhuman/tools/impl/computer/mouse.rs | 20 +++++++++++--- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/src/openhuman/tools/impl/computer/keyboard.rs b/src/openhuman/tools/impl/computer/keyboard.rs index 373e3e508..aa3acf040 100644 --- a/src/openhuman/tools/impl/computer/keyboard.rs +++ b/src/openhuman/tools/impl/computer/keyboard.rs @@ -93,10 +93,7 @@ fn parse_key(name: &str) -> Option { /// Returns true if the key is a modifier (Ctrl, Shift, Alt, Meta). 
fn is_modifier(key: &Key) -> bool { - matches!( - key, - Key::Control | Key::Shift | Key::Alt | Key::Meta - ) + matches!(key, Key::Control | Key::Shift | Key::Alt | Key::Meta) } #[async_trait] @@ -157,7 +154,11 @@ impl Tool for KeyboardTool { .and_then(Value::as_str) .ok_or_else(|| anyhow::anyhow!("Missing 'action' parameter"))?; - debug!(tool = "keyboard", action = action, "[computer] keyboard action requested"); + debug!( + tool = "keyboard", + action = action, + "[computer] keyboard action requested" + ); match action { "type" => { @@ -185,7 +186,9 @@ impl Tool for KeyboardTool { .text(&text) .map_err(|e| anyhow::anyhow!("text typing failed: {e}"))?; info!( - tool = "keyboard", action = "type", chars = len, + tool = "keyboard", + action = "type", + chars = len, "[computer] typed text" ); Ok(ToolResult::success(format!("Typed {len} characters"))) @@ -211,7 +214,9 @@ impl Tool for KeyboardTool { .key(key, Direction::Click) .map_err(|e| anyhow::anyhow!("key press failed: {e}"))?; info!( - tool = "keyboard", action = "press", key = key_name.as_str(), + tool = "keyboard", + action = "press", + key = key_name.as_str(), "[computer] pressed key" ); Ok(ToolResult::success(format!("Pressed key '{key_name}'"))) @@ -266,10 +271,14 @@ impl Tool for KeyboardTool { } info!( - tool = "keyboard", action = "hotkey", combo = combo_desc.as_str(), + tool = "keyboard", + action = "hotkey", + combo = combo_desc.as_str(), "[computer] hotkey executed" ); - Ok(ToolResult::success(format!("Executed hotkey: {combo_desc}"))) + Ok(ToolResult::success(format!( + "Executed hotkey: {combo_desc}" + ))) }) .await? 
} diff --git a/src/openhuman/tools/impl/computer/mouse.rs b/src/openhuman/tools/impl/computer/mouse.rs index b6ac3a7cb..327206822 100644 --- a/src/openhuman/tools/impl/computer/mouse.rs +++ b/src/openhuman/tools/impl/computer/mouse.rs @@ -128,7 +128,11 @@ impl Tool for MouseTool { .and_then(Value::as_str) .ok_or_else(|| anyhow::anyhow!("Missing 'action' parameter"))?; - debug!(tool = "mouse", action = action, "[computer] mouse action requested"); + debug!( + tool = "mouse", + action = action, + "[computer] mouse action requested" + ); match action { "move" => { @@ -139,7 +143,13 @@ impl Tool for MouseTool { enigo .move_mouse(x, y, Coordinate::Abs) .map_err(|e| anyhow::anyhow!("move_mouse failed: {e}"))?; - info!(tool = "mouse", action = "move", x = x, y = y, "[computer] cursor moved"); + info!( + tool = "mouse", + action = "move", + x = x, + y = y, + "[computer] cursor moved" + ); Ok(ToolResult::success(format!("Moved cursor to ({x}, {y})"))) }) .await? @@ -264,8 +274,10 @@ impl Tool for MouseTool { .map_err(|e| anyhow::anyhow!("horizontal scroll failed: {e}"))?; } info!( - tool = "mouse", action = "scroll", - scroll_x = scroll_x, scroll_y = scroll_y, + tool = "mouse", + action = "scroll", + scroll_x = scroll_x, + scroll_y = scroll_y, "[computer] scrolled" ); Ok(ToolResult::success(format!( From 02202ebbdd6913dbb7132304dc0e696734d6134a Mon Sep 17 00:00:00 2001 From: Steven Enamakel Date: Sun, 12 Apr 2026 15:05:38 -0700 Subject: [PATCH 3/3] =?UTF-8?q?fix:=20harden=20computer=20control=20tools?= =?UTF-8?q?=20=E2=80=94=20safe=20cleanup,=20strict=20validation,=20gate=20?= =?UTF-8?q?tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - keyboard hotkey: track pressed keys and always release in reverse on error (prevents stuck modifiers); validate modifier-first pattern (reject ["a","Ctrl"], ["Ctrl"], ["Ctrl","Shift"]) - mouse drag: guarantee button release via best-effort cleanup after press, even when move_mouse fails - 
mouse parse_button: return Result, reject unknown/non-string values instead of silently defaulting to left - mouse scroll: use i32::try_from instead of silent `as i32` truncation - Add debug logs on security block branches (can_act / record_action) - Add ops.rs regression tests for computer_control enabled/disabled gate --- src/openhuman/tools/impl/computer/keyboard.rs | 125 +++++++++++++++--- src/openhuman/tools/impl/computer/mouse.rs | 101 +++++++++++--- src/openhuman/tools/ops.rs | 80 +++++++++++ 3 files changed, 268 insertions(+), 38 deletions(-) diff --git a/src/openhuman/tools/impl/computer/keyboard.rs b/src/openhuman/tools/impl/computer/keyboard.rs index aa3acf040..7af85890b 100644 --- a/src/openhuman/tools/impl/computer/keyboard.rs +++ b/src/openhuman/tools/impl/computer/keyboard.rs @@ -143,9 +143,14 @@ impl Tool for KeyboardTool { async fn execute(&self, args: Value) -> anyhow::Result { if !self.security.can_act() { + debug!( + tool = "keyboard", + "[computer] blocked: autonomy is read-only" + ); return Ok(ToolResult::error("Action blocked: autonomy is read-only")); } if !self.security.record_action() { + debug!(tool = "keyboard", "[computer] blocked: rate limit exceeded"); return Ok(ToolResult::error("Action blocked: rate limit exceeded")); } @@ -225,13 +230,19 @@ impl Tool for KeyboardTool { } "hotkey" => { - let key_names: Vec = args + let raw_keys = args .get("keys") .and_then(Value::as_array) - .ok_or_else(|| anyhow::anyhow!("Missing 'keys' array for hotkey action"))? - .iter() - .filter_map(|v| v.as_str().map(String::from)) - .collect(); + .ok_or_else(|| anyhow::anyhow!("Missing 'keys' array for hotkey action"))?; + + // Reject non-string entries up front. 
+ let mut key_names: Vec = Vec::with_capacity(raw_keys.len()); + for (i, v) in raw_keys.iter().enumerate() { + let s = v.as_str().ok_or_else(|| { + anyhow::anyhow!("Element {i} in 'keys' array is not a string (got {v})") + })?; + key_names.push(s.to_string()); + } if key_names.is_empty() { return Ok(ToolResult::error("'keys' array cannot be empty")); @@ -241,7 +252,13 @@ impl Tool for KeyboardTool { "Too many keys in hotkey combination (max 6)", )); } + if key_names.len() < 2 { + return Ok(ToolResult::error( + "Hotkey requires at least one modifier and one final key (e.g. ['Ctrl', 'C'])", + )); + } + // Parse all key names into Key values. let mut keys: Vec = Vec::with_capacity(key_names.len()); for name in &key_names { let key = parse_key(name).ok_or_else(|| { @@ -250,25 +267,58 @@ impl Tool for KeyboardTool { keys.push(key); } + // Validate modifier-first pattern: all keys except the last + // must be modifiers, and the last must be a non-modifier. + let (modifiers, final_key) = keys.split_at(keys.len() - 1); + for (i, key) in modifiers.iter().enumerate() { + if !is_modifier(key) { + return Ok(ToolResult::error(format!( + "Key '{}' at position {i} must be a modifier (Ctrl/Shift/Alt/Cmd). Non-modifier keys must be last.", + key_names[i] + ))); + } + } + if is_modifier(&final_key[0]) { + return Ok(ToolResult::error(format!( + "Last key '{}' cannot be a modifier. Hotkey must end with a non-modifier key (e.g. 
'C', 'Enter').", + key_names.last().unwrap() + ))); + } + let combo_desc = key_names.join("+"); tokio::task::spawn_blocking(move || { let mut enigo = Enigo::new(&Settings::default()) .map_err(|e| anyhow::anyhow!("Failed to create enigo instance: {e}"))?; - // Press all keys in order (modifiers first, then the final key) - for key in &keys { - enigo - .key(*key, Direction::Press) - .map_err(|e| anyhow::anyhow!("key press failed: {e}"))?; - std::thread::sleep(HOTKEY_INTER_KEY_DELAY); + // Press keys in order, tracking which were successfully + // pressed so we can release them on error. + let mut pressed_keys: Vec = Vec::with_capacity(keys.len()); + let press_result: Result<(), anyhow::Error> = (|| { + for key in &keys { + enigo.key(*key, Direction::Press).map_err(|e| { + anyhow::anyhow!("key press failed for {key:?}: {e}") + })?; + pressed_keys.push(*key); + std::thread::sleep(HOTKEY_INTER_KEY_DELAY); + } + Ok(()) + })(); + + // Always release all successfully pressed keys in reverse + // order, even if a press failed partway through. + for key in pressed_keys.iter().rev() { + if let Err(e) = enigo.key(*key, Direction::Release) { + tracing::warn!( + tool = "keyboard", + key = ?key, + error = %e, + "[computer] best-effort key release failed during cleanup" + ); + } } - // Release in reverse order - for key in keys.iter().rev() { - enigo - .key(*key, Direction::Release) - .map_err(|e| anyhow::anyhow!("key release failed: {e}"))?; - } + // Now propagate any press error. 
+ press_result?; info!( tool = "keyboard", @@ -481,4 +531,45 @@ mod tests { assert!(result.is_error); assert!(result.output().contains("too long")); } + + // ── hotkey validation tests ────────────────────────────────── + + #[tokio::test] + async fn hotkey_non_string_entry_returns_error() { + let tool = make_tool(); + let result = tool + .execute(json!({"action": "hotkey", "keys": ["Ctrl", 1]})) + .await; + assert!(result.is_err() || result.unwrap().is_error); + } + + #[tokio::test] + async fn hotkey_modifier_only_returns_error() { + let tool = make_tool(); + let result = tool + .execute(json!({"action": "hotkey", "keys": ["Ctrl"]})) + .await + .unwrap(); + assert!(result.is_error); + } + + #[tokio::test] + async fn hotkey_non_modifier_before_last_returns_error() { + let tool = make_tool(); + let result = tool + .execute(json!({"action": "hotkey", "keys": ["a", "Ctrl"]})) + .await + .unwrap(); + assert!(result.is_error); + } + + #[tokio::test] + async fn hotkey_modifier_as_last_returns_error() { + let tool = make_tool(); + let result = tool + .execute(json!({"action": "hotkey", "keys": ["Ctrl", "Shift"]})) + .await + .unwrap(); + assert!(result.is_error); + } } diff --git a/src/openhuman/tools/impl/computer/mouse.rs b/src/openhuman/tools/impl/computer/mouse.rs index 327206822..d2c6edea2 100644 --- a/src/openhuman/tools/impl/computer/mouse.rs +++ b/src/openhuman/tools/impl/computer/mouse.rs @@ -10,7 +10,7 @@ use async_trait::async_trait; use enigo::{Button, Coordinate, Direction, Enigo, Mouse, Settings}; use serde_json::{json, Value}; use std::sync::Arc; -use tracing::{debug, info}; +use tracing::{debug, info, warn}; /// Coordinate safety bound — reject values outside this range. 
const MAX_COORD: i64 = 32768; @@ -25,11 +25,18 @@ impl MouseTool { } } -fn parse_button(args: &Value) -> Button { - match args.get("button").and_then(Value::as_str) { - Some("right") => Button::Right, - Some("middle") => Button::Middle, - _ => Button::Left, +fn parse_button(args: &Value) -> anyhow::Result