diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 4f4075169..68cd21f6a 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -12,16 +12,26 @@ use crate::integrations::sessions::{IntegrationSession, get_session_hashmap_key} use crate::global_context::GlobalContext; use crate::call_validation::{ChatContent, ChatMessage}; use crate::scratchpads::multimodality::MultimodalElement; +use crate::postprocessing::pp_command_output::{CmdlineOutputFilter, output_mini_postprocessing}; use crate::tools::tools_description::{Tool, ToolDesc, ToolParam}; use reqwest::Client; use std::path::PathBuf; -use headless_chrome::{Browser, LaunchOptions, Tab}; +use headless_chrome::{Browser, LaunchOptions, Tab as HeadlessTab}; use headless_chrome::browser::tab::point::Point; use headless_chrome::protocol::cdp::Page; use headless_chrome::protocol::cdp::Emulation; +use headless_chrome::protocol::cdp::types::Event; use serde::{Deserialize, Serialize}; +use std::sync::Mutex; +use std::fmt; +use tokio::time::sleep; +use chrono::DateTime; +use base64::Engine; +use std::io::Cursor; +use image::imageops::FilterType; +use image::{ImageFormat, ImageReader}; #[derive(Clone, Serialize, Deserialize, Debug)] pub struct IntegrationChrome { @@ -36,12 +46,52 @@ fn default_headless() -> bool { true } pub struct ToolChrome { integration_chrome: IntegrationChrome, - supports_clicks: bool, +} + +#[derive(Clone, Debug)] +enum DeviceType { + DESKTOP, + MOBILE, +} + +impl fmt::Display for DeviceType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + DeviceType::DESKTOP => write!(f, "desktop"), + DeviceType::MOBILE => write!(f, "mobile"), + } + } +} + +const MAX_CACHED_LOG_LINES: usize = 1000; + +#[derive(Clone)] +pub struct ChromeTab { + headless_tab: Arc, + device: DeviceType, + tab_id: String, + screenshot_scale_factor: f64, + tab_log: Arc>>, +} + +impl ChromeTab { + fn new(headless_tab: Arc, device: &DeviceType, tab_id: &String) -> Self { + Self { + headless_tab, + device: device.clone(), + tab_id: tab_id.clone(), + screenshot_scale_factor: 1.0, + tab_log: Arc::new(Mutex::new(Vec::new())), + } + } + pub fn state_string(&self) -> String { + format!("tab_id `{}` device `{}` uri `{}`", self.tab_id.clone(), self.device, self.headless_tab.get_url()) + } } struct ChromeSession { browser: Browser, - tabs: HashMap>, + tabs: HashMap>>, } impl ChromeSession { @@ -66,15 +116,12 @@ impl IntegrationSession for ChromeSession } impl ToolChrome { - pub fn new_from_yaml(v: &serde_yaml::Value, supports_clicks: bool,) -> Result { + pub fn new_from_yaml(v: &serde_yaml::Value) -> Result { let integration_chrome = serde_yaml::from_value::(v.clone()).map_err(|e| { let location = e.location().map(|loc| format!(" at line {}, column {}", loc.line(), loc.column())).unwrap_or_default(); format!("{}{}", e.to_string(), location) })?; - Ok(Self { - integration_chrome, - supports_clicks, - }) + Ok(Self { integration_chrome }) } } @@ -97,37 +144,43 @@ impl Tool for ToolChrome { None => return Err("Missing argument `commands`".to_string()) }; - let mut content = vec![]; + let session_hashmap_key = get_session_hashmap_key("chrome", &chat_id); + let mut tool_log = setup_chrome_session(gcx.clone(), &self.integration_chrome, &session_hashmap_key).await?; + + let command_session = { + let gcx_locked = gcx.read().await; + gcx_locked.integration_sessions.get(&session_hashmap_key) + .ok_or(format!("Error getting chrome session for chat: {}", chat_id))? + .clone() + }; + + let mut mutlimodal_els = vec![]; for command in commands_str.lines().map(|s| s.trim()).collect::>() { let parsed_command = match parse_single_command(&command.to_string()) { Ok(command) => command, Err(e) => { - content.push(MultimodalElement::new( - "text".to_string(), - format!("Failed to parse command: {}. Error: {}.", command, e) - )?); + tool_log.push(format!("failed to parse command `{}`: {}.", command, e)); break } }; - match interact_with_chrome( - gcx.clone(), - &chat_id, - &self.integration_chrome, - &parsed_command, - ).await { - Ok(command_content) => { - content.extend(command_content); + match chrome_command_exec(&parsed_command, command_session.clone()).await { + Ok((execute_log, command_multimodal_els)) => { + tool_log.extend(execute_log); + mutlimodal_els.extend(command_multimodal_els); }, Err(e) => { - content.push(MultimodalElement::new( - "text".to_string(), - format!("Failed to execute command: {}. Error: {}.", command, e) - )?); + tool_log.push(format!("failed to execute command `{}`: {}.", command, e)); break } }; } + let mut content= vec![]; + content.push(MultimodalElement::new( + "text".to_string(), tool_log.join("\n") + )?); + content.extend(mutlimodal_els); + let msg = ContextEnum::ChatMessage(ChatMessage { role: "tool".to_string(), content: ChatContent::Multimodal(content), @@ -140,24 +193,36 @@ impl Tool for ToolChrome { } fn tool_description(&self) -> ToolDesc { - let mut commands_desc = r#"One or several commands separated by newline. The is an integer, for example 10, for you to identify the tab later. Supported commands: -navigate_to -screenshot -html -reload -device "#.to_string(); - if self.supports_clicks { - commands_desc = format!("{}\nclick \ninsert_text \n", commands_desc); - } + let tool_description = vec![ + "A real web browser with graphical interface.", + "Notes about screenshot modes:", + "- plain mode is for visual validation and exploration;", + "- highlight mode gets clickable elements map to use it for click command.", + ].join("\n"); + let supported_commands = vec![ + "open_tab ", + "navigate_to ", + "screenshot ", + // "html ", + "reload ", + "press_key_at ", + "type_text_at ", + "tab_log ", + "click_at ", + ]; + let commands_description = format!( + "One or several commands separated by newline. \ + The is an integer, for example 10, for you to identify the tab later. \ + Supported commands:\n{}", supported_commands.join("\n")); ToolDesc { name: "chrome".to_string(), agentic: true, experimental: true, - description: "A real web browser with graphical interface.".to_string(), + description: tool_description, parameters: vec![ToolParam { name: "commands".to_string(), param_type: "string".to_string(), - description: commands_desc, + description: commands_description, }], parameters_required: vec!["commands".to_string()], } @@ -224,141 +289,384 @@ async fn setup_chrome_session( Ok(setup_log) } -async fn navigate_to(tab: &Arc, url: &String) -> Result { - tab.navigate_to(url.as_str()).map_err(|e| e.to_string())?; - tab.wait_until_navigated().map_err(|e| e.to_string())?; - Ok(format!("Chrome tab navigated to {}", tab.get_url())) -} +async fn capture_screenshot_base64( + tab: Arc>, + highlight: bool, +) -> Result<(Vec, MultimodalElement), String> { + let mut interactive_element_map = vec![]; + let base64_data = { + let tab_lock = tab.lock().await; + match { + if highlight { + interactive_element_map = highlight_elements(&tab_lock.headless_tab) + .await.map_err(|e| e.to_string())?; + } + let data = tab_lock.headless_tab.call_method(Page::CaptureScreenshot { + format: Some(Page::CaptureScreenshotFormatOption::Png), + clip: None, + quality: None, + from_surface: Some(true), + capture_beyond_viewport: Some(false), + }).map_err(|e| e.to_string())?.data; + Ok::(data) + } { + Ok(data) => { + remove_highlight(&tab_lock.headless_tab).await.map_err(|e| e.to_string())?; + data + }, + Err(e) => { + remove_highlight(&tab_lock.headless_tab).await.map_err(|e| e.to_string())?; + return Err(e) + } + } + }; -async fn click_on_point(tab: &Arc, point: &Point) -> Result { - tab.click_point(point.clone()).map_err(|e| e.to_string())?; - tab.wait_until_navigated().map_err(|e| e.to_string())?; - Ok(format!("clicked on `{} {}`", point.x, point.y)) -} + let mut data = base64::prelude::BASE64_STANDARD + .decode(base64_data).map_err(|e| e.to_string())?; + let reader = ImageReader::with_format(Cursor::new(data), ImageFormat::Png); + let mut image = reader.decode().map_err(|e| e.to_string())?; + + let max_dimension = 800.0; + let scale_factor = max_dimension / std::cmp::max(image.width(), image.height()) as f32; + if scale_factor < 1.0 { + // NOTE: the tool operates on resized image well without a special model notification + let (nwidth, nheight) = (scale_factor * image.width() as f32, scale_factor * image.height() as f32); + image = image.resize(nwidth as u32, nheight as u32, FilterType::Lanczos3); + let mut interactive_element_map_scaled = vec![]; + for (label, x, y) in interactive_element_map { + let (scaled_x, scaled_y) = ((x as f32 * scale_factor) as i32, (y as f32 * scale_factor) as i32); + interactive_element_map_scaled.push((label, scaled_x, scaled_y)); + } + interactive_element_map = interactive_element_map_scaled; + // NOTE: we should store screenshot_scale_factor for every resized screenshot, not for a tab! + let mut tab_lock = tab.lock().await; + tab_lock.screenshot_scale_factor = scale_factor as f64; + } + + let mut interactive_element_map_visible = vec![]; + for (label, x, y) in interactive_element_map { + if x < image.width() as i32 && y < image.height() as i32 { + interactive_element_map_visible.push((label, x, y)); + } + } -async fn insert_text(tab: &Arc, text: &String) -> Result { - tab.type_str(text.as_str()).map_err(|e| e.to_string())?; - Ok(format!("inserted text `{}`", text.clone())) + let mut tool_log = vec![]; + if highlight && interactive_element_map_visible.len() > 0 { + tool_log.push("Clickable elements are highlighted with red rectangles and numbered labels at the top left.".to_string()); + tool_log.push("The interactive elements map to the rendered page:".to_string()); + for (label, x, y) in interactive_element_map_visible { + tool_log.push(format!("label `{}` center is ({}, {})", label, x, y)); + } + } + + data = Vec::new(); + image.write_to(&mut Cursor::new(&mut data), ImageFormat::Png).map_err(|e| e.to_string())?; + + let multimodal_el = MultimodalElement::new( + "image/png".to_string(), + base64::prelude::BASE64_STANDARD.encode(data) + ).map_err(|e| e.to_string())?; + + Ok((tool_log, multimodal_el)) } async fn session_open_tab( chrome_session: &mut ChromeSession, - tab_name: &String, -) -> Result<(Arc, String), String> { - match chrome_session.tabs.get(tab_name) { + tab_id: &String, + device: &DeviceType, +) -> Result { + match chrome_session.tabs.get(tab_id) { Some(tab) => { - Ok((tab.clone(), format!("Using opened tab {}\n", tab_name.clone()))) + let tab_lock = tab.lock().await; + Err(format!("Tab is already opened: {}\n", tab_lock.state_string())) }, None => { - let tab = chrome_session.browser.new_tab().map_err(|e| e.to_string())?; - chrome_session.tabs.insert(tab_name.clone(), tab.clone()); - Ok((tab, format!("Opened new tab {}\n", tab_name.clone()))) + let headless_tab = chrome_session.browser.new_tab().map_err(|e| e.to_string())?; + match device { + DeviceType::MOBILE => { + headless_tab.call_method(Emulation::SetDeviceMetricsOverride { + width: 375, + height: 812, + device_scale_factor: 0.0, + mobile: true, + scale: None, + screen_width: None, + screen_height: None, + position_x: None, + position_y: None, + dont_set_visible_size: None, + screen_orientation: None, + viewport: None, + display_feature: None, + }).map_err(|e| e.to_string())?; + }, + DeviceType::DESKTOP => { + headless_tab.call_method(Emulation::ClearDeviceMetricsOverride(None)).map_err(|e| e.to_string())?; + } + } + let tab = Arc::new(AMutex::new(ChromeTab::new(headless_tab, device, tab_id))); + let tab_lock = tab.lock().await; + let tab_log = Arc::clone(&tab_lock.tab_log); + tab_lock.headless_tab.enable_log().map_err(|e| e.to_string())?; + tab_lock.headless_tab.add_event_listener(Arc::new(move |event: &Event| { + if let Event::LogEntryAdded(e) = event { + let formatted_ts = { + let dt = DateTime::from_timestamp(e.params.entry.timestamp as i64, 0).unwrap(); + dt.format("%Y-%m-%d %H:%M:%S").to_string() + }; + let mut tab_log_lock = tab_log.lock().unwrap(); + tab_log_lock.push(format!("{} [{:?}]: {}", formatted_ts, e.params.entry.level, e.params.entry.text)); + if tab_log_lock.len() > MAX_CACHED_LOG_LINES { + tab_log_lock.remove(0); + } + } + })).map_err(|e| e.to_string())?; + chrome_session.tabs.insert(tab_id.clone(), tab.clone()); + Ok(format!("opened a new tab: {}\n", tab_lock.state_string())) } } } +async fn session_get_tab_arc( + chrome_session: &ChromeSession, + tab_id: &String, +) -> Result>, String> { + match chrome_session.tabs.get(tab_id) { + Some(tab) => Ok(tab.clone()), + None => Err(format!("tab_id {} is not opened", tab_id)), + } +} + #[derive(Debug)] enum Command { - // TODO: probably we need connect command - // if we're tying to operate on non-existing tab (no connection or something like this) - // we should not auto-open connection again + OpenTab(OpenTabArgs), NavigateTo(NavigateToArgs), Screenshot(ScreenshotArgs), Html(HtmlArgs), Reload(ReloadArgs), - Device(DeviceArgs), - Click(ClickArgs), - InsertText(InsertTextArgs), + ClickAt(ClickAtArgs), + TypeTextAt(TypeTextAtArgs), + PressKeyAt(PressKeyAtArgs), + TabLog(TabLogArgs), } -impl Command { - pub async fn execute( - &self, - chrome_session: &mut ChromeSession - ) -> Result<(Vec, Vec), String> { - let mut tool_log = vec![]; - let mut multimodal_els = vec![]; - - match self { - Command::NavigateTo(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let content = navigate_to(&tab, &args.uri).await.map_err( - |e| format!("Can't navigate_to `{}` on tab `{}`: {}. If you're trying to open a local file, add a file:// prefix.", args.uri, args.tab_id, e) - )?; - tool_log.push(content); - }, - Command::Screenshot(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let screenshot = screenshot_jpeg_base64(&tab, false).await?; - tool_log.push(format!("Made a screenshot of {}", tab.get_url())); - multimodal_els.push(screenshot); - }, - Command::Html(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let client = Client::builder() - .build() - .map_err(|e| e.to_string())?; - let url = tab.get_url(); - let response = client.get(url.clone()).send().await.map_err(|e| e.to_string())?; - if !response.status().is_success() { - tool_log.push(format!("Unable to fetch url: {}; status: {}", url, response.status())); - } else { - tool_log.push(response.text().await.map_err(|e| e.to_string())?); +async fn chrome_command_exec( + cmd: &Command, + chrome_session: Arc>>, +) -> Result<(Vec, Vec), String> { + let mut tool_log = vec![]; + let mut multimodal_els = vec![]; + + match cmd { + Command::OpenTab(args) => { + let log = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_open_tab(chrome_session, &args.tab_id, &args.device).await? + }; + tool_log.push(log); + }, + Command::NavigateTo(args) => { + let tab: Arc> = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = { + let tab_lock = tab.lock().await; + match { + tab_lock.headless_tab.navigate_to(args.uri.as_str()).map_err(|e| e.to_string())?; + tab_lock.headless_tab.wait_until_navigated().map_err(|e| e.to_string())?; + Ok::<(), String>(()) + } { + Ok(_) => { + format!("navigate_to successful: {}", tab_lock.state_string()) + }, + Err(e) => { + format!("navigate_to `{}` failed: {}. If you're trying to open a local file, add a file:// prefix.", args.uri, e.to_string()) + }, } - }, - Command::Reload(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - tab.reload(false, None).map_err(|e| e.to_string())?; - tool_log.push(format!("Page `{}` on tab `{}` reloaded", tab.get_url(), args.tab_id)); - }, - Command::Device(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - match args.device { - DeviceType::MOBILE => { - tab.call_method(Emulation::SetDeviceMetricsOverride { - width: 375, - height: 812, - device_scale_factor: 0.0, - mobile: true, - scale: None, - screen_width: None, - screen_height: None, - position_x: None, - position_y: None, - dont_set_visible_size: None, - screen_orientation: None, - viewport: None, - display_feature: None, - }).map_err(|e| e.to_string())?; - tool_log.push(format!("Tab `{}` set to mobile view", args.tab_id)); + }; + tool_log.push(log); + }, + Command::Screenshot(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let highlight = match args.mode { + ScreenshotMode::HIGHLIGHT => true, + _ => false, + }; + let log = { + // NOTE: this operation is not atomic, unfortunately + match capture_screenshot_base64(tab.clone(), highlight).await { + Ok((log, multimodal_el)) => { + multimodal_els.push(multimodal_el); + let tab_lock = tab.lock().await; + let log_str = log.join("\n"); + vec![log_str, format!("made a screenshot of {}", tab_lock.state_string())].join("\n\n") + }, + Err(e) => { + let tab_lock = tab.lock().await; + format!("screenshot failed for {}: {}", tab_lock.state_string(), e.to_string()) }, - DeviceType::DESKTOP => { - tab.call_method(Emulation::ClearDeviceMetricsOverride(None)).map_err(|e| e.to_string())?; - tool_log.push(format!("Tab `{}` set to desktop view", args.tab_id)); + } + }; + tool_log.push(log); + }, + Command::Html(args) => { + // NOTE: removed from commands list, please rewrite me... + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = { + let tab_lock = tab.lock().await; + let url = tab_lock.headless_tab.get_url(); + match { + let client = Client::builder() + .build() + .map_err(|e| e.to_string())?; + let response = client.get(url.clone()).send().await.map_err(|e| e.to_string())?; + if response.status().is_success() { + let html = response.text().await.map_err(|e| e.to_string())?; + Ok(html) + } else { + Err(format!("status: {}", response.status())) } + } { + Ok(html) => { + format!("innerHtml of {}:\n\n{}", tab_lock.state_string(), html) + }, + Err(e) => { + format!("can't fetch innerHtml of {}: {}", tab_lock.state_string(), e.to_string()) + }, } - }, - Command::Click(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let content = click_on_point(&tab, &args.point).await?; - tool_log.push(content); - }, - Command::InsertText(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let content = insert_text(&tab, &args.text).await?; - tool_log.push(content); - }, + }; + tool_log.push(log); + }, + Command::Reload(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = { + let tab_lock = tab.lock().await; + let chrome_tab = tab_lock.headless_tab.clone(); + match chrome_tab.reload(false, None) { + Ok(_) => { + format!("reload of {} successful", tab_lock.state_string()) + }, + Err(e) => { + format!("reload of {} failed: {}", tab_lock.state_string(), e.to_string()) + }, + } + }; + tool_log.push(log); + }, + Command::ClickAt(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = { + let tab_lock = tab.lock().await; + match { + let mapped_point = Point { + x: args.point.x / tab_lock.screenshot_scale_factor, + y: args.point.y / tab_lock.screenshot_scale_factor, + }; + tab_lock.headless_tab.click_point(mapped_point).map_err(|e| e.to_string())?; + tab_lock.headless_tab.wait_until_navigated().map_err(|e| e.to_string())?; + Ok::<(), String>(()) + } { + Ok(_) => { + format!("clicked `{} {}` at {}", args.point.x, args.point.y, tab_lock.state_string()) + }, + Err(e) => { + format!("clicked `{} {}` failed at {}: {}", args.point.x, args.point.y, tab_lock.state_string(), e.to_string()) + }, + } + }; + tool_log.push(log); + }, + Command::TypeTextAt(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = { + let tab_lock = tab.lock().await; + match tab_lock.headless_tab.type_str(args.text.as_str()) { + Ok(_) => { + format!("type `{}` at {}", args.text, tab_lock.state_string()) + }, + Err(e) => { + format!("type text failed at {}: {}", tab_lock.state_string(), e.to_string()) + }, + } + }; + tool_log.push(log); + }, + Command::PressKeyAt(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = { + let tab_lock = tab.lock().await; + match { + tab_lock.headless_tab.press_key(args.key.to_string().as_str()).map_err(|e| e.to_string())?; + tab_lock.headless_tab.wait_until_navigated().map_err(|e| e.to_string())?; + // TODO: sometimes page isn't ready for next step + sleep(Duration::from_secs(1)).await; + Ok::<(), String>(()) + } { + Ok(_) => { + format!("press `{}` at {}", args.key, tab_lock.state_string()) + }, + Err(e) => { + format!("press `{}` failed at {}: {}", args.key, tab_lock.state_string(), e.to_string()) + }, + } + }; + tool_log.push(log); + }, + Command::TabLog(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let tab_log = { + let tab_lock = tab.lock().await; + // NOTE: we're waiting for log to be collected for 3 seconds + sleep(Duration::from_secs(3)).await; + let mut tab_log_lock = tab_lock.tab_log.lock().unwrap(); + let tab_log = tab_log_lock.join("\n"); + tab_log_lock.clear(); + tab_log + }; + let filter = CmdlineOutputFilter::default(); + let filtered_log = output_mini_postprocessing(&filter, tab_log.as_str()); + tool_log.push(filtered_log.clone()); } - - Ok((tool_log, multimodal_els)) } + + Ok((tool_log, multimodal_els)) +} + +#[derive(Debug)] +struct OpenTabArgs { + device: DeviceType, + tab_id: String, } #[derive(Debug)] @@ -367,8 +675,15 @@ struct NavigateToArgs { tab_id: String, } +#[derive(Clone, Debug)] +enum ScreenshotMode { + PLAIN, + HIGHLIGHT, +} + #[derive(Debug)] struct ScreenshotArgs { + mode: ScreenshotMode, tab_id: String, } @@ -383,27 +698,49 @@ struct ReloadArgs { } #[derive(Debug)] -struct ClickArgs { +struct ClickAtArgs { point: Point, tab_id: String, } #[derive(Debug)] -struct InsertTextArgs { +struct TypeTextAtArgs { text: String, tab_id: String, } +#[derive(Clone, Debug)] +enum Key { + ENTER, + ESC, + PAGEUP, + PAGEDOWN, + HOME, + END, +} + +impl fmt::Display for Key { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Key::ENTER => write!(f, "Enter"), + Key::ESC => write!(f, "Escape"), + Key::PAGEUP => write!(f, "PageUp"), + Key::PAGEDOWN => write!(f, "PageDown"), + Key::HOME => write!(f, "Home"), + Key::END => write!(f, "End"), + } + } +} #[derive(Debug)] -enum DeviceType { - DESKTOP, - MOBILE, +struct PressKeyAtArgs { + key: Key, + tab_id: String, } #[derive(Debug)] -struct DeviceArgs { - device: DeviceType, +struct TabLogArgs { + // wait_secs: u32, tab_id: String, } @@ -416,6 +753,20 @@ fn parse_single_command(command: &String) -> Result { let (command_name, parsed_args) = (args[0].clone(), args[1..].to_vec()); match command_name.as_str() { + "open_tab" => { + if parsed_args.len() < 2 { + return Err(format!("`open_tab` requires 2 arguments: `` and `tab_id`. Provided: {:?}", parsed_args)); + } + let device = match parsed_args[0].as_str() { + "desktop" => DeviceType::DESKTOP, + "mobile" => DeviceType::MOBILE, + _ => return Err(format!("unknown device type: {}. Should be either `desktop` or `mobile`.", parsed_args[0])) + }; + Ok(Command::OpenTab(OpenTabArgs { + device, + tab_id: parsed_args[1].clone(), + })) + }, "navigate_to" => { if parsed_args.len() < 2 { return Err(format!("`navigate_to` requires 2 arguments: `uri` and `tab_id`. Provided: {:?}", parsed_args)); @@ -426,12 +777,22 @@ fn parse_single_command(command: &String) -> Result { })) }, "screenshot" => { - if parsed_args.len() < 1 { - return Err(format!("`screenshot` requires 1 argument: `tab_id`. Provided: {:?}", parsed_args)); + match parsed_args.as_slice() { + [mode_str, tab_id] => { + let mode = match mode_str.to_lowercase().as_str() { + "plain" => ScreenshotMode::PLAIN, + "highlight" => ScreenshotMode::HIGHLIGHT, + _ => return Err(format!("Unknown screenshot mode: {}.", mode_str)), + }; + Ok(Command::Screenshot(ScreenshotArgs { + mode: mode.clone(), + tab_id: tab_id.clone(), + })) + }, + _ => { + Err("Missing one or several arguments 'mode', 'tab_id'".to_string()) + } } - Ok(Command::Screenshot(ScreenshotArgs { - tab_id: parsed_args[0].clone(), - })) }, "html" => { if parsed_args.len() < 1 { @@ -449,26 +810,13 @@ fn parse_single_command(command: &String) -> Result { tab_id: parsed_args[0].clone(), })) }, - "device" => { - if parsed_args.len() < 2 { - return Err(format!("`device` requires 2 arguments: `desktop|mobile` and `tab_id`. Provided: {:?}", parsed_args)); - } - Ok(Command::Device(DeviceArgs { - device: match parsed_args[0].as_str() { - "desktop" => DeviceType::DESKTOP, - "mobile" => DeviceType::MOBILE, - _ => return Err(format!("Unknown device type: {}. Should be either `desktop` or `mobile`.", parsed_args[0])) - }, - tab_id: parsed_args[1].clone(), - })) - }, - "click" => { + "click_at" => { match parsed_args.as_slice() { [x_str, y_str, tab_id] => { let x = x_str.parse::().map_err(|e| format!("Failed to parse x: {}", e))?; let y = y_str.parse::().map_err(|e| format!("Failed to parse y: {}", e))?; let point = Point { x, y }; - Ok(Command::Click(ClickArgs { + Ok(Command::ClickAt(ClickAtArgs { point, tab_id: tab_id.clone(), })) @@ -478,10 +826,10 @@ fn parse_single_command(command: &String) -> Result { } } }, - "insert_text" => { + "type_text_at" => { match parsed_args.as_slice() { [text, tab_id] => { - Ok(Command::InsertText(InsertTextArgs { + Ok(Command::TypeTextAt(TypeTextAtArgs { text: text.clone(), tab_id: tab_id.clone(), })) @@ -491,46 +839,140 @@ fn parse_single_command(command: &String) -> Result { } } }, + "press_key_at" => { + match parsed_args.as_slice() { + [key_str, tab_id] => { + let key = match key_str.to_lowercase().as_str() { + "enter" => Key::ENTER, + "esc" => Key::ESC, + "pageup" => Key::PAGEUP, + "pagedown" => Key::PAGEDOWN, + "home" => Key::HOME, + "end" => Key::END, + _ => return Err(format!("Unknown key: {}", key_str)), + }; + Ok(Command::PressKeyAt(PressKeyAtArgs { + key, + tab_id: tab_id.clone(), + })) + }, + _ => { + Err("Missing one or several arguments 'key', 'tab_id'".to_string()) + } + } + }, + "tab_log" => { + match parsed_args.as_slice() { + [tab_id] => { + Ok(Command::TabLog(TabLogArgs { + tab_id: tab_id.clone(), + })) + }, + _ => { + Err("Missing one or several arguments 'tab_id'".to_string()) + } + } + }, _ => Err(format!("Unknown command: {:?}.", command_name)), } } -async fn interact_with_chrome( - gcx: Arc>, - chat_id: &String, - integration_chrome: &IntegrationChrome, - command: &Command, -) -> Result, String> { - let session_hashmap_key = get_session_hashmap_key("chrome", &chat_id); - let setup_log = setup_chrome_session(gcx.clone(), &integration_chrome, &session_hashmap_key).await?; - - let command_session = { - let gcx_locked = gcx.read().await; - gcx_locked.integration_sessions.get(&session_hashmap_key) - .ok_or(format!("Error getting chrome session for chat: {}", chat_id))? - .clone() - }; - let mut command_session_locked = command_session.lock().await; - let chrome_session = command_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; - - let (execute_log, mut multimodal_els) = command.execute(chrome_session).await?; +async fn highlight_elements(tab: &Arc) -> Result, String> { + // NOTE: for now there is the problem with input, no label for it unfortunately + let func = " + (function () { + const clickableElements = document.querySelectorAll( + 'a, button, details, embed, input, menu, menuitem, object, select, textarea, summary, [onclick], [role=\"button\"]' + ); + let results = []; + clickableElements.forEach(element => { + if (element) { + const rect = element.getBoundingClientRect(); + if (rect.left >= 0 && rect.top >= 0 && rect.width * rect.height > 0) { + element.style.outline = '2px solid red'; + element.setAttribute('browser-user-highlight-id', 'screenshot-highlight'); + + const label_text = (results.length + 1).toString(); + + const label = document.createElement('div'); + label.className = 'screenshot-highlight-label'; + label.style.background = 'red'; + label.style.color = 'white'; + label.style.padding = '2px 6px'; + label.style.borderRadius = '10px'; + label.style.fontSize = '12px'; + label.textContent = label_text; + + label.style.position = 'absolute'; + label.style.zIndex = '999999'; + label.style.top = '0'; + label.style.left = '0'; + + // Set the parent element's position to relative if not already set + if (getComputedStyle(element).position === 'static') { + element.style.position = 'relative'; // Establish a positioning context + } - let tool_log = setup_log.iter().chain(execute_log.iter()).map(|s| s.clone()).collect::>(); - multimodal_els.push(MultimodalElement::new( - "text".to_string(), tool_log.join("\n") - )?); + element.appendChild(label); - Ok(multimodal_els) + midpoint_x = rect.left + rect.width / 2; + midpoint_y = rect.top + rect.height / 2; + midpoint_text = `${label_text}: ${parseInt(midpoint_x)}, ${parseInt(midpoint_y)}`; + results.push(midpoint_text); + } + } + }); + return results; + })();"; + + let result = tab.evaluate(func, false).map_err(|e| e.to_string())?; + if let Some(preview) = result.preview { + let mut interactive_element_map = vec![]; + for pp in preview.properties { + if let Some(value) = pp.value.clone() { + let parts: Vec = value.to_string().split(':').map(|x| x.to_string()).collect(); + if parts.len() != 2 { + continue; + } + let label = parts[0].trim().to_string(); + let coords: Vec<&str> = parts[1].trim().split(',').collect(); + if coords.len() != 2 { + continue; + } + let (x, y) = match { + let x = coords[0].trim().parse::().map_err(|e| e.to_string())?; + let y = coords[1].trim().parse::().map_err(|e| e.to_string())?; + Ok::<(i32, i32), String>((x, y)) + } { + Ok((x, y)) => (x, y), + Err(_) => continue, + }; + interactive_element_map.push((label, x, y)); + } + } + return Ok(interactive_element_map); + } + if let Some(e) = result.description { + return Err(e); + } + Err("Unexpected error while highlighting clickable elements".to_string()) } -async fn screenshot_jpeg_base64(tab: &Arc, capture_beyond_viewport: bool) -> Result { - let jpeg_data = tab.call_method(Page::CaptureScreenshot { - format: Some(Page::CaptureScreenshotFormatOption::Jpeg), - clip: None, - quality: Some(75), - from_surface: Some(true), - capture_beyond_viewport: Some(capture_beyond_viewport), - }).map_err(|e| e.to_string())?.data; - - MultimodalElement::new("image/jpeg".to_string(), jpeg_data) +async fn remove_highlight(tab: &Arc) -> Result<(), String> { + let func = " + (function () { + const highlightedElements = document.querySelectorAll('[browser-user-highlight-id=\"screenshot-highlight\"]'); + highlightedElements.forEach(element => { + element.style.outline = ''; + element.removeAttribute('browser-user-highlight-id'); + }); + const labels = document.querySelectorAll('.screenshot-highlight-label'); + labels.forEach(label => label.remove()); + })();"; + + let result = tab.evaluate(func, false).map_err(|e| e.to_string())?; + if let Some(e) = result.description { + return Err(e); + } + Ok(()) } diff --git a/src/integrations/mod.rs b/src/integrations/mod.rs index d004f0247..8ac7254a2 100644 --- a/src/integrations/mod.rs +++ b/src/integrations/mod.rs @@ -54,7 +54,7 @@ chrome: # Or you can give it ws:// path, read more here https://developer.chrome.com/docs/devtools/remote-debugging/local-server/ # In that case start chrome with --remote-debugging-port chrome_path: "ws://127.0.0.1:6006/" - window_size: [1024, 768] + window_size: [800, 600] idle_browser_timeout: 600 diff --git a/src/tools/tool_patch_aux/tickets_parsing.rs b/src/tools/tool_patch_aux/tickets_parsing.rs index ac3271901..bdf596fcf 100644 --- a/src/tools/tool_patch_aux/tickets_parsing.rs +++ b/src/tools/tool_patch_aux/tickets_parsing.rs @@ -71,7 +71,7 @@ pub struct TicketToApply { } pub fn good_error_text(reason: &str, tickets: &Vec, resolution: Option) -> (String, Option) { - let mut text = format!("Couldn't create patch for tickets: '{}'.\nReason: {reason}", tickets.join(", ")); + let text = format!("Couldn't create patch for tickets: '{}'.\nReason: {reason}", tickets.join(", ")); if let Some(resolution) = resolution { let cd_format = format!("💿 {resolution}"); return (text, Some(cd_format)) diff --git a/src/tools/tools_description.rs b/src/tools/tools_description.rs index 772aee0a7..15e802049 100644 --- a/src/tools/tools_description.rs +++ b/src/tools/tools_description.rs @@ -80,7 +80,7 @@ pub async fn read_integrations_yaml(gcx: Arc>) -> Result< pub async fn tools_merged_and_filtered( gcx: Arc>, - supports_clicks: bool, + _supports_clicks: bool, ) -> Result>>>, String> { let (ast_on, vecdb_on, allow_experimental) = { let gcx_locked = gcx.read().await; @@ -133,7 +133,7 @@ pub async fn tools_merged_and_filtered( tools_all.insert("pdb".to_string(), Arc::new(AMutex::new(Box::new(ToolPdb::new_from_yaml(pdb_config)?) as Box))); } if let Some(chrome_config) = integrations_value.get("chrome") { - tools_all.insert("chrome".to_string(), Arc::new(AMutex::new(Box::new(ToolChrome::new_from_yaml(chrome_config, supports_clicks)?) as Box))); + tools_all.insert("chrome".to_string(), Arc::new(AMutex::new(Box::new(ToolChrome::new_from_yaml(chrome_config)?) as Box))); } if let Some(postgres_config) = integrations_value.get("postgres") { tools_all.insert("postgres".to_string(), Arc::new(AMutex::new(Box::new(ToolPostgres::new_from_yaml(postgres_config)?) as Box)));