From 256257c7055808c548c4f1ce2bfccbc2d79f267d Mon Sep 17 00:00:00 2001 From: mitya Date: Fri, 8 Nov 2024 14:15:25 +0300 Subject: [PATCH 01/18] new open tab command and chrome tool refactor --- src/integrations/integr_chrome.rs | 320 +++++++++++++++++------------- 1 file changed, 186 insertions(+), 134 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 4f4075169..074bd61a5 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -21,7 +21,7 @@ use headless_chrome::browser::tab::point::Point; use headless_chrome::protocol::cdp::Page; use headless_chrome::protocol::cdp::Emulation; use serde::{Deserialize, Serialize}; - +use std::fmt; #[derive(Clone, Serialize, Deserialize, Debug)] pub struct IntegrationChrome { @@ -39,9 +39,37 @@ pub struct ToolChrome { supports_clicks: bool, } +#[derive(Clone, Debug)] +enum DeviceType { + DESKTOP, + MOBILE, +} + +impl fmt::Display for DeviceType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + DeviceType::DESKTOP => write!(f, "desktop"), + DeviceType::MOBILE => write!(f, "mobile"), + } + } +} + +#[derive(Clone)] +pub struct ChromeTab { + instance: Arc, + device: DeviceType, + tab_id: String, +} + +impl ChromeTab { + pub fn state_string(&self) -> String { + format!("tab_id `{}` device `{}` uri `{}`", self.tab_id.clone(), self.device, self.instance.get_url()) + } +} + struct ChromeSession { browser: Browser, - tabs: HashMap>, + tabs: HashMap, } impl ChromeSession { @@ -140,15 +168,23 @@ impl Tool for ToolChrome { } fn tool_description(&self) -> ToolDesc { - let mut commands_desc = r#"One or several commands separated by newline. The is an integer, for example 10, for you to identify the tab later. Supported commands: -navigate_to -screenshot -html -reload -device "#.to_string(); + let mut supported_commands = vec![ + "open_tab ", + "navigate_to ", + "screenshot ", + "html ", + "reload ", + ]; if self.supports_clicks { - commands_desc = format!("{}\nclick \ninsert_text \n", commands_desc); + supported_commands.extend(vec![ + "click ", + "insert_text ", + ]); } + let description = format!( + "One or several commands separated by newline. \ + The is an integer, for example 10, for you to identify the tab later. \ + Supported commands:\n{}", supported_commands.join("\n")); ToolDesc { name: "chrome".to_string(), agentic: true, @@ -157,7 +193,7 @@ device "#.to_string(); parameters: vec![ToolParam { name: "commands".to_string(), param_type: "string".to_string(), - description: commands_desc, + description, }], parameters_required: vec!["commands".to_string()], } @@ -224,49 +260,103 @@ async fn setup_chrome_session( Ok(setup_log) } -async fn navigate_to(tab: &Arc, url: &String) -> Result { - tab.navigate_to(url.as_str()).map_err(|e| e.to_string())?; - tab.wait_until_navigated().map_err(|e| e.to_string())?; - Ok(format!("Chrome tab navigated to {}", tab.get_url())) +async fn navigate_to(instance: &Arc, url: &String) -> Result<(), String> { + instance.navigate_to(url.as_str()).map_err(|e| e.to_string())?; + instance.wait_until_navigated().map_err(|e| e.to_string())?; + Ok(()) } -async fn click_on_point(tab: &Arc, point: &Point) -> Result { - tab.click_point(point.clone()).map_err(|e| e.to_string())?; - tab.wait_until_navigated().map_err(|e| e.to_string())?; - Ok(format!("clicked on `{} {}`", point.x, point.y)) +async fn screenshot_jpeg_base64(instance: &Arc, capture_beyond_viewport: bool) -> Result { + let jpeg_data = instance.call_method(Page::CaptureScreenshot { + format: Some(Page::CaptureScreenshotFormatOption::Jpeg), + clip: None, + quality: Some(75), + from_surface: Some(true), + capture_beyond_viewport: Some(capture_beyond_viewport), + }).map_err(|e| e.to_string())?.data; + MultimodalElement::new("image/jpeg".to_string(), jpeg_data) +} + +async fn inner_html(url: String) -> Result { + let client = Client::builder() + .build() + .map_err(|e| e.to_string())?; + let response = client.get(url.clone()).send().await.map_err(|e| e.to_string())?; + if response.status().is_success() { + let html = response.text().await.map_err(|e| e.to_string())?; + Ok(html) + } else { + Err(format!("status: {}", response.status())) + } } -async fn insert_text(tab: &Arc, text: &String) -> Result { - tab.type_str(text.as_str()).map_err(|e| e.to_string())?; - Ok(format!("inserted text `{}`", text.clone())) +async fn click_on_point(instance: &Arc, point: &Point) -> Result<(), String> { + instance.click_point(point.clone()).map_err(|e| e.to_string())?; + instance.wait_until_navigated().map_err(|e| e.to_string())?; + Ok(()) } async fn session_open_tab( chrome_session: &mut ChromeSession, - tab_name: &String, -) -> Result<(Arc, String), String> { - match chrome_session.tabs.get(tab_name) { + tab_id: &String, + device: &DeviceType, +) -> Result { + match chrome_session.tabs.get(tab_id) { Some(tab) => { - Ok((tab.clone(), format!("Using opened tab {}\n", tab_name.clone()))) + Err(format!("Tab is already opened: {}\n", tab.state_string())) }, None => { - let tab = chrome_session.browser.new_tab().map_err(|e| e.to_string())?; - chrome_session.tabs.insert(tab_name.clone(), tab.clone()); - Ok((tab, format!("Opened new tab {}\n", tab_name.clone()))) + let instance = chrome_session.browser.new_tab().map_err(|e| e.to_string())?; + match device { + DeviceType::MOBILE => { + instance.call_method(Emulation::SetDeviceMetricsOverride { + width: 375, + height: 812, + device_scale_factor: 0.0, + mobile: true, + scale: None, + screen_width: None, + screen_height: None, + position_x: None, + position_y: None, + dont_set_visible_size: None, + screen_orientation: None, + viewport: None, + display_feature: None, + }).map_err(|e| e.to_string())?; + }, + DeviceType::DESKTOP => { + instance.call_method(Emulation::ClearDeviceMetricsOverride(None)).map_err(|e| e.to_string())?; + } + } + let tab = ChromeTab{ + instance, + tab_id: tab_id.clone(), + device: device.clone(), + }; + chrome_session.tabs.insert(tab_id.clone(), tab.clone()); + Ok(format!("opened a new tab: {}\n", tab.state_string())) } } } +async fn session_get_tab( + chrome_session: &mut ChromeSession, + tab_id: &String, +) -> Result { + match chrome_session.tabs.get(tab_id) { + Some(tab) => Ok(tab.clone()), + None => Err(format!("tab_id {} is not opened", tab_id)), + } +} + #[derive(Debug)] enum Command { - // TODO: probably we need connect command - // if we're tying to operate on non-existing tab (no connection or something like this) - // we should not auto-open connection again + OpenTab(OpenTabArgs), NavigateTo(NavigateToArgs), Screenshot(ScreenshotArgs), Html(HtmlArgs), Reload(ReloadArgs), - Device(DeviceArgs), Click(ClickArgs), InsertText(InsertTextArgs), } @@ -274,86 +364,66 @@ enum Command { impl Command { pub async fn execute( &self, - chrome_session: &mut ChromeSession + chrome_session: &mut ChromeSession, ) -> Result<(Vec, Vec), String> { let mut tool_log = vec![]; let mut multimodal_els = vec![]; match self { + Command::OpenTab(args) => { + let log = session_open_tab(chrome_session, &args.tab_id, &args.device).await?; + tool_log.push(log); + }, Command::NavigateTo(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let content = navigate_to(&tab, &args.uri).await.map_err( - |e| format!("Can't navigate_to `{}` on tab `{}`: {}. If you're trying to open a local file, add a file:// prefix.", args.uri, args.tab_id, e) - )?; - tool_log.push(content); + let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let log = match navigate_to(&tab.instance, &args.uri).await { + Ok(_) => format!("navigate_to successful: {}", tab.state_string()), + Err(e) => format!("navigate_to `{}` failed: {}. If you're trying to open a local file, add a file:// prefix.", args.uri, e.to_string()), + }; + tool_log.push(log); }, Command::Screenshot(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let screenshot = screenshot_jpeg_base64(&tab, false).await?; - tool_log.push(format!("Made a screenshot of {}", tab.get_url())); - multimodal_els.push(screenshot); + let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let log = match screenshot_jpeg_base64(&tab.instance, false).await { + Ok(multimodal_el) => { + multimodal_els.push(multimodal_el); + format!("made a screenshot of {}", tab.state_string()) + }, + Err(e) => format!("screenshot failed for {}: {}", tab.state_string(), e.to_string()), + }; + tool_log.push(log); }, Command::Html(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let client = Client::builder() - .build() - .map_err(|e| e.to_string())?; - let url = tab.get_url(); - let response = client.get(url.clone()).send().await.map_err(|e| e.to_string())?; - if !response.status().is_success() { - tool_log.push(format!("Unable to fetch url: {}; status: {}", url, response.status())); - } else { - tool_log.push(response.text().await.map_err(|e| e.to_string())?); - } + let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let log = match inner_html(tab.instance.get_url()).await { + Ok(html) => format!("innerHtml of {}:\n\n{}", tab.state_string(), html), + Err(e) => format!("can't fetch innerHtml of {}: {}", tab.state_string(), e.to_string()), + }; + tool_log.push(log); }, Command::Reload(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - tab.reload(false, None).map_err(|e| e.to_string())?; - tool_log.push(format!("Page `{}` on tab `{}` reloaded", tab.get_url(), args.tab_id)); - }, - Command::Device(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - match args.device { - DeviceType::MOBILE => { - tab.call_method(Emulation::SetDeviceMetricsOverride { - width: 375, - height: 812, - device_scale_factor: 0.0, - mobile: true, - scale: None, - screen_width: None, - screen_height: None, - position_x: None, - position_y: None, - dont_set_visible_size: None, - screen_orientation: None, - viewport: None, - display_feature: None, - }).map_err(|e| e.to_string())?; - tool_log.push(format!("Tab `{}` set to mobile view", args.tab_id)); - }, - DeviceType::DESKTOP => { - tab.call_method(Emulation::ClearDeviceMetricsOverride(None)).map_err(|e| e.to_string())?; - tool_log.push(format!("Tab `{}` set to desktop view", args.tab_id)); - } - } + let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let log = match tab.instance.reload(false, None) { + Ok(_) => format!("reload of {} successful", tab.state_string()), + Err(e) => format!("reload of {} failed: {}", tab.state_string(), e.to_string()), + }; + tool_log.push(log); }, Command::Click(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let content = click_on_point(&tab, &args.point).await?; - tool_log.push(content); + let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let log = match click_on_point(&tab.instance, &args.point).await { + Ok(_) => format!("clicked on `{} {}` at {}", args.point.x, args.point.y, tab.state_string()), + Err(e) => format!("clicked on `{} {}` failed at {}: {}", args.point.x, args.point.y, tab.state_string(), e.to_string()), + }; + tool_log.push(log); }, Command::InsertText(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let content = insert_text(&tab, &args.text).await?; - tool_log.push(content); + let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let log = match tab.instance.type_str(args.text.as_str()) { + Ok(_) => format!("insert_text `{}` to {}", args.text, tab.state_string()), + Err(e) => format!("insert_text failed to {}: {}", tab.state_string(), e.to_string()), + }; + tool_log.push(log); }, } @@ -361,6 +431,12 @@ impl Command { } } +#[derive(Debug)] +struct OpenTabArgs { + device: DeviceType, + tab_id: String, +} + #[derive(Debug)] struct NavigateToArgs { uri: String, @@ -394,19 +470,6 @@ struct InsertTextArgs { tab_id: String, } - -#[derive(Debug)] -enum DeviceType { - DESKTOP, - MOBILE, -} - -#[derive(Debug)] -struct DeviceArgs { - device: DeviceType, - tab_id: String, -} - fn parse_single_command(command: &String) -> Result { let args = shell_words::split(&command).map_err(|e| e.to_string())?; if args.is_empty() { @@ -416,6 +479,20 @@ fn parse_single_command(command: &String) -> Result { let (command_name, parsed_args) = (args[0].clone(), args[1..].to_vec()); match command_name.as_str() { + "open_tab" => { + if parsed_args.len() < 2 { + return Err(format!("`open_tab` requires 2 arguments: `` and `tab_id`. Provided: {:?}", parsed_args)); + } + let device = match parsed_args[0].as_str() { + "desktop" => DeviceType::DESKTOP, + "mobile" => DeviceType::MOBILE, + _ => return Err(format!("unknown device type: {}. Should be either `desktop` or `mobile`.", parsed_args[0])) + }; + Ok(Command::OpenTab(OpenTabArgs { + device, + tab_id: parsed_args[1].clone(), + })) + }, "navigate_to" => { if parsed_args.len() < 2 { return Err(format!("`navigate_to` requires 2 arguments: `uri` and `tab_id`. Provided: {:?}", parsed_args)); @@ -449,19 +526,6 @@ fn parse_single_command(command: &String) -> Result { tab_id: parsed_args[0].clone(), })) }, - "device" => { - if parsed_args.len() < 2 { - return Err(format!("`device` requires 2 arguments: `desktop|mobile` and `tab_id`. Provided: {:?}", parsed_args)); - } - Ok(Command::Device(DeviceArgs { - device: match parsed_args[0].as_str() { - "desktop" => DeviceType::DESKTOP, - "mobile" => DeviceType::MOBILE, - _ => return Err(format!("Unknown device type: {}. Should be either `desktop` or `mobile`.", parsed_args[0])) - }, - tab_id: parsed_args[1].clone(), - })) - }, "click" => { match parsed_args.as_slice() { [x_str, y_str, tab_id] => { @@ -522,15 +586,3 @@ async fn interact_with_chrome( Ok(multimodal_els) } - -async fn screenshot_jpeg_base64(tab: &Arc, capture_beyond_viewport: bool) -> Result { - let jpeg_data = tab.call_method(Page::CaptureScreenshot { - format: Some(Page::CaptureScreenshotFormatOption::Jpeg), - clip: None, - quality: Some(75), - from_surface: Some(true), - capture_beyond_viewport: Some(capture_beyond_viewport), - }).map_err(|e| e.to_string())?.data; - - MultimodalElement::new("image/jpeg".to_string(), jpeg_data) -} From 7d9afcfaa3eba62761a7382c1813089b0ad34d7d Mon Sep 17 00:00:00 2001 From: mitya Date: Fri, 8 Nov 2024 14:33:46 +0300 Subject: [PATCH 02/18] chrome suppress command logs into one text element --- src/integrations/integr_chrome.rs | 70 +++++++++++-------------------- 1 file changed, 25 insertions(+), 45 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 074bd61a5..7052079a4 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -125,37 +125,45 @@ impl Tool for ToolChrome { None => return Err("Missing argument `commands`".to_string()) }; - let mut content = vec![]; + let session_hashmap_key = get_session_hashmap_key("chrome", &chat_id); + let mut tool_log = setup_chrome_session(gcx.clone(), &self.integration_chrome, &session_hashmap_key).await?; + + let command_session = { + let gcx_locked = gcx.read().await; + gcx_locked.integration_sessions.get(&session_hashmap_key) + .ok_or(format!("Error getting chrome session for chat: {}", chat_id))? + .clone() + }; + let mut command_session_locked = command_session.lock().await; + let chrome_session = command_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + + let mut mutlimodal_els = vec![]; for command in commands_str.lines().map(|s| s.trim()).collect::>() { let parsed_command = match parse_single_command(&command.to_string()) { Ok(command) => command, Err(e) => { - content.push(MultimodalElement::new( - "text".to_string(), - format!("Failed to parse command: {}. Error: {}.", command, e) - )?); + tool_log.push(format!("failed to parse command `{}`: {}.", command, e)); break } }; - match interact_with_chrome( - gcx.clone(), - &chat_id, - &self.integration_chrome, - &parsed_command, - ).await { - Ok(command_content) => { - content.extend(command_content); + match parsed_command.execute(chrome_session).await { + Ok((execute_log, command_multimodal_els)) => { + tool_log.extend(execute_log); + mutlimodal_els.extend(command_multimodal_els); }, Err(e) => { - content.push(MultimodalElement::new( - "text".to_string(), - format!("Failed to execute command: {}. Error: {}.", command, e) - )?); + tool_log.push(format!("failed to execute command `{}`: {}.", command, e)); break } }; } + let mut content= vec![]; + content.push(MultimodalElement::new( + "text".to_string(), tool_log.join("\n") + )?); + content.extend(mutlimodal_els); + let msg = ContextEnum::ChatMessage(ChatMessage { role: "tool".to_string(), content: ChatContent::Multimodal(content), @@ -558,31 +566,3 @@ fn parse_single_command(command: &String) -> Result { _ => Err(format!("Unknown command: {:?}.", command_name)), } } - -async fn interact_with_chrome( - gcx: Arc>, - chat_id: &String, - integration_chrome: &IntegrationChrome, - command: &Command, -) -> Result, String> { - let session_hashmap_key = get_session_hashmap_key("chrome", &chat_id); - let setup_log = setup_chrome_session(gcx.clone(), &integration_chrome, &session_hashmap_key).await?; - - let command_session = { - let gcx_locked = gcx.read().await; - gcx_locked.integration_sessions.get(&session_hashmap_key) - .ok_or(format!("Error getting chrome session for chat: {}", chat_id))? - .clone() - }; - let mut command_session_locked = command_session.lock().await; - let chrome_session = command_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; - - let (execute_log, mut multimodal_els) = command.execute(chrome_session).await?; - - let tool_log = setup_log.iter().chain(execute_log.iter()).map(|s| s.clone()).collect::>(); - multimodal_els.push(MultimodalElement::new( - "text".to_string(), tool_log.join("\n") - )?); - - Ok(multimodal_els) -} From 5f9fb5116e07859d53447d11f6c7015bcb260792 Mon Sep 17 00:00:00 2001 From: mitya Date: Fri, 8 Nov 2024 17:25:46 +0300 Subject: [PATCH 03/18] resize of screenshot --- src/integrations/integr_chrome.rs | 85 ++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 7052079a4..9316391c6 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -23,6 +23,11 @@ use headless_chrome::protocol::cdp::Emulation; use serde::{Deserialize, Serialize}; use std::fmt; +use base64::Engine; +use std::io::Cursor; +use image::imageops::FilterType; +use image::{ImageFormat, ImageReader}; + #[derive(Clone, Serialize, Deserialize, Debug)] pub struct IntegrationChrome { pub chrome_path: Option, @@ -59,9 +64,18 @@ pub struct ChromeTab { instance: Arc, device: DeviceType, tab_id: String, + screenshot_scale_factor: f64, } impl ChromeTab { + fn new(instance: Arc, device: &DeviceType, tab_id: &String) -> Self { + Self { + instance, + device: device.clone(), + tab_id: tab_id.clone(), + screenshot_scale_factor: 1.0, + } + } pub fn state_string(&self) -> String { format!("tab_id `{}` device `{}` uri `{}`", self.tab_id.clone(), self.device, self.instance.get_url()) } @@ -274,15 +288,36 @@ async fn navigate_to(instance: &Arc, url: &String) -> Result<(), String> { Ok(()) } -async fn screenshot_jpeg_base64(instance: &Arc, capture_beyond_viewport: bool) -> Result { - let jpeg_data = instance.call_method(Page::CaptureScreenshot { +async fn screenshot_jpeg_base64( + tab: &mut ChromeTab, + capture_beyond_viewport: bool, +) -> Result { + let jpeg_base64_data = tab.instance.call_method(Page::CaptureScreenshot { format: Some(Page::CaptureScreenshotFormatOption::Jpeg), clip: None, quality: Some(75), from_surface: Some(true), capture_beyond_viewport: Some(capture_beyond_viewport), }).map_err(|e| e.to_string())?.data; - MultimodalElement::new("image/jpeg".to_string(), jpeg_data) + + let mut data = base64::prelude::BASE64_STANDARD + .decode(jpeg_base64_data).map_err(|e| e.to_string())?; + let reader = ImageReader::with_format(Cursor::new(data), ImageFormat::Jpeg); + let mut image = reader.decode().map_err(|e| e.to_string())?; + + let max_dimension = 800.0; + let scale_factor = max_dimension / std::cmp::max(image.width(), image.height()) as f32; + if scale_factor < 1.0 { + // NOTE: the tool operates on resized image well without a special model notification + let (nwidth, nheight) = (scale_factor * image.width() as f32, scale_factor * image.height() as f32); + image = image.resize(nwidth as u32, nheight as u32, FilterType::Lanczos3); + tab.screenshot_scale_factor = scale_factor as f64; + } + + data = Vec::new(); + image.write_to(&mut Cursor::new(&mut data), ImageFormat::Jpeg).map_err(|e| e.to_string())?; + + MultimodalElement::new("image/jpeg".to_string(), base64::prelude::BASE64_STANDARD.encode(data)) } async fn inner_html(url: String) -> Result { @@ -298,9 +333,13 @@ async fn inner_html(url: String) -> Result { } } -async fn click_on_point(instance: &Arc, point: &Point) -> Result<(), String> { - instance.click_point(point.clone()).map_err(|e| e.to_string())?; - instance.wait_until_navigated().map_err(|e| e.to_string())?; +async fn click_on_point(tab: &ChromeTab, point: &Point) -> Result<(), String> { + let mapped_point = Point { + x: point.x / tab.screenshot_scale_factor, + y: point.y / tab.screenshot_scale_factor, + }; + tab.instance.click_point(mapped_point).map_err(|e| e.to_string())?; + tab.instance.wait_until_navigated().map_err(|e| e.to_string())?; Ok(()) } @@ -337,23 +376,19 @@ async fn session_open_tab( instance.call_method(Emulation::ClearDeviceMetricsOverride(None)).map_err(|e| e.to_string())?; } } - let tab = ChromeTab{ - instance, - tab_id: tab_id.clone(), - device: device.clone(), - }; - chrome_session.tabs.insert(tab_id.clone(), tab.clone()); + let tab = ChromeTab::new(instance, device, tab_id); + chrome_session.tabs.insert(tab.tab_id.clone(), tab.clone()); Ok(format!("opened a new tab: {}\n", tab.state_string())) } } } -async fn session_get_tab( - chrome_session: &mut ChromeSession, +async fn session_get_tab_mut<'a>( + chrome_session: &'a mut ChromeSession, tab_id: &String, -) -> Result { - match chrome_session.tabs.get(tab_id) { - Some(tab) => Ok(tab.clone()), +) -> Result<&'a mut ChromeTab, String> { + match chrome_session.tabs.get_mut(tab_id) { + Some(tab) => Ok(tab), None => Err(format!("tab_id {} is not opened", tab_id)), } } @@ -383,7 +418,7 @@ impl Command { tool_log.push(log); }, Command::NavigateTo(args) => { - let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; let log = match navigate_to(&tab.instance, &args.uri).await { Ok(_) => format!("navigate_to successful: {}", tab.state_string()), Err(e) => format!("navigate_to `{}` failed: {}. If you're trying to open a local file, add a file:// prefix.", args.uri, e.to_string()), @@ -391,8 +426,8 @@ impl Command { tool_log.push(log); }, Command::Screenshot(args) => { - let tab = session_get_tab(chrome_session, &args.tab_id).await?; - let log = match screenshot_jpeg_base64(&tab.instance, false).await { + let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; + let log = match screenshot_jpeg_base64(tab, false).await { Ok(multimodal_el) => { multimodal_els.push(multimodal_el); format!("made a screenshot of {}", tab.state_string()) @@ -402,7 +437,7 @@ impl Command { tool_log.push(log); }, Command::Html(args) => { - let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; let log = match inner_html(tab.instance.get_url()).await { Ok(html) => format!("innerHtml of {}:\n\n{}", tab.state_string(), html), Err(e) => format!("can't fetch innerHtml of {}: {}", tab.state_string(), e.to_string()), @@ -410,7 +445,7 @@ impl Command { tool_log.push(log); }, Command::Reload(args) => { - let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; let log = match tab.instance.reload(false, None) { Ok(_) => format!("reload of {} successful", tab.state_string()), Err(e) => format!("reload of {} failed: {}", tab.state_string(), e.to_string()), @@ -418,15 +453,15 @@ impl Command { tool_log.push(log); }, Command::Click(args) => { - let tab = session_get_tab(chrome_session, &args.tab_id).await?; - let log = match click_on_point(&tab.instance, &args.point).await { + let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; + let log = match click_on_point(&tab, &args.point).await { Ok(_) => format!("clicked on `{} {}` at {}", args.point.x, args.point.y, tab.state_string()), Err(e) => format!("clicked on `{} {}` failed at {}: {}", args.point.x, args.point.y, tab.state_string(), e.to_string()), }; tool_log.push(log); }, Command::InsertText(args) => { - let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; let log = match tab.instance.type_str(args.text.as_str()) { Ok(_) => format!("insert_text `{}` to {}", args.text, tab.state_string()), Err(e) => format!("insert_text failed to {}: {}", tab.state_string(), e.to_string()), From 6f0b68474e57a84ddabc2e763d9ead9f2dfe1a14 Mon Sep 17 00:00:00 2001 From: mitya Date: Fri, 8 Nov 2024 17:42:16 +0300 Subject: [PATCH 04/18] click -> click_at, insert_text -> insert_text_at --- src/integrations/integr_chrome.rs | 36 +++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 9316391c6..ee4149de3 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -199,8 +199,8 @@ impl Tool for ToolChrome { ]; if self.supports_clicks { supported_commands.extend(vec![ - "click ", - "insert_text ", + "click_at ", + "type_text_at ", ]); } let description = format!( @@ -333,7 +333,7 @@ async fn inner_html(url: String) -> Result { } } -async fn click_on_point(tab: &ChromeTab, point: &Point) -> Result<(), String> { +async fn click_point(tab: &ChromeTab, point: &Point) -> Result<(), String> { let mapped_point = Point { x: point.x / tab.screenshot_scale_factor, y: point.y / tab.screenshot_scale_factor, @@ -400,8 +400,8 @@ enum Command { Screenshot(ScreenshotArgs), Html(HtmlArgs), Reload(ReloadArgs), - Click(ClickArgs), - InsertText(InsertTextArgs), + ClickAt(ClickAtArgs), + TypeTextAt(TypeTextAtArgs), } impl Command { @@ -452,19 +452,19 @@ impl Command { }; tool_log.push(log); }, - Command::Click(args) => { + Command::ClickAt(args) => { let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; - let log = match click_on_point(&tab, &args.point).await { - Ok(_) => format!("clicked on `{} {}` at {}", args.point.x, args.point.y, tab.state_string()), - Err(e) => format!("clicked on `{} {}` failed at {}: {}", args.point.x, args.point.y, tab.state_string(), e.to_string()), + let log = match click_point(&tab, &args.point).await { + Ok(_) => format!("clicked `{} {}` at {}", args.point.x, args.point.y, tab.state_string()), + Err(e) => format!("clicked `{} {}` failed at {}: {}", args.point.x, args.point.y, tab.state_string(), e.to_string()), }; tool_log.push(log); }, - Command::InsertText(args) => { + Command::TypeTextAt(args) => { let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; let log = match tab.instance.type_str(args.text.as_str()) { - Ok(_) => format!("insert_text `{}` to {}", args.text, tab.state_string()), - Err(e) => format!("insert_text failed to {}: {}", tab.state_string(), e.to_string()), + Ok(_) => format!("type `{}` at {}", args.text, tab.state_string()), + Err(e) => format!("type text failed at {}: {}", tab.state_string(), e.to_string()), }; tool_log.push(log); }, @@ -502,13 +502,13 @@ struct ReloadArgs { } #[derive(Debug)] -struct ClickArgs { +struct ClickAtArgs { point: Point, tab_id: String, } #[derive(Debug)] -struct InsertTextArgs { +struct TypeTextAtArgs { text: String, tab_id: String, } @@ -569,13 +569,13 @@ fn parse_single_command(command: &String) -> Result { tab_id: parsed_args[0].clone(), })) }, - "click" => { + "click_at" => { match parsed_args.as_slice() { [x_str, y_str, tab_id] => { let x = x_str.parse::().map_err(|e| format!("Failed to parse x: {}", e))?; let y = y_str.parse::().map_err(|e| format!("Failed to parse y: {}", e))?; let point = Point { x, y }; - Ok(Command::Click(ClickArgs { + Ok(Command::ClickAt(ClickAtArgs { point, tab_id: tab_id.clone(), })) @@ -585,10 +585,10 @@ fn parse_single_command(command: &String) -> Result { } } }, - "insert_text" => { + "type_text_at" => { match parsed_args.as_slice() { [text, tab_id] => { - Ok(Command::InsertText(InsertTextArgs { + Ok(Command::TypeTextAt(TypeTextAtArgs { text: text.clone(), tab_id: tab_id.clone(), })) From 67edd79a1fcf8d5d43b626bacf321eff8657bb86 Mon Sep 17 00:00:00 2001 From: mitya Date: Fri, 8 Nov 2024 18:16:03 +0300 Subject: [PATCH 05/18] press_key_at command for chrome --- src/integrations/integr_chrome.rs | 55 +++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index ee4149de3..df4bb1187 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -201,6 +201,7 @@ impl Tool for ToolChrome { supported_commands.extend(vec![ "click_at ", "type_text_at ", + "press_key_at ", ]); } let description = format!( @@ -402,6 +403,7 @@ enum Command { Reload(ReloadArgs), ClickAt(ClickAtArgs), TypeTextAt(TypeTextAtArgs), + PressKeyAt(PressKeyAtArgs), } impl Command { @@ -468,6 +470,14 @@ impl Command { }; tool_log.push(log); }, + Command::PressKeyAt(args) => { + let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; + let log = match tab.instance.press_key(args.key.to_string().as_str()) { + Ok(_) => format!("press `{}` at {}", args.key, tab.state_string()), + Err(e) => format!("press `{}` failed at {}: {}", args.key, tab.state_string(), e.to_string()), + }; + tool_log.push(log); + } } Ok((tool_log, multimodal_els)) @@ -513,6 +523,31 @@ struct TypeTextAtArgs { tab_id: String, } +#[derive(Clone, Debug)] +enum Key { + ENTER, + ESC, + PAGEUP, + PAGEDOWN, +} + +impl fmt::Display for Key { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Key::ENTER => write!(f, "Enter"), + Key::ESC => write!(f, "Escape"), + Key::PAGEUP => write!(f, "PageUp"), + Key::PAGEDOWN => write!(f, "PageDown"), + } + } +} + +#[derive(Debug)] +struct PressKeyAtArgs { + key: Key, + tab_id: String, +} + fn parse_single_command(command: &String) -> Result { let args = shell_words::split(&command).map_err(|e| e.to_string())?; if args.is_empty() { @@ -598,6 +633,26 @@ fn parse_single_command(command: &String) -> Result { } } }, + "press_key_at" => { + match parsed_args.as_slice() { + [key_str, tab_id] => { + let key = match key_str.to_lowercase().as_str() { + "enter" => Key::ENTER, + "esc" => Key::ESC, + "pageup" => Key::PAGEUP, + "pagedown" => Key::PAGEDOWN, + _ => return Err(format!("Unknown key: {}", key_str)), + }; + Ok(Command::PressKeyAt(PressKeyAtArgs { + key, + tab_id: tab_id.clone(), + })) + }, + _ => { + Err("Missing one or several arguments 'key', 'tab_id'".to_string()) + } + } + }, _ => Err(format!("Unknown command: {:?}.", command_name)), } } From c9a5bbedb125c97943ecd758ac698cc71852e5ce Mon Sep 17 00:00:00 2001 From: Oleg Klimov Date: Tue, 12 Nov 2024 09:33:39 +0100 Subject: [PATCH 06/18] multithread fix 1 --- src/integrations/integr_chrome.rs | 299 +++++++++++++++++++----------- 1 file changed, 192 insertions(+), 107 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index df4bb1187..87cb75c60 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -16,7 +16,7 @@ use crate::tools::tools_description::{Tool, ToolDesc, ToolParam}; use reqwest::Client; use std::path::PathBuf; -use headless_chrome::{Browser, LaunchOptions, Tab}; +use headless_chrome::{Browser, LaunchOptions, Tab as HeadlessTab}; use headless_chrome::browser::tab::point::Point; use headless_chrome::protocol::cdp::Page; use headless_chrome::protocol::cdp::Emulation; @@ -61,29 +61,29 @@ impl fmt::Display for DeviceType { #[derive(Clone)] pub struct ChromeTab { - instance: Arc, + headless_tab: Arc, device: DeviceType, tab_id: String, screenshot_scale_factor: f64, } impl ChromeTab { - fn new(instance: Arc, device: &DeviceType, tab_id: &String) -> Self { + fn new(headless_tab: Arc, device: &DeviceType, tab_id: &String) -> Self { Self { - instance, + headless_tab, device: device.clone(), tab_id: tab_id.clone(), screenshot_scale_factor: 1.0, } } pub fn state_string(&self) -> String { - format!("tab_id `{}` device `{}` uri `{}`", self.tab_id.clone(), self.device, self.instance.get_url()) + format!("tab_id `{}` device `{}` uri `{}`", self.tab_id.clone(), self.device, self.headless_tab.get_url()) } } struct ChromeSession { browser: Browser, - tabs: HashMap, + tabs: HashMap>>, } impl ChromeSession { @@ -148,8 +148,6 @@ impl Tool for ToolChrome { .ok_or(format!("Error getting chrome session for chat: {}", chat_id))? .clone() }; - let mut command_session_locked = command_session.lock().await; - let chrome_session = command_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; let mut mutlimodal_els = vec![]; for command in commands_str.lines().map(|s| s.trim()).collect::>() { @@ -160,7 +158,7 @@ impl Tool for ToolChrome { break } }; - match parsed_command.execute(chrome_session).await { + match chrome_command_exec(&parsed_command, command_session.clone()).await { Ok((execute_log, command_multimodal_els)) => { tool_log.extend(execute_log); mutlimodal_els.extend(command_multimodal_els); @@ -283,17 +281,26 @@ async fn setup_chrome_session( Ok(setup_log) } -async fn navigate_to(instance: &Arc, url: &String) -> Result<(), String> { - instance.navigate_to(url.as_str()).map_err(|e| e.to_string())?; - instance.wait_until_navigated().map_err(|e| e.to_string())?; +async fn navigate_to(tab: Arc>, url: &String) -> Result<(), String> { + let tab_instance = { + let tab_lock = tab.lock().await; + tab_lock.headless_tab.clone() + }; + tab_instance.navigate_to(url.as_str()).map_err(|e| e.to_string())?; + tab_instance.wait_until_navigated().map_err(|e| e.to_string())?; Ok(()) } async fn screenshot_jpeg_base64( - tab: &mut ChromeTab, + tab: Arc>, capture_beyond_viewport: bool, ) -> Result { - let jpeg_base64_data = tab.instance.call_method(Page::CaptureScreenshot { + let chrome_tab = { + let tab_lock = tab.lock().await; + tab_lock.headless_tab.clone() + }; + + let jpeg_base64_data = chrome_tab.call_method(Page::CaptureScreenshot { format: Some(Page::CaptureScreenshotFormatOption::Jpeg), clip: None, quality: Some(75), @@ -312,7 +319,8 @@ async fn screenshot_jpeg_base64( // NOTE: the tool operates on resized image well without a special model notification let (nwidth, nheight) = (scale_factor * image.width() as f32, scale_factor * image.height() as f32); image = image.resize(nwidth as u32, nheight as u32, FilterType::Lanczos3); - tab.screenshot_scale_factor = scale_factor as f64; + let mut tab_lock = tab.lock().await; + tab_lock.screenshot_scale_factor = scale_factor as f64; } data = Vec::new(); @@ -334,13 +342,18 @@ async fn inner_html(url: String) -> Result { } } -async fn click_point(tab: &ChromeTab, point: &Point) -> Result<(), String> { - let mapped_point = Point { - x: point.x / tab.screenshot_scale_factor, - y: point.y / tab.screenshot_scale_factor, +async fn click_point(tab: Arc>, point: &Point) -> Result<(), String> { + let (mapped_point, headless_tab) = { + let tab_lock = tab.lock().await; + let mapped_point = Point { + x: point.x / tab_lock.screenshot_scale_factor, + y: point.y / tab_lock.screenshot_scale_factor, + }; + let headless_tab = tab_lock.headless_tab.clone(); + (mapped_point, headless_tab) }; - tab.instance.click_point(mapped_point).map_err(|e| e.to_string())?; - tab.instance.wait_until_navigated().map_err(|e| e.to_string())?; + headless_tab.click_point(mapped_point).map_err(|e| e.to_string())?; + headless_tab.wait_until_navigated().map_err(|e| e.to_string())?; Ok(()) } @@ -351,13 +364,14 @@ async fn session_open_tab( ) -> Result { match chrome_session.tabs.get(tab_id) { Some(tab) => { - Err(format!("Tab is already opened: {}\n", tab.state_string())) + let tab_lock = tab.lock().await; + Err(format!("Tab is already opened: {}\n", tab_lock.state_string())) }, None => { - let instance = chrome_session.browser.new_tab().map_err(|e| e.to_string())?; + let headless_tab = chrome_session.browser.new_tab().map_err(|e| e.to_string())?; match device { DeviceType::MOBILE => { - instance.call_method(Emulation::SetDeviceMetricsOverride { + headless_tab.call_method(Emulation::SetDeviceMetricsOverride { width: 375, height: 812, device_scale_factor: 0.0, @@ -374,22 +388,23 @@ async fn session_open_tab( }).map_err(|e| e.to_string())?; }, DeviceType::DESKTOP => { - instance.call_method(Emulation::ClearDeviceMetricsOverride(None)).map_err(|e| e.to_string())?; + headless_tab.call_method(Emulation::ClearDeviceMetricsOverride(None)).map_err(|e| e.to_string())?; } } - let tab = ChromeTab::new(instance, device, tab_id); - chrome_session.tabs.insert(tab.tab_id.clone(), tab.clone()); - Ok(format!("opened a new tab: {}\n", tab.state_string())) + let tab = Arc::new(AMutex::new(ChromeTab::new(headless_tab, device, tab_id))); + let tab_lock = tab.lock().await; + chrome_session.tabs.insert(tab_id.clone(), tab.clone()); + Ok(format!("opened a new tab: {}\n", tab_lock.state_string())) } } } -async fn session_get_tab_mut<'a>( - chrome_session: &'a mut ChromeSession, +async fn session_get_tab_arc( + chrome_session: &ChromeSession, tab_id: &String, -) -> Result<&'a mut ChromeTab, String> { - match chrome_session.tabs.get_mut(tab_id) { - Some(tab) => Ok(tab), +) -> Result>, String> { + match chrome_session.tabs.get(tab_id) { + Some(tab) => Ok(tab.clone()), None => Err(format!("tab_id {} is not opened", tab_id)), } } @@ -406,82 +421,152 @@ enum Command { PressKeyAt(PressKeyAtArgs), } -impl Command { - pub async fn execute( - &self, - chrome_session: &mut ChromeSession, - ) -> Result<(Vec, Vec), String> { - let mut tool_log = vec![]; - let mut multimodal_els = vec![]; - - match self { - Command::OpenTab(args) => { - let log = session_open_tab(chrome_session, &args.tab_id, &args.device).await?; - tool_log.push(log); - }, - Command::NavigateTo(args) => { - let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; - let log = match navigate_to(&tab.instance, &args.uri).await { - Ok(_) => format!("navigate_to successful: {}", tab.state_string()), - Err(e) => format!("navigate_to `{}` failed: {}. If you're trying to open a local file, add a file:// prefix.", args.uri, e.to_string()), - }; - tool_log.push(log); - }, - Command::Screenshot(args) => { - let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; - let log = match screenshot_jpeg_base64(tab, false).await { - Ok(multimodal_el) => { - multimodal_els.push(multimodal_el); - format!("made a screenshot of {}", tab.state_string()) - }, - Err(e) => format!("screenshot failed for {}: {}", tab.state_string(), e.to_string()), - }; - tool_log.push(log); - }, - Command::Html(args) => { - let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; - let log = match inner_html(tab.instance.get_url()).await { - Ok(html) => format!("innerHtml of {}:\n\n{}", tab.state_string(), html), - Err(e) => format!("can't fetch innerHtml of {}: {}", tab.state_string(), e.to_string()), - }; - tool_log.push(log); - }, - Command::Reload(args) => { - let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; - let log = match tab.instance.reload(false, None) { - Ok(_) => format!("reload of {} successful", tab.state_string()), - Err(e) => format!("reload of {} failed: {}", tab.state_string(), e.to_string()), - }; - tool_log.push(log); - }, - Command::ClickAt(args) => { - let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; - let log = match click_point(&tab, &args.point).await { - Ok(_) => format!("clicked `{} {}` at {}", args.point.x, args.point.y, tab.state_string()), - Err(e) => format!("clicked `{} {}` failed at {}: {}", args.point.x, args.point.y, tab.state_string(), e.to_string()), - }; - tool_log.push(log); - }, - Command::TypeTextAt(args) => { - let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; - let log = match tab.instance.type_str(args.text.as_str()) { - Ok(_) => format!("type `{}` at {}", args.text, tab.state_string()), - Err(e) => format!("type text failed at {}: {}", tab.state_string(), e.to_string()), - }; - tool_log.push(log); - }, - Command::PressKeyAt(args) => { - let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; - let log = match tab.instance.press_key(args.key.to_string().as_str()) { - Ok(_) => format!("press `{}` at {}", args.key, tab.state_string()), - Err(e) => format!("press `{}` failed at {}: {}", args.key, tab.state_string(), e.to_string()), - }; - tool_log.push(log); - } +async fn chrome_command_exec( + cmd: &Command, + chrome_session: Arc>>, +) -> Result<(Vec, Vec), String> { + let mut tool_log = vec![]; + let mut multimodal_els = vec![]; + + match cmd { + Command::OpenTab(args) => { + let log = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_open_tab(chrome_session, &args.tab_id, &args.device).await? + }; + tool_log.push(log); + }, + Command::NavigateTo(args) => { + let tab: Arc> = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = match navigate_to(tab.clone(), &args.uri).await { + Ok(_) => { + let tab_lock = tab.lock().await; + format!("navigate_to successful: {}", tab_lock.state_string()) + }, + Err(e) => { + // let tab_lock = tab.lock().await; + format!("navigate_to `{}` failed: {}. If you're trying to open a local file, add a file:// prefix.", args.uri, e.to_string()) + }, + }; + tool_log.push(log); + }, + Command::Screenshot(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = match screenshot_jpeg_base64(tab.clone(), false).await { + Ok(multimodal_el) => { + multimodal_els.push(multimodal_el); + let tab_lock = tab.lock().await; + format!("made a screenshot of {}", tab_lock.state_string()) + }, + Err(e) => { + let tab_lock = tab.lock().await; + format!("screenshot failed for {}: {}", tab_lock.state_string(), e.to_string()) + }, + }; + tool_log.push(log); + }, + Command::Html(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = match inner_html(tab.lock().await.headless_tab.get_url()).await { + Ok(html) => { + let tab_lock = tab.lock().await; + format!("innerHtml of {}:\n\n{}", tab_lock.state_string(), html) + }, + Err(e) => { + let tab_lock = tab.lock().await; + format!("can't fetch innerHtml of {}: {}", tab_lock.state_string(), e.to_string()) + }, + }; + tool_log.push(log); + }, + Command::Reload(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = match tab.lock().await.headless_tab.reload(false, None) { + Ok(_) => { + let tab_lock = tab.lock().await; + format!("reload of {} successful", tab_lock.state_string()) + }, + Err(e) => { + let tab_lock = tab.lock().await; + format!("reload of {} failed: {}", tab_lock.state_string(), e.to_string()) + }, + }; + tool_log.push(log); + }, + Command::ClickAt(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = match click_point(tab.clone(), &args.point).await { + Ok(_) => { + let tab_lock = tab.lock().await; + format!("clicked `{} {}` at {}", args.point.x, args.point.y, tab_lock.state_string()) + }, + Err(e) => { + let tab_lock = tab.lock().await; + format!("clicked `{} {}` failed at {}: {}", args.point.x, args.point.y, tab_lock.state_string(), e.to_string()) + }, + }; + tool_log.push(log); + }, + Command::TypeTextAt(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = match tab.lock().await.headless_tab.type_str(args.text.as_str()) { + Ok(_) => { + let tab_lock = tab.lock().await; + format!("type `{}` at {}", args.text, tab_lock.state_string()) + }, + Err(e) => { + let tab_lock = tab.lock().await; + format!("type text failed at {}: {}", tab_lock.state_string(), e.to_string()) + }, + }; + tool_log.push(log); + }, + Command::PressKeyAt(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = match tab.lock().await.headless_tab.press_key(args.key.to_string().as_str()) { + Ok(_) => { + let tab_lock = tab.lock().await; + format!("press `{}` at {}", args.key, tab_lock.state_string()) + }, + Err(e) => { + let tab_lock = tab.lock().await; + format!("press `{}` failed at {}: {}", args.key, tab_lock.state_string(), e.to_string()) + }, + }; + tool_log.push(log); } - - Ok((tool_log, multimodal_els)) } + + Ok((tool_log, multimodal_els)) } #[derive(Debug)] From e9311efaa2f01471bf185c08deba8bd99e5edaf1 Mon Sep 17 00:00:00 2001 From: Oleg Klimov Date: Tue, 12 Nov 2024 09:38:22 +0100 Subject: [PATCH 07/18] multithreaded fix 2 --- src/integrations/integr_chrome.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 87cb75c60..91bdf3b51 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -480,7 +480,8 @@ async fn chrome_command_exec( let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; session_get_tab_arc(chrome_session, &args.tab_id).await? }; - let log = match inner_html(tab.lock().await.headless_tab.get_url()).await { + let url = tab.lock().await.headless_tab.get_url(); + let log = match inner_html(url).await { Ok(html) => { let tab_lock = tab.lock().await; format!("innerHtml of {}:\n\n{}", tab_lock.state_string(), html) From e165368949eff426990206cf5e19ed6d95ed2889 Mon Sep 17 00:00:00 2001 From: Oleg Klimov Date: Tue, 12 Nov 2024 09:38:29 +0100 Subject: [PATCH 08/18] warning --- src/tools/tool_patch_aux/tickets_parsing.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tools/tool_patch_aux/tickets_parsing.rs b/src/tools/tool_patch_aux/tickets_parsing.rs index ac3271901..bdf596fcf 100644 --- a/src/tools/tool_patch_aux/tickets_parsing.rs +++ b/src/tools/tool_patch_aux/tickets_parsing.rs @@ -71,7 +71,7 @@ pub struct TicketToApply { } pub fn good_error_text(reason: &str, tickets: &Vec, resolution: Option) -> (String, Option) { - let mut text = format!("Couldn't create patch for tickets: '{}'.\nReason: {reason}", tickets.join(", ")); + let text = format!("Couldn't create patch for tickets: '{}'.\nReason: {reason}", tickets.join(", ")); if let Some(resolution) = resolution { let cd_format = format!("💿 {resolution}"); return (text, Some(cd_format)) From 131e29e88ee35eed33f8381af5073cf61d8c4e13 Mon Sep 17 00:00:00 2001 From: mitya Date: Fri, 15 Nov 2024 21:46:18 +0300 Subject: [PATCH 09/18] fix deadlocks --- src/integrations/integr_chrome.rs | 220 +++++++++++++++--------------- 1 file changed, 107 insertions(+), 113 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 91bdf3b51..d4bb14b0d 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -281,33 +281,21 @@ async fn setup_chrome_session( Ok(setup_log) } -async fn navigate_to(tab: Arc>, url: &String) -> Result<(), String> { - let tab_instance = { - let tab_lock = tab.lock().await; - tab_lock.headless_tab.clone() - }; - tab_instance.navigate_to(url.as_str()).map_err(|e| e.to_string())?; - tab_instance.wait_until_navigated().map_err(|e| e.to_string())?; - Ok(()) -} - async fn screenshot_jpeg_base64( tab: Arc>, capture_beyond_viewport: bool, ) -> Result { - let chrome_tab = { + let jpeg_base64_data = { let tab_lock = tab.lock().await; - tab_lock.headless_tab.clone() + tab_lock.headless_tab.call_method(Page::CaptureScreenshot { + format: Some(Page::CaptureScreenshotFormatOption::Jpeg), + clip: None, + quality: Some(75), + from_surface: Some(true), + capture_beyond_viewport: Some(capture_beyond_viewport), + }).map_err(|e| e.to_string())?.data }; - let jpeg_base64_data = chrome_tab.call_method(Page::CaptureScreenshot { - format: Some(Page::CaptureScreenshotFormatOption::Jpeg), - clip: None, - quality: Some(75), - from_surface: Some(true), - capture_beyond_viewport: Some(capture_beyond_viewport), - }).map_err(|e| e.to_string())?.data; - let mut data = base64::prelude::BASE64_STANDARD .decode(jpeg_base64_data).map_err(|e| e.to_string())?; let reader = ImageReader::with_format(Cursor::new(data), ImageFormat::Jpeg); @@ -319,6 +307,7 @@ async fn screenshot_jpeg_base64( // NOTE: the tool operates on resized image well without a special model notification let (nwidth, nheight) = (scale_factor * image.width() as f32, scale_factor * image.height() as f32); image = image.resize(nwidth as u32, nheight as u32, FilterType::Lanczos3); + // NOTE: we should store screenshot_scale_factor for every resized screenshot, not for a tab! let mut tab_lock = tab.lock().await; tab_lock.screenshot_scale_factor = scale_factor as f64; } @@ -329,34 +318,6 @@ async fn screenshot_jpeg_base64( MultimodalElement::new("image/jpeg".to_string(), base64::prelude::BASE64_STANDARD.encode(data)) } -async fn inner_html(url: String) -> Result { - let client = Client::builder() - .build() - .map_err(|e| e.to_string())?; - let response = client.get(url.clone()).send().await.map_err(|e| e.to_string())?; - if response.status().is_success() { - let html = response.text().await.map_err(|e| e.to_string())?; - Ok(html) - } else { - Err(format!("status: {}", response.status())) - } -} - -async fn click_point(tab: Arc>, point: &Point) -> Result<(), String> { - let (mapped_point, headless_tab) = { - let tab_lock = tab.lock().await; - let mapped_point = Point { - x: point.x / tab_lock.screenshot_scale_factor, - y: point.y / tab_lock.screenshot_scale_factor, - }; - let headless_tab = tab_lock.headless_tab.clone(); - (mapped_point, headless_tab) - }; - headless_tab.click_point(mapped_point).map_err(|e| e.to_string())?; - headless_tab.wait_until_navigated().map_err(|e| e.to_string())?; - Ok(()) -} - async fn session_open_tab( chrome_session: &mut ChromeSession, tab_id: &String, @@ -443,15 +404,20 @@ async fn chrome_command_exec( let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; session_get_tab_arc(chrome_session, &args.tab_id).await? }; - let log = match navigate_to(tab.clone(), &args.uri).await { - Ok(_) => { - let tab_lock = tab.lock().await; - format!("navigate_to successful: {}", tab_lock.state_string()) - }, - Err(e) => { - // let tab_lock = tab.lock().await; - format!("navigate_to `{}` failed: {}. If you're trying to open a local file, add a file:// prefix.", args.uri, e.to_string()) - }, + let log = { + let tab_lock = tab.lock().await; + match { + tab_lock.headless_tab.navigate_to(args.uri.as_str()).map_err(|e| e.to_string())?; + tab_lock.headless_tab.wait_until_navigated().map_err(|e| e.to_string())?; + Ok::<(), String>(()) + } { + Ok(_) => { + format!("navigate_to successful: {}", tab_lock.state_string()) + }, + Err(e) => { + format!("navigate_to `{}` failed: {}. If you're trying to open a local file, add a file:// prefix.", args.uri, e.to_string()) + }, + } }; tool_log.push(log); }, @@ -461,16 +427,19 @@ async fn chrome_command_exec( let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; session_get_tab_arc(chrome_session, &args.tab_id).await? }; - let log = match screenshot_jpeg_base64(tab.clone(), false).await { - Ok(multimodal_el) => { - multimodal_els.push(multimodal_el); - let tab_lock = tab.lock().await; - format!("made a screenshot of {}", tab_lock.state_string()) - }, - Err(e) => { - let tab_lock = tab.lock().await; - format!("screenshot failed for {}: {}", tab_lock.state_string(), e.to_string()) - }, + let log = { + // NOTE: this operation is not atomic, unfortunately + match screenshot_jpeg_base64(tab.clone(), false).await { + Ok(multimodal_el) => { + multimodal_els.push(multimodal_el); + let tab_lock = tab.lock().await; + format!("made a screenshot of {}", tab_lock.state_string()) + }, + Err(e) => { + let tab_lock = tab.lock().await; + format!("screenshot failed for {}: {}", tab_lock.state_string(), e.to_string()) + }, + } }; tool_log.push(log); }, @@ -480,16 +449,28 @@ async fn chrome_command_exec( let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; session_get_tab_arc(chrome_session, &args.tab_id).await? }; - let url = tab.lock().await.headless_tab.get_url(); - let log = match inner_html(url).await { - Ok(html) => { - let tab_lock = tab.lock().await; - format!("innerHtml of {}:\n\n{}", tab_lock.state_string(), html) - }, - Err(e) => { - let tab_lock = tab.lock().await; - format!("can't fetch innerHtml of {}: {}", tab_lock.state_string(), e.to_string()) - }, + let log = { + let tab_lock = tab.lock().await; + let url = tab_lock.headless_tab.get_url(); + match { + let client = Client::builder() + .build() + .map_err(|e| e.to_string())?; + let response = client.get(url.clone()).send().await.map_err(|e| e.to_string())?; + if response.status().is_success() { + let html = response.text().await.map_err(|e| e.to_string())?; + Ok(html) + } else { + Err(format!("status: {}", response.status())) + } + } { + Ok(html) => { + format!("innerHtml of {}:\n\n{}", tab_lock.state_string(), html) + }, + Err(e) => { + format!("can't fetch innerHtml of {}: {}", tab_lock.state_string(), e.to_string()) + }, + } }; tool_log.push(log); }, @@ -499,15 +480,17 @@ async fn chrome_command_exec( let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; session_get_tab_arc(chrome_session, &args.tab_id).await? }; - let log = match tab.lock().await.headless_tab.reload(false, None) { - Ok(_) => { - let tab_lock = tab.lock().await; - format!("reload of {} successful", tab_lock.state_string()) - }, - Err(e) => { - let tab_lock = tab.lock().await; - format!("reload of {} failed: {}", tab_lock.state_string(), e.to_string()) - }, + let log = { + let tab_lock = tab.lock().await; + let chrome_tab = tab_lock.headless_tab.clone(); + match chrome_tab.reload(false, None) { + Ok(_) => { + format!("reload of {} successful", tab_lock.state_string()) + }, + Err(e) => { + format!("reload of {} failed: {}", tab_lock.state_string(), e.to_string()) + }, + } }; tool_log.push(log); }, @@ -517,15 +500,24 @@ async fn chrome_command_exec( let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; session_get_tab_arc(chrome_session, &args.tab_id).await? }; - let log = match click_point(tab.clone(), &args.point).await { - Ok(_) => { - let tab_lock = tab.lock().await; - format!("clicked `{} {}` at {}", args.point.x, args.point.y, tab_lock.state_string()) - }, - Err(e) => { - let tab_lock = tab.lock().await; - format!("clicked `{} {}` failed at {}: {}", args.point.x, args.point.y, tab_lock.state_string(), e.to_string()) - }, + let log = { + let tab_lock = tab.lock().await; + match { + let mapped_point = Point { + x: args.point.x / tab_lock.screenshot_scale_factor, + y: args.point.y / tab_lock.screenshot_scale_factor, + }; + tab_lock.headless_tab.click_point(mapped_point).map_err(|e| e.to_string())?; + tab_lock.headless_tab.wait_until_navigated().map_err(|e| e.to_string())?; + Ok::<(), String>(()) + } { + Ok(_) => { + format!("clicked `{} {}` at {}", args.point.x, args.point.y, tab_lock.state_string()) + }, + Err(e) => { + format!("clicked `{} {}` failed at {}: {}", args.point.x, args.point.y, tab_lock.state_string(), e.to_string()) + }, + } }; tool_log.push(log); }, @@ -535,15 +527,16 @@ async fn chrome_command_exec( let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; session_get_tab_arc(chrome_session, &args.tab_id).await? }; - let log = match tab.lock().await.headless_tab.type_str(args.text.as_str()) { - Ok(_) => { - let tab_lock = tab.lock().await; - format!("type `{}` at {}", args.text, tab_lock.state_string()) - }, - Err(e) => { - let tab_lock = tab.lock().await; - format!("type text failed at {}: {}", tab_lock.state_string(), e.to_string()) - }, + let log = { + let tab_lock = tab.lock().await; + match tab_lock.headless_tab.type_str(args.text.as_str()) { + Ok(_) => { + format!("type `{}` at {}", args.text, tab_lock.state_string()) + }, + Err(e) => { + format!("type text failed at {}: {}", tab_lock.state_string(), e.to_string()) + }, + } }; tool_log.push(log); }, @@ -553,15 +546,16 @@ async fn chrome_command_exec( let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; session_get_tab_arc(chrome_session, &args.tab_id).await? }; - let log = match tab.lock().await.headless_tab.press_key(args.key.to_string().as_str()) { - Ok(_) => { - let tab_lock = tab.lock().await; - format!("press `{}` at {}", args.key, tab_lock.state_string()) - }, - Err(e) => { - let tab_lock = tab.lock().await; - format!("press `{}` failed at {}: {}", args.key, tab_lock.state_string(), e.to_string()) - }, + let log = { + let tab_lock = tab.lock().await; + match tab_lock.headless_tab.press_key(args.key.to_string().as_str()) { + Ok(_) => { + format!("press `{}` at {}", args.key, tab_lock.state_string()) + }, + Err(e) => { + format!("press `{}` failed at {}: {}", args.key, tab_lock.state_string(), e.to_string()) + }, + } }; tool_log.push(log); } From db152956921e3216918fb91f08dd2fc04e5c9944 Mon Sep 17 00:00:00 2001 From: mitya Date: Fri, 15 Nov 2024 21:55:03 +0300 Subject: [PATCH 10/18] disable html command for now --- src/integrations/integr_chrome.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index d4bb14b0d..afad179f8 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -192,7 +192,7 @@ impl Tool for ToolChrome { "open_tab ", "navigate_to ", "screenshot ", - "html ", + // "html ", "reload ", ]; if self.supports_clicks { @@ -444,6 +444,7 @@ async fn chrome_command_exec( tool_log.push(log); }, Command::Html(args) => { + // NOTE: removed from commands list, please rewrite me... let tab = { let mut chrome_session_locked = chrome_session.lock().await; let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; From 626cfed32b067f13c32bd3abe06c25a5aada5a9c Mon Sep 17 00:00:00 2001 From: mitya Date: Fri, 15 Nov 2024 21:56:27 +0300 Subject: [PATCH 11/18] default window size 800x600 --- src/integrations/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/integrations/mod.rs b/src/integrations/mod.rs index d004f0247..8ac7254a2 100644 --- a/src/integrations/mod.rs +++ b/src/integrations/mod.rs @@ -54,7 +54,7 @@ chrome: # Or you can give it ws:// path, read more here https://developer.chrome.com/docs/devtools/remote-debugging/local-server/ # In that case start chrome with --remote-debugging-port chrome_path: "ws://127.0.0.1:6006/" - window_size: [1024, 768] + window_size: [800, 600] idle_browser_timeout: 600 From c6be37348c5be5a18f061d85086899587f005b00 Mon Sep 17 00:00:00 2001 From: mitya Date: Tue, 19 Nov 2024 20:37:16 +0100 Subject: [PATCH 12/18] collect all log entries and return using new tab_log command --- src/integrations/integr_chrome.rs | 51 +++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index afad179f8..c98a9fd95 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -20,8 +20,12 @@ use headless_chrome::{Browser, LaunchOptions, Tab as HeadlessTab}; use headless_chrome::browser::tab::point::Point; use headless_chrome::protocol::cdp::Page; use headless_chrome::protocol::cdp::Emulation; +use headless_chrome::protocol::cdp::types::Event; use serde::{Deserialize, Serialize}; +use std::sync::Mutex; use std::fmt; +use tokio::time::sleep; +use chrono::DateTime; use base64::Engine; use std::io::Cursor; @@ -65,6 +69,8 @@ pub struct ChromeTab { device: DeviceType, tab_id: String, screenshot_scale_factor: f64, + // NOTE: logs vector should be at least limited + tab_log: Arc>>, } impl ChromeTab { @@ -74,6 +80,7 @@ impl ChromeTab { device: device.clone(), tab_id: tab_id.clone(), screenshot_scale_factor: 1.0, + tab_log: Arc::new(Mutex::new(Vec::new())), } } pub fn state_string(&self) -> String { @@ -194,6 +201,7 @@ impl Tool for ToolChrome { "screenshot ", // "html ", "reload ", + "tab_log ", ]; if self.supports_clicks { supported_commands.extend(vec![ @@ -354,6 +362,17 @@ async fn session_open_tab( } let tab = Arc::new(AMutex::new(ChromeTab::new(headless_tab, device, tab_id))); let tab_lock = tab.lock().await; + let tab_log = Arc::clone(&tab_lock.tab_log); + tab_lock.headless_tab.enable_log().map_err(|e| e.to_string())?; + tab_lock.headless_tab.add_event_listener(Arc::new(move |event: &Event| { + if let Event::LogEntryAdded(e) = event { + let formatted_ts = { + let dt = DateTime::from_timestamp(e.params.entry.timestamp as i64, 0).unwrap(); + dt.format("%Y-%m-%d %H:%M:%S").to_string() + }; + tab_log.lock().unwrap().push(format!("{} [{:?}]: {}", formatted_ts, e.params.entry.level, e.params.entry.text)); + } + })).map_err(|e| e.to_string())?; chrome_session.tabs.insert(tab_id.clone(), tab.clone()); Ok(format!("opened a new tab: {}\n", tab_lock.state_string())) } @@ -380,6 +399,7 @@ enum Command { ClickAt(ClickAtArgs), TypeTextAt(TypeTextAtArgs), PressKeyAt(PressKeyAtArgs), + TabLog(TabLogArgs), } async fn chrome_command_exec( @@ -559,6 +579,19 @@ async fn chrome_command_exec( } }; tool_log.push(log); + }, + Command::TabLog(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let tab_lock = tab.lock().await; + // NOTE: we're waiting for log to be collected for 3 seconds + sleep(Duration::from_secs(3)).await; + let mut tab_log_lock = tab_lock.tab_log.lock().unwrap(); + tool_log.extend(tab_log_lock.clone()); + tab_log_lock.clear(); } } @@ -629,6 +662,12 @@ struct PressKeyAtArgs { tab_id: String, } +#[derive(Debug)] +struct TabLogArgs { + // wait_secs: u32, + tab_id: String, +} + fn parse_single_command(command: &String) -> Result { let args = shell_words::split(&command).map_err(|e| e.to_string())?; if args.is_empty() { @@ -734,6 +773,18 @@ fn parse_single_command(command: &String) -> Result { } } }, + "tab_log" => { + match parsed_args.as_slice() { + [tab_id] => { + Ok(Command::TabLog(TabLogArgs { + tab_id: tab_id.clone(), + })) + }, + _ => { + Err("Missing one or several arguments 'tab_id'".to_string()) + } + } + }, _ => Err(format!("Unknown command: {:?}.", command_name)), } } From adef358cd52eb3abc2ba509a3b4c5e535acc0fac Mon Sep 17 00:00:00 2001 From: mitya Date: Wed, 20 Nov 2024 11:21:48 +0100 Subject: [PATCH 13/18] home, end keys + 1 second wait after press key --- src/integrations/integr_chrome.rs | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index c98a9fd95..9d206d079 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -201,13 +201,13 @@ impl Tool for ToolChrome { "screenshot ", // "html ", "reload ", + "press_key_at ", + "type_text_at ", "tab_log ", ]; if self.supports_clicks { supported_commands.extend(vec![ "click_at ", - "type_text_at ", - "press_key_at ", ]); } let description = format!( @@ -569,7 +569,13 @@ async fn chrome_command_exec( }; let log = { let tab_lock = tab.lock().await; - match tab_lock.headless_tab.press_key(args.key.to_string().as_str()) { + match { + tab_lock.headless_tab.press_key(args.key.to_string().as_str()).map_err(|e| e.to_string())?; + tab_lock.headless_tab.wait_until_navigated().map_err(|e| e.to_string())?; + // TODO: sometimes page isn't ready for next step + sleep(Duration::from_secs(1)).await; + Ok::<(), String>(()) + } { Ok(_) => { format!("press `{}` at {}", args.key, tab_lock.state_string()) }, @@ -643,6 +649,8 @@ enum Key { ESC, PAGEUP, PAGEDOWN, + HOME, + END, } impl fmt::Display for Key { @@ -652,6 +660,8 @@ impl fmt::Display for Key { Key::ESC => write!(f, "Escape"), Key::PAGEUP => write!(f, "PageUp"), Key::PAGEDOWN => write!(f, "PageDown"), + Key::HOME => write!(f, "Home"), + Key::END => write!(f, "End"), } } } @@ -761,6 +771,8 @@ fn parse_single_command(command: &String) -> Result { "esc" => Key::ESC, "pageup" => Key::PAGEUP, "pagedown" => Key::PAGEDOWN, + "home" => Key::HOME, + "end" => Key::END, _ => return Err(format!("Unknown key: {}", key_str)), }; Ok(Command::PressKeyAt(PressKeyAtArgs { From ec58ec72d928023d14506d1d12699d843055a47c Mon Sep 17 00:00:00 2001 From: mitya Date: Fri, 29 Nov 2024 22:27:29 +0100 Subject: [PATCH 14/18] default output filter for tab log --- src/integrations/integr_chrome.rs | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 9d206d079..1c43a5b45 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -12,6 +12,7 @@ use crate::integrations::sessions::{IntegrationSession, get_session_hashmap_key} use crate::global_context::GlobalContext; use crate::call_validation::{ChatContent, ChatMessage}; use crate::scratchpads::multimodality::MultimodalElement; +use crate::postprocessing::pp_command_output::{CmdlineOutputFilter, output_mini_postprocessing}; use crate::tools::tools_description::{Tool, ToolDesc, ToolParam}; use reqwest::Client; @@ -63,13 +64,14 @@ impl fmt::Display for DeviceType { } } +const MAX_CACHED_LOG_LINES: usize = 1000; + #[derive(Clone)] pub struct ChromeTab { headless_tab: Arc, device: DeviceType, tab_id: String, screenshot_scale_factor: f64, - // NOTE: logs vector should be at least limited tab_log: Arc>>, } @@ -370,7 +372,11 @@ async fn session_open_tab( let dt = DateTime::from_timestamp(e.params.entry.timestamp as i64, 0).unwrap(); dt.format("%Y-%m-%d %H:%M:%S").to_string() }; - tab_log.lock().unwrap().push(format!("{} [{:?}]: {}", formatted_ts, e.params.entry.level, e.params.entry.text)); + let mut tab_log_lock = tab_log.lock().unwrap(); + tab_log_lock.push(format!("{} [{:?}]: {}", formatted_ts, e.params.entry.level, e.params.entry.text)); + if tab_log_lock.len() > MAX_CACHED_LOG_LINES { + tab_log_lock.remove(0); + } } })).map_err(|e| e.to_string())?; chrome_session.tabs.insert(tab_id.clone(), tab.clone()); @@ -592,12 +598,18 @@ async fn chrome_command_exec( let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; session_get_tab_arc(chrome_session, &args.tab_id).await? }; - let tab_lock = tab.lock().await; - // NOTE: we're waiting for log to be collected for 3 seconds - sleep(Duration::from_secs(3)).await; - let mut tab_log_lock = tab_lock.tab_log.lock().unwrap(); - tool_log.extend(tab_log_lock.clone()); - tab_log_lock.clear(); + let tab_log = { + let tab_lock = tab.lock().await; + // NOTE: we're waiting for log to be collected for 3 seconds + sleep(Duration::from_secs(3)).await; + let mut tab_log_lock = tab_lock.tab_log.lock().unwrap(); + let tab_log = tab_log_lock.join("\n"); + tab_log_lock.clear(); + tab_log + }; + let filter = CmdlineOutputFilter::default(); + let filtered_log = output_mini_postprocessing(&filter, tab_log.as_str()); + tool_log.push(filtered_log.clone()); } } From 1409bbcca800cb871b9672ae621c477b9a9c309d Mon Sep 17 00:00:00 2001 From: mitya Date: Sat, 30 Nov 2024 00:51:50 +0100 Subject: [PATCH 15/18] WIP: highlighted screenshot for interactive navigation --- src/integrations/integr_chrome.rs | 194 ++++++++++++++++++++++++++---- 1 file changed, 169 insertions(+), 25 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 1c43a5b45..784b5c648 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -197,21 +197,17 @@ impl Tool for ToolChrome { } fn tool_description(&self) -> ToolDesc { - let mut supported_commands = vec![ + let supported_commands = vec![ "open_tab ", "navigate_to ", - "screenshot ", + "screenshot ", // "html ", "reload ", "press_key_at ", "type_text_at ", "tab_log ", + "click_at ", ]; - if self.supports_clicks { - supported_commands.extend(vec![ - "click_at ", - ]); - } let description = format!( "One or several commands separated by newline. \ The is an integer, for example 10, for you to identify the tab later. \ @@ -293,17 +289,34 @@ async fn setup_chrome_session( async fn screenshot_jpeg_base64( tab: Arc>, - capture_beyond_viewport: bool, -) -> Result { + highlight: bool, +) -> Result<(Vec, MultimodalElement), String> { + let mut interactive_element_map = vec![]; let jpeg_base64_data = { let tab_lock = tab.lock().await; - tab_lock.headless_tab.call_method(Page::CaptureScreenshot { - format: Some(Page::CaptureScreenshotFormatOption::Jpeg), - clip: None, - quality: Some(75), - from_surface: Some(true), - capture_beyond_viewport: Some(capture_beyond_viewport), - }).map_err(|e| e.to_string())?.data + match { + if highlight { + interactive_element_map = highlight_elements(&tab_lock.headless_tab) + .await.map_err(|e| e.to_string())?; + } + let data = tab_lock.headless_tab.call_method(Page::CaptureScreenshot { + format: Some(Page::CaptureScreenshotFormatOption::Jpeg), + clip: None, + quality: Some(75), + from_surface: Some(true), + capture_beyond_viewport: Some(false), + }).map_err(|e| e.to_string())?.data; + Ok::(data) + } { + Ok(data) => { + remove_highlight(&tab_lock.headless_tab).await.map_err(|e| e.to_string())?; + data + }, + Err(e) => { + remove_highlight(&tab_lock.headless_tab).await.map_err(|e| e.to_string())?; + return Err(e) + } + } }; let mut data = base64::prelude::BASE64_STANDARD @@ -317,15 +330,35 @@ async fn screenshot_jpeg_base64( // NOTE: the tool operates on resized image well without a special model notification let (nwidth, nheight) = (scale_factor * image.width() as f32, scale_factor * image.height() as f32); image = image.resize(nwidth as u32, nheight as u32, FilterType::Lanczos3); + let mut interactive_element_map_scaled = vec![]; + for (label, x, y) in interactive_element_map { + let (scaled_x, scaled_y) = ((x as f32 * scale_factor) as i32, (y as f32 * scale_factor) as i32); + interactive_element_map_scaled.push((label, scaled_x, scaled_y)); + } + interactive_element_map = interactive_element_map_scaled; // NOTE: we should store screenshot_scale_factor for every resized screenshot, not for a tab! let mut tab_lock = tab.lock().await; tab_lock.screenshot_scale_factor = scale_factor as f64; } + let mut tool_log = vec![]; + if highlight && interactive_element_map.len() > 0 { + tool_log.push("Clickable elements are highlighted with red rectangles and numbered labels at the top left.".to_string()); + tool_log.push("The interactive elements map to the rendered page as : , .".to_string()); + for (label, x, y) in interactive_element_map { + tool_log.push(format!("{}: {}, {}", label, x, y)); + } + } + data = Vec::new(); image.write_to(&mut Cursor::new(&mut data), ImageFormat::Jpeg).map_err(|e| e.to_string())?; - MultimodalElement::new("image/jpeg".to_string(), base64::prelude::BASE64_STANDARD.encode(data)) + let multimodal_el = MultimodalElement::new( + "image/jpeg".to_string(), + base64::prelude::BASE64_STANDARD.encode(data) + ).map_err(|e| e.to_string())?; + + Ok((tool_log, multimodal_el)) } async fn session_open_tab( @@ -453,13 +486,18 @@ async fn chrome_command_exec( let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; session_get_tab_arc(chrome_session, &args.tab_id).await? }; + let highlight = match args.mode { + ScreenshotMode::HIGHLIGHT => true, + _ => false, + }; let log = { // NOTE: this operation is not atomic, unfortunately - match screenshot_jpeg_base64(tab.clone(), false).await { - Ok(multimodal_el) => { + match screenshot_jpeg_base64(tab.clone(), highlight).await { + Ok((log, multimodal_el)) => { multimodal_els.push(multimodal_el); let tab_lock = tab.lock().await; - format!("made a screenshot of {}", tab_lock.state_string()) + let log_str = log.join("\n"); + vec![log_str, format!("made a screenshot of {}", tab_lock.state_string())].join("\n\n") }, Err(e) => { let tab_lock = tab.lock().await; @@ -628,8 +666,15 @@ struct NavigateToArgs { tab_id: String, } +#[derive(Clone, Debug)] +enum ScreenshotMode { + PLAIN, + HIGHLIGHT, +} + #[derive(Debug)] struct ScreenshotArgs { + mode: ScreenshotMode, tab_id: String, } @@ -723,12 +768,22 @@ fn parse_single_command(command: &String) -> Result { })) }, "screenshot" => { - if parsed_args.len() < 1 { - return Err(format!("`screenshot` requires 1 argument: `tab_id`. Provided: {:?}", parsed_args)); + match parsed_args.as_slice() { + [mode_str, tab_id] => { + let mode = match mode_str.to_lowercase().as_str() { + "plain" => ScreenshotMode::PLAIN, + "highlight_before_click" => ScreenshotMode::HIGHLIGHT, + _ => return Err(format!("Unknown screenshot mode: {}.", mode_str)), + }; + Ok(Command::Screenshot(ScreenshotArgs { + mode: mode.clone(), + tab_id: tab_id.clone(), + })) + }, + _ => { + Err("Missing one or several arguments 'mode', 'tab_id'".to_string()) + } } - Ok(Command::Screenshot(ScreenshotArgs { - tab_id: parsed_args[0].clone(), - })) }, "html" => { if parsed_args.len() < 1 { @@ -812,3 +867,92 @@ fn parse_single_command(command: &String) -> Result { _ => Err(format!("Unknown command: {:?}.", command_name)), } } + +async fn highlight_elements(tab: &Arc) -> Result, String> { + let func = " + (function () { + const clickableElements = document.querySelectorAll('a, button, [onclick], [role=\"button\"]'); + let results = []; + clickableElements.forEach(element => { + if (element) { + const rect = element.getBoundingClientRect(); + if (rect.width * rect.height > 0) { + element.style.outline = '2px solid red'; + element.setAttribute('browser-user-highlight-id', 'screenshot-highlight'); + const label_text = (results.length + 1).toString(); + + const label = document.createElement('div'); + label.className = 'screenshot-highlight-label'; + label.style.position = 'fixed'; + label.style.background = 'red'; + label.style.color = 'white'; + label.style.padding = '2px 6px'; + label.style.borderRadius = '10px'; + label.style.fontSize = '12px'; + label.style.zIndex = '9999999'; + label.textContent = label_text; + label.style.top = (rect.top - 20) + 'px'; + label.style.left = rect.left + 'px'; + document.body.appendChild(label); + + midpoint_x = rect.left + rect.width / 2; + midpoint_y = rect.top + rect.height / 2; + midpoint_text = `${label_text}: ${parseInt(midpoint_x)}, ${parseInt(midpoint_y)}`; + results.push(midpoint_text); + } + } + }); + return results; + })();"; + + let result = tab.evaluate(func, false).map_err(|e| e.to_string())?; + if let Some(preview) = result.preview { + let mut interactive_element_map = vec![]; + for pp in preview.properties { + if let Some(value) = pp.value.clone() { + let parts: Vec = value.to_string().split(':').map(|x| x.to_string()).collect(); + if parts.len() != 2 { + continue; + } + let label = parts[0].trim().to_string(); + let coords: Vec<&str> = parts[1].trim().split(',').collect(); + if coords.len() != 2 { + continue; + } + let (x, y) = match { + let x = coords[0].trim().parse::().map_err(|e| e.to_string())?; + let y = coords[1].trim().parse::().map_err(|e| e.to_string())?; + Ok::<(i32, i32), String>((x, y)) + } { + Ok((x, y)) => (x, y), + Err(_) => continue, + }; + interactive_element_map.push((label, x, y)); + } + } + return Ok(interactive_element_map); + } + if let Some(e) = result.description { + return Err(e); + } + Err("Unexpected error while highlighting clickable elements".to_string()) +} + +async fn remove_highlight(tab: &Arc) -> Result<(), String> { + let func = " + (function () { + const highlightedElements = document.querySelectorAll('[browser-user-highlight-id=\"screenshot-highlight\"]'); + highlightedElements.forEach(element => { + element.style.outline = ''; + element.removeAttribute('browser-user-highlight-id'); + }); + const labels = document.querySelectorAll('.screenshot-highlight-label'); + labels.forEach(label => label.remove()); + })();"; + + let result = tab.evaluate(func, false).map_err(|e| e.to_string())?; + if let Some(e) = result.description { + return Err(e); + } + Ok(()) +} From 34be9db08b46e2f264f37567dffa1164b030204f Mon Sep 17 00:00:00 2001 From: mitya Date: Sat, 30 Nov 2024 14:10:51 +0100 Subject: [PATCH 16/18] notes about screenshot modes --- src/integrations/integr_chrome.rs | 24 +++++++++++++----------- src/tools/tools_description.rs | 4 ++-- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 784b5c648..ed246ca96 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -46,7 +46,6 @@ fn default_headless() -> bool { true } pub struct ToolChrome { integration_chrome: IntegrationChrome, - supports_clicks: bool, } #[derive(Clone, Debug)] @@ -117,15 +116,12 @@ impl IntegrationSession for ChromeSession } impl ToolChrome { - pub fn new_from_yaml(v: &serde_yaml::Value, supports_clicks: bool,) -> Result { + pub fn new_from_yaml(v: &serde_yaml::Value) -> Result { let integration_chrome = serde_yaml::from_value::(v.clone()).map_err(|e| { let location = e.location().map(|loc| format!(" at line {}, column {}", loc.line(), loc.column())).unwrap_or_default(); format!("{}{}", e.to_string(), location) })?; - Ok(Self { - integration_chrome, - supports_clicks, - }) + Ok(Self { integration_chrome }) } } @@ -197,10 +193,16 @@ impl Tool for ToolChrome { } fn tool_description(&self) -> ToolDesc { + let tool_description = vec![ + "A real web browser with graphical interface.", + "Notes about screenshot modes:", + "- plain mode is for visual validation and exploration;", + "- highlight mode gets clickable elements map to use it for click command.", + ].join("\n"); let supported_commands = vec![ "open_tab ", "navigate_to ", - "screenshot ", + "screenshot ", // "html ", "reload ", "press_key_at ", @@ -208,7 +210,7 @@ impl Tool for ToolChrome { "tab_log ", "click_at ", ]; - let description = format!( + let commands_description = format!( "One or several commands separated by newline. \ The is an integer, for example 10, for you to identify the tab later. \ Supported commands:\n{}", supported_commands.join("\n")); @@ -216,11 +218,11 @@ impl Tool for ToolChrome { name: "chrome".to_string(), agentic: true, experimental: true, - description: "A real web browser with graphical interface.".to_string(), + description: tool_description, parameters: vec![ToolParam { name: "commands".to_string(), param_type: "string".to_string(), - description, + description: commands_description, }], parameters_required: vec!["commands".to_string()], } @@ -772,7 +774,7 @@ fn parse_single_command(command: &String) -> Result { [mode_str, tab_id] => { let mode = match mode_str.to_lowercase().as_str() { "plain" => ScreenshotMode::PLAIN, - "highlight_before_click" => ScreenshotMode::HIGHLIGHT, + "highlight" => ScreenshotMode::HIGHLIGHT, _ => return Err(format!("Unknown screenshot mode: {}.", mode_str)), }; Ok(Command::Screenshot(ScreenshotArgs { diff --git a/src/tools/tools_description.rs b/src/tools/tools_description.rs index 772aee0a7..15e802049 100644 --- a/src/tools/tools_description.rs +++ b/src/tools/tools_description.rs @@ -80,7 +80,7 @@ pub async fn read_integrations_yaml(gcx: Arc>) -> Result< pub async fn tools_merged_and_filtered( gcx: Arc>, - supports_clicks: bool, + _supports_clicks: bool, ) -> Result>>>, String> { let (ast_on, vecdb_on, allow_experimental) = { let gcx_locked = gcx.read().await; @@ -133,7 +133,7 @@ pub async fn tools_merged_and_filtered( tools_all.insert("pdb".to_string(), Arc::new(AMutex::new(Box::new(ToolPdb::new_from_yaml(pdb_config)?) as Box))); } if let Some(chrome_config) = integrations_value.get("chrome") { - tools_all.insert("chrome".to_string(), Arc::new(AMutex::new(Box::new(ToolChrome::new_from_yaml(chrome_config, supports_clicks)?) as Box))); + tools_all.insert("chrome".to_string(), Arc::new(AMutex::new(Box::new(ToolChrome::new_from_yaml(chrome_config)?) as Box))); } if let Some(postgres_config) = integrations_value.get("postgres") { tools_all.insert("postgres".to_string(), Arc::new(AMutex::new(Box::new(ToolPostgres::new_from_yaml(postgres_config)?) as Box))); From 9b99ebb65a4bd0402ea10ac9dd62afef3fb24ed1 Mon Sep 17 00:00:00 2001 From: mitya Date: Sat, 30 Nov 2024 16:52:12 +0100 Subject: [PATCH 17/18] some improvements for highlight --- src/integrations/integr_chrome.rs | 40 ++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index ed246ca96..7fa42629d 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -343,12 +343,19 @@ async fn screenshot_jpeg_base64( tab_lock.screenshot_scale_factor = scale_factor as f64; } + let mut interactive_element_map_visible = vec![]; + for (label, x, y) in interactive_element_map { + if x < image.width() as i32 && y < image.height() as i32 { + interactive_element_map_visible.push((label, x, y)); + } + } + let mut tool_log = vec![]; - if highlight && interactive_element_map.len() > 0 { + if highlight && interactive_element_map_visible.len() > 0 { tool_log.push("Clickable elements are highlighted with red rectangles and numbered labels at the top left.".to_string()); - tool_log.push("The interactive elements map to the rendered page as : , .".to_string()); - for (label, x, y) in interactive_element_map { - tool_log.push(format!("{}: {}, {}", label, x, y)); + tool_log.push("The interactive elements map to the rendered page:".to_string()); + for (label, x, y) in interactive_element_map_visible { + tool_log.push(format!("label `{}` center is ({}, {})", label, x, y)); } } @@ -871,31 +878,42 @@ fn parse_single_command(command: &String) -> Result { } async fn highlight_elements(tab: &Arc) -> Result, String> { + // NOTE: for now there is the problem with input, no label for it unfortunately let func = " (function () { - const clickableElements = document.querySelectorAll('a, button, [onclick], [role=\"button\"]'); + const clickableElements = document.querySelectorAll( + 'a, button, details, embed, input, menu, menuitem, object, select, textarea, summary, [onclick], [role=\"button\"]' + ); let results = []; clickableElements.forEach(element => { if (element) { const rect = element.getBoundingClientRect(); - if (rect.width * rect.height > 0) { + if (rect.left >= 0 && rect.top >= 0 && rect.width * rect.height > 0) { element.style.outline = '2px solid red'; element.setAttribute('browser-user-highlight-id', 'screenshot-highlight'); + const label_text = (results.length + 1).toString(); const label = document.createElement('div'); label.className = 'screenshot-highlight-label'; - label.style.position = 'fixed'; label.style.background = 'red'; label.style.color = 'white'; label.style.padding = '2px 6px'; label.style.borderRadius = '10px'; label.style.fontSize = '12px'; - label.style.zIndex = '9999999'; label.textContent = label_text; - label.style.top = (rect.top - 20) + 'px'; - label.style.left = rect.left + 'px'; - document.body.appendChild(label); + + label.style.position = 'absolute'; + label.style.zIndex = '999999'; + label.style.top = '0'; + label.style.left = '0'; + + // Set the parent element's position to relative if not already set + if (getComputedStyle(element).position === 'static') { + element.style.position = 'relative'; // Establish a positioning context + } + + element.appendChild(label); midpoint_x = rect.left + rect.width / 2; midpoint_y = rect.top + rect.height / 2; From e55df6b9a6b02193c214fbe02f33937747a1e314 Mon Sep 17 00:00:00 2001 From: mitya Date: Sat, 30 Nov 2024 16:58:42 +0100 Subject: [PATCH 18/18] move to png from jpeg for screenshot --- src/integrations/integr_chrome.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 7fa42629d..68cd21f6a 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -289,12 +289,12 @@ async fn setup_chrome_session( Ok(setup_log) } -async fn screenshot_jpeg_base64( +async fn capture_screenshot_base64( tab: Arc>, highlight: bool, ) -> Result<(Vec, MultimodalElement), String> { let mut interactive_element_map = vec![]; - let jpeg_base64_data = { + let base64_data = { let tab_lock = tab.lock().await; match { if highlight { @@ -302,9 +302,9 @@ async fn screenshot_jpeg_base64( .await.map_err(|e| e.to_string())?; } let data = tab_lock.headless_tab.call_method(Page::CaptureScreenshot { - format: Some(Page::CaptureScreenshotFormatOption::Jpeg), + format: Some(Page::CaptureScreenshotFormatOption::Png), clip: None, - quality: Some(75), + quality: None, from_surface: Some(true), capture_beyond_viewport: Some(false), }).map_err(|e| e.to_string())?.data; @@ -322,8 +322,8 @@ async fn screenshot_jpeg_base64( }; let mut data = base64::prelude::BASE64_STANDARD - .decode(jpeg_base64_data).map_err(|e| e.to_string())?; - let reader = ImageReader::with_format(Cursor::new(data), ImageFormat::Jpeg); + .decode(base64_data).map_err(|e| e.to_string())?; + let reader = ImageReader::with_format(Cursor::new(data), ImageFormat::Png); let mut image = reader.decode().map_err(|e| e.to_string())?; let max_dimension = 800.0; @@ -360,10 +360,10 @@ async fn screenshot_jpeg_base64( } data = Vec::new(); - image.write_to(&mut Cursor::new(&mut data), ImageFormat::Jpeg).map_err(|e| e.to_string())?; + image.write_to(&mut Cursor::new(&mut data), ImageFormat::Png).map_err(|e| e.to_string())?; let multimodal_el = MultimodalElement::new( - "image/jpeg".to_string(), + "image/png".to_string(), base64::prelude::BASE64_STANDARD.encode(data) ).map_err(|e| e.to_string())?; @@ -501,7 +501,7 @@ async fn chrome_command_exec( }; let log = { // NOTE: this operation is not atomic, unfortunately - match screenshot_jpeg_base64(tab.clone(), highlight).await { + match capture_screenshot_base64(tab.clone(), highlight).await { Ok((log, multimodal_el)) => { multimodal_els.push(multimodal_el); let tab_lock = tab.lock().await;