From 256257c7055808c548c4f1ce2bfccbc2d79f267d Mon Sep 17 00:00:00 2001 From: mitya Date: Fri, 8 Nov 2024 14:15:25 +0300 Subject: [PATCH 01/14] new open tab command and chrome tool refactor --- src/integrations/integr_chrome.rs | 320 +++++++++++++++++------------- 1 file changed, 186 insertions(+), 134 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 4f4075169..074bd61a5 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -21,7 +21,7 @@ use headless_chrome::browser::tab::point::Point; use headless_chrome::protocol::cdp::Page; use headless_chrome::protocol::cdp::Emulation; use serde::{Deserialize, Serialize}; - +use std::fmt; #[derive(Clone, Serialize, Deserialize, Debug)] pub struct IntegrationChrome { @@ -39,9 +39,37 @@ pub struct ToolChrome { supports_clicks: bool, } +#[derive(Clone, Debug)] +enum DeviceType { + DESKTOP, + MOBILE, +} + +impl fmt::Display for DeviceType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + DeviceType::DESKTOP => write!(f, "desktop"), + DeviceType::MOBILE => write!(f, "mobile"), + } + } +} + +#[derive(Clone)] +pub struct ChromeTab { + instance: Arc, + device: DeviceType, + tab_id: String, +} + +impl ChromeTab { + pub fn state_string(&self) -> String { + format!("tab_id `{}` device `{}` uri `{}`", self.tab_id.clone(), self.device, self.instance.get_url()) + } +} + struct ChromeSession { browser: Browser, - tabs: HashMap>, + tabs: HashMap, } impl ChromeSession { @@ -140,15 +168,23 @@ impl Tool for ToolChrome { } fn tool_description(&self) -> ToolDesc { - let mut commands_desc = r#"One or several commands separated by newline. The is an integer, for example 10, for you to identify the tab later. Supported commands: -navigate_to -screenshot -html -reload -device "#.to_string(); + let mut supported_commands = vec![ + "open_tab ", + "navigate_to ", + "screenshot ", + "html ", + "reload ", + ]; if self.supports_clicks { - commands_desc = format!("{}\nclick \ninsert_text \n", commands_desc); + supported_commands.extend(vec![ + "click ", + "insert_text ", + ]); } + let description = format!( + "One or several commands separated by newline. \ + The is an integer, for example 10, for you to identify the tab later. \ + Supported commands:\n{}", supported_commands.join("\n")); ToolDesc { name: "chrome".to_string(), agentic: true, @@ -157,7 +193,7 @@ device "#.to_string(); parameters: vec![ToolParam { name: "commands".to_string(), param_type: "string".to_string(), - description: commands_desc, + description, }], parameters_required: vec!["commands".to_string()], } @@ -224,49 +260,103 @@ async fn setup_chrome_session( Ok(setup_log) } -async fn navigate_to(tab: &Arc, url: &String) -> Result { - tab.navigate_to(url.as_str()).map_err(|e| e.to_string())?; - tab.wait_until_navigated().map_err(|e| e.to_string())?; - Ok(format!("Chrome tab navigated to {}", tab.get_url())) +async fn navigate_to(instance: &Arc, url: &String) -> Result<(), String> { + instance.navigate_to(url.as_str()).map_err(|e| e.to_string())?; + instance.wait_until_navigated().map_err(|e| e.to_string())?; + Ok(()) } -async fn click_on_point(tab: &Arc, point: &Point) -> Result { - tab.click_point(point.clone()).map_err(|e| e.to_string())?; - tab.wait_until_navigated().map_err(|e| e.to_string())?; - Ok(format!("clicked on `{} {}`", point.x, point.y)) +async fn screenshot_jpeg_base64(instance: &Arc, capture_beyond_viewport: bool) -> Result { + let jpeg_data = instance.call_method(Page::CaptureScreenshot { + format: Some(Page::CaptureScreenshotFormatOption::Jpeg), + clip: None, + quality: Some(75), + from_surface: Some(true), + capture_beyond_viewport: Some(capture_beyond_viewport), + }).map_err(|e| e.to_string())?.data; + MultimodalElement::new("image/jpeg".to_string(), jpeg_data) +} + +async fn inner_html(url: String) -> Result { + let client = Client::builder() + .build() + .map_err(|e| e.to_string())?; + let response = client.get(url.clone()).send().await.map_err(|e| e.to_string())?; + if response.status().is_success() { + let html = response.text().await.map_err(|e| e.to_string())?; + Ok(html) + } else { + Err(format!("status: {}", response.status())) + } } -async fn insert_text(tab: &Arc, text: &String) -> Result { - tab.type_str(text.as_str()).map_err(|e| e.to_string())?; - Ok(format!("inserted text `{}`", text.clone())) +async fn click_on_point(instance: &Arc, point: &Point) -> Result<(), String> { + instance.click_point(point.clone()).map_err(|e| e.to_string())?; + instance.wait_until_navigated().map_err(|e| e.to_string())?; + Ok(()) } async fn session_open_tab( chrome_session: &mut ChromeSession, - tab_name: &String, -) -> Result<(Arc, String), String> { - match chrome_session.tabs.get(tab_name) { + tab_id: &String, + device: &DeviceType, +) -> Result { + match chrome_session.tabs.get(tab_id) { Some(tab) => { - Ok((tab.clone(), format!("Using opened tab {}\n", tab_name.clone()))) + Err(format!("Tab is already opened: {}\n", tab.state_string())) }, None => { - let tab = chrome_session.browser.new_tab().map_err(|e| e.to_string())?; - chrome_session.tabs.insert(tab_name.clone(), tab.clone()); - Ok((tab, format!("Opened new tab {}\n", tab_name.clone()))) + let instance = chrome_session.browser.new_tab().map_err(|e| e.to_string())?; + match device { + DeviceType::MOBILE => { + instance.call_method(Emulation::SetDeviceMetricsOverride { + width: 375, + height: 812, + device_scale_factor: 0.0, + mobile: true, + scale: None, + screen_width: None, + screen_height: None, + position_x: None, + position_y: None, + dont_set_visible_size: None, + screen_orientation: None, + viewport: None, + display_feature: None, + }).map_err(|e| e.to_string())?; + }, + DeviceType::DESKTOP => { + instance.call_method(Emulation::ClearDeviceMetricsOverride(None)).map_err(|e| e.to_string())?; + } + } + let tab = ChromeTab{ + instance, + tab_id: tab_id.clone(), + device: device.clone(), + }; + chrome_session.tabs.insert(tab_id.clone(), tab.clone()); + Ok(format!("opened a new tab: {}\n", tab.state_string())) } } } +async fn session_get_tab( + chrome_session: &mut ChromeSession, + tab_id: &String, +) -> Result { + match chrome_session.tabs.get(tab_id) { + Some(tab) => Ok(tab.clone()), + None => Err(format!("tab_id {} is not opened", tab_id)), + } +} + #[derive(Debug)] enum Command { - // TODO: probably we need connect command - // if we're tying to operate on non-existing tab (no connection or something like this) - // we should not auto-open connection again + OpenTab(OpenTabArgs), NavigateTo(NavigateToArgs), Screenshot(ScreenshotArgs), Html(HtmlArgs), Reload(ReloadArgs), - Device(DeviceArgs), Click(ClickArgs), InsertText(InsertTextArgs), } @@ -274,86 +364,66 @@ enum Command { impl Command { pub async fn execute( &self, - chrome_session: &mut ChromeSession + chrome_session: &mut ChromeSession, ) -> Result<(Vec, Vec), String> { let mut tool_log = vec![]; let mut multimodal_els = vec![]; match self { + Command::OpenTab(args) => { + let log = session_open_tab(chrome_session, &args.tab_id, &args.device).await?; + tool_log.push(log); + }, Command::NavigateTo(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let content = navigate_to(&tab, &args.uri).await.map_err( - |e| format!("Can't navigate_to `{}` on tab `{}`: {}. If you're trying to open a local file, add a file:// prefix.", args.uri, args.tab_id, e) - )?; - tool_log.push(content); + let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let log = match navigate_to(&tab.instance, &args.uri).await { + Ok(_) => format!("navigate_to successful: {}", tab.state_string()), + Err(e) => format!("navigate_to `{}` failed: {}. If you're trying to open a local file, add a file:// prefix.", args.uri, e.to_string()), + }; + tool_log.push(log); }, Command::Screenshot(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let screenshot = screenshot_jpeg_base64(&tab, false).await?; - tool_log.push(format!("Made a screenshot of {}", tab.get_url())); - multimodal_els.push(screenshot); + let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let log = match screenshot_jpeg_base64(&tab.instance, false).await { + Ok(multimodal_el) => { + multimodal_els.push(multimodal_el); + format!("made a screenshot of {}", tab.state_string()) + }, + Err(e) => format!("screenshot failed for {}: {}", tab.state_string(), e.to_string()), + }; + tool_log.push(log); }, Command::Html(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let client = Client::builder() - .build() - .map_err(|e| e.to_string())?; - let url = tab.get_url(); - let response = client.get(url.clone()).send().await.map_err(|e| e.to_string())?; - if !response.status().is_success() { - tool_log.push(format!("Unable to fetch url: {}; status: {}", url, response.status())); - } else { - tool_log.push(response.text().await.map_err(|e| e.to_string())?); - } + let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let log = match inner_html(tab.instance.get_url()).await { + Ok(html) => format!("innerHtml of {}:\n\n{}", tab.state_string(), html), + Err(e) => format!("can't fetch innerHtml of {}: {}", tab.state_string(), e.to_string()), + }; + tool_log.push(log); }, Command::Reload(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - tab.reload(false, None).map_err(|e| e.to_string())?; - tool_log.push(format!("Page `{}` on tab `{}` reloaded", tab.get_url(), args.tab_id)); - }, - Command::Device(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - match args.device { - DeviceType::MOBILE => { - tab.call_method(Emulation::SetDeviceMetricsOverride { - width: 375, - height: 812, - device_scale_factor: 0.0, - mobile: true, - scale: None, - screen_width: None, - screen_height: None, - position_x: None, - position_y: None, - dont_set_visible_size: None, - screen_orientation: None, - viewport: None, - display_feature: None, - }).map_err(|e| e.to_string())?; - tool_log.push(format!("Tab `{}` set to mobile view", args.tab_id)); - }, - DeviceType::DESKTOP => { - tab.call_method(Emulation::ClearDeviceMetricsOverride(None)).map_err(|e| e.to_string())?; - tool_log.push(format!("Tab `{}` set to desktop view", args.tab_id)); - } - } + let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let log = match tab.instance.reload(false, None) { + Ok(_) => format!("reload of {} successful", tab.state_string()), + Err(e) => format!("reload of {} failed: {}", tab.state_string(), e.to_string()), + }; + tool_log.push(log); }, Command::Click(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let content = click_on_point(&tab, &args.point).await?; - tool_log.push(content); + let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let log = match click_on_point(&tab.instance, &args.point).await { + Ok(_) => format!("clicked on `{} {}` at {}", args.point.x, args.point.y, tab.state_string()), + Err(e) => format!("clicked on `{} {}` failed at {}: {}", args.point.x, args.point.y, tab.state_string(), e.to_string()), + }; + tool_log.push(log); }, Command::InsertText(args) => { - let (tab, open_tab_log) = session_open_tab(chrome_session, &args.tab_id).await?; - tool_log.push(open_tab_log); - let content = insert_text(&tab, &args.text).await?; - tool_log.push(content); + let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let log = match tab.instance.type_str(args.text.as_str()) { + Ok(_) => format!("insert_text `{}` to {}", args.text, tab.state_string()), + Err(e) => format!("insert_text failed to {}: {}", tab.state_string(), e.to_string()), + }; + tool_log.push(log); }, } @@ -361,6 +431,12 @@ impl Command { } } +#[derive(Debug)] +struct OpenTabArgs { + device: DeviceType, + tab_id: String, +} + #[derive(Debug)] struct NavigateToArgs { uri: String, @@ -394,19 +470,6 @@ struct InsertTextArgs { tab_id: String, } - -#[derive(Debug)] -enum DeviceType { - DESKTOP, - MOBILE, -} - -#[derive(Debug)] -struct DeviceArgs { - device: DeviceType, - tab_id: String, -} - fn parse_single_command(command: &String) -> Result { let args = shell_words::split(&command).map_err(|e| e.to_string())?; if args.is_empty() { @@ -416,6 +479,20 @@ fn parse_single_command(command: &String) -> Result { let (command_name, parsed_args) = (args[0].clone(), args[1..].to_vec()); match command_name.as_str() { + "open_tab" => { + if parsed_args.len() < 2 { + return Err(format!("`open_tab` requires 2 arguments: `` and `tab_id`. Provided: {:?}", parsed_args)); + } + let device = match parsed_args[0].as_str() { + "desktop" => DeviceType::DESKTOP, + "mobile" => DeviceType::MOBILE, + _ => return Err(format!("unknown device type: {}. Should be either `desktop` or `mobile`.", parsed_args[0])) + }; + Ok(Command::OpenTab(OpenTabArgs { + device, + tab_id: parsed_args[1].clone(), + })) + }, "navigate_to" => { if parsed_args.len() < 2 { return Err(format!("`navigate_to` requires 2 arguments: `uri` and `tab_id`. Provided: {:?}", parsed_args)); @@ -449,19 +526,6 @@ fn parse_single_command(command: &String) -> Result { tab_id: parsed_args[0].clone(), })) }, - "device" => { - if parsed_args.len() < 2 { - return Err(format!("`device` requires 2 arguments: `desktop|mobile` and `tab_id`. Provided: {:?}", parsed_args)); - } - Ok(Command::Device(DeviceArgs { - device: match parsed_args[0].as_str() { - "desktop" => DeviceType::DESKTOP, - "mobile" => DeviceType::MOBILE, - _ => return Err(format!("Unknown device type: {}. Should be either `desktop` or `mobile`.", parsed_args[0])) - }, - tab_id: parsed_args[1].clone(), - })) - }, "click" => { match parsed_args.as_slice() { [x_str, y_str, tab_id] => { @@ -522,15 +586,3 @@ async fn interact_with_chrome( Ok(multimodal_els) } - -async fn screenshot_jpeg_base64(tab: &Arc, capture_beyond_viewport: bool) -> Result { - let jpeg_data = tab.call_method(Page::CaptureScreenshot { - format: Some(Page::CaptureScreenshotFormatOption::Jpeg), - clip: None, - quality: Some(75), - from_surface: Some(true), - capture_beyond_viewport: Some(capture_beyond_viewport), - }).map_err(|e| e.to_string())?.data; - - MultimodalElement::new("image/jpeg".to_string(), jpeg_data) -} From 7d9afcfaa3eba62761a7382c1813089b0ad34d7d Mon Sep 17 00:00:00 2001 From: mitya Date: Fri, 8 Nov 2024 14:33:46 +0300 Subject: [PATCH 02/14] chrome suppress command logs into one text element --- src/integrations/integr_chrome.rs | 70 +++++++++++-------------------- 1 file changed, 25 insertions(+), 45 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 074bd61a5..7052079a4 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -125,37 +125,45 @@ impl Tool for ToolChrome { None => return Err("Missing argument `commands`".to_string()) }; - let mut content = vec![]; + let session_hashmap_key = get_session_hashmap_key("chrome", &chat_id); + let mut tool_log = setup_chrome_session(gcx.clone(), &self.integration_chrome, &session_hashmap_key).await?; + + let command_session = { + let gcx_locked = gcx.read().await; + gcx_locked.integration_sessions.get(&session_hashmap_key) + .ok_or(format!("Error getting chrome session for chat: {}", chat_id))? + .clone() + }; + let mut command_session_locked = command_session.lock().await; + let chrome_session = command_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + + let mut mutlimodal_els = vec![]; for command in commands_str.lines().map(|s| s.trim()).collect::>() { let parsed_command = match parse_single_command(&command.to_string()) { Ok(command) => command, Err(e) => { - content.push(MultimodalElement::new( - "text".to_string(), - format!("Failed to parse command: {}. Error: {}.", command, e) - )?); + tool_log.push(format!("failed to parse command `{}`: {}.", command, e)); break } }; - match interact_with_chrome( - gcx.clone(), - &chat_id, - &self.integration_chrome, - &parsed_command, - ).await { - Ok(command_content) => { - content.extend(command_content); + match parsed_command.execute(chrome_session).await { + Ok((execute_log, command_multimodal_els)) => { + tool_log.extend(execute_log); + mutlimodal_els.extend(command_multimodal_els); }, Err(e) => { - content.push(MultimodalElement::new( - "text".to_string(), - format!("Failed to execute command: {}. Error: {}.", command, e) - )?); + tool_log.push(format!("failed to execute command `{}`: {}.", command, e)); break } }; } + let mut content= vec![]; + content.push(MultimodalElement::new( + "text".to_string(), tool_log.join("\n") + )?); + content.extend(mutlimodal_els); + let msg = ContextEnum::ChatMessage(ChatMessage { role: "tool".to_string(), content: ChatContent::Multimodal(content), @@ -558,31 +566,3 @@ fn parse_single_command(command: &String) -> Result { _ => Err(format!("Unknown command: {:?}.", command_name)), } } - -async fn interact_with_chrome( - gcx: Arc>, - chat_id: &String, - integration_chrome: &IntegrationChrome, - command: &Command, -) -> Result, String> { - let session_hashmap_key = get_session_hashmap_key("chrome", &chat_id); - let setup_log = setup_chrome_session(gcx.clone(), &integration_chrome, &session_hashmap_key).await?; - - let command_session = { - let gcx_locked = gcx.read().await; - gcx_locked.integration_sessions.get(&session_hashmap_key) - .ok_or(format!("Error getting chrome session for chat: {}", chat_id))? - .clone() - }; - let mut command_session_locked = command_session.lock().await; - let chrome_session = command_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; - - let (execute_log, mut multimodal_els) = command.execute(chrome_session).await?; - - let tool_log = setup_log.iter().chain(execute_log.iter()).map(|s| s.clone()).collect::>(); - multimodal_els.push(MultimodalElement::new( - "text".to_string(), tool_log.join("\n") - )?); - - Ok(multimodal_els) -} From 5f9fb5116e07859d53447d11f6c7015bcb260792 Mon Sep 17 00:00:00 2001 From: mitya Date: Fri, 8 Nov 2024 17:25:46 +0300 Subject: [PATCH 03/14] resize of screenshot --- src/integrations/integr_chrome.rs | 85 ++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 7052079a4..9316391c6 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -23,6 +23,11 @@ use headless_chrome::protocol::cdp::Emulation; use serde::{Deserialize, Serialize}; use std::fmt; +use base64::Engine; +use std::io::Cursor; +use image::imageops::FilterType; +use image::{ImageFormat, ImageReader}; + #[derive(Clone, Serialize, Deserialize, Debug)] pub struct IntegrationChrome { pub chrome_path: Option, @@ -59,9 +64,18 @@ pub struct ChromeTab { instance: Arc, device: DeviceType, tab_id: String, + screenshot_scale_factor: f64, } impl ChromeTab { + fn new(instance: Arc, device: &DeviceType, tab_id: &String) -> Self { + Self { + instance, + device: device.clone(), + tab_id: tab_id.clone(), + screenshot_scale_factor: 1.0, + } + } pub fn state_string(&self) -> String { format!("tab_id `{}` device `{}` uri `{}`", self.tab_id.clone(), self.device, self.instance.get_url()) } @@ -274,15 +288,36 @@ async fn navigate_to(instance: &Arc, url: &String) -> Result<(), String> { Ok(()) } -async fn screenshot_jpeg_base64(instance: &Arc, capture_beyond_viewport: bool) -> Result { - let jpeg_data = instance.call_method(Page::CaptureScreenshot { +async fn screenshot_jpeg_base64( + tab: &mut ChromeTab, + capture_beyond_viewport: bool, +) -> Result { + let jpeg_base64_data = tab.instance.call_method(Page::CaptureScreenshot { format: Some(Page::CaptureScreenshotFormatOption::Jpeg), clip: None, quality: Some(75), from_surface: Some(true), capture_beyond_viewport: Some(capture_beyond_viewport), }).map_err(|e| e.to_string())?.data; - MultimodalElement::new("image/jpeg".to_string(), jpeg_data) + + let mut data = base64::prelude::BASE64_STANDARD + .decode(jpeg_base64_data).map_err(|e| e.to_string())?; + let reader = ImageReader::with_format(Cursor::new(data), ImageFormat::Jpeg); + let mut image = reader.decode().map_err(|e| e.to_string())?; + + let max_dimension = 800.0; + let scale_factor = max_dimension / std::cmp::max(image.width(), image.height()) as f32; + if scale_factor < 1.0 { + // NOTE: the tool operates on resized image well without a special model notification + let (nwidth, nheight) = (scale_factor * image.width() as f32, scale_factor * image.height() as f32); + image = image.resize(nwidth as u32, nheight as u32, FilterType::Lanczos3); + tab.screenshot_scale_factor = scale_factor as f64; + } + + data = Vec::new(); + image.write_to(&mut Cursor::new(&mut data), ImageFormat::Jpeg).map_err(|e| e.to_string())?; + + MultimodalElement::new("image/jpeg".to_string(), base64::prelude::BASE64_STANDARD.encode(data)) } async fn inner_html(url: String) -> Result { @@ -298,9 +333,13 @@ async fn inner_html(url: String) -> Result { } } -async fn click_on_point(instance: &Arc, point: &Point) -> Result<(), String> { - instance.click_point(point.clone()).map_err(|e| e.to_string())?; - instance.wait_until_navigated().map_err(|e| e.to_string())?; +async fn click_on_point(tab: &ChromeTab, point: &Point) -> Result<(), String> { + let mapped_point = Point { + x: point.x / tab.screenshot_scale_factor, + y: point.y / tab.screenshot_scale_factor, + }; + tab.instance.click_point(mapped_point).map_err(|e| e.to_string())?; + tab.instance.wait_until_navigated().map_err(|e| e.to_string())?; Ok(()) } @@ -337,23 +376,19 @@ async fn session_open_tab( instance.call_method(Emulation::ClearDeviceMetricsOverride(None)).map_err(|e| e.to_string())?; } } - let tab = ChromeTab{ - instance, - tab_id: tab_id.clone(), - device: device.clone(), - }; - chrome_session.tabs.insert(tab_id.clone(), tab.clone()); + let tab = ChromeTab::new(instance, device, tab_id); + chrome_session.tabs.insert(tab.tab_id.clone(), tab.clone()); Ok(format!("opened a new tab: {}\n", tab.state_string())) } } } -async fn session_get_tab( - chrome_session: &mut ChromeSession, +async fn session_get_tab_mut<'a>( + chrome_session: &'a mut ChromeSession, tab_id: &String, -) -> Result { - match chrome_session.tabs.get(tab_id) { - Some(tab) => Ok(tab.clone()), +) -> Result<&'a mut ChromeTab, String> { + match chrome_session.tabs.get_mut(tab_id) { + Some(tab) => Ok(tab), None => Err(format!("tab_id {} is not opened", tab_id)), } } @@ -383,7 +418,7 @@ impl Command { tool_log.push(log); }, Command::NavigateTo(args) => { - let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; let log = match navigate_to(&tab.instance, &args.uri).await { Ok(_) => format!("navigate_to successful: {}", tab.state_string()), Err(e) => format!("navigate_to `{}` failed: {}. If you're trying to open a local file, add a file:// prefix.", args.uri, e.to_string()), @@ -391,8 +426,8 @@ impl Command { tool_log.push(log); }, Command::Screenshot(args) => { - let tab = session_get_tab(chrome_session, &args.tab_id).await?; - let log = match screenshot_jpeg_base64(&tab.instance, false).await { + let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; + let log = match screenshot_jpeg_base64(tab, false).await { Ok(multimodal_el) => { multimodal_els.push(multimodal_el); format!("made a screenshot of {}", tab.state_string()) @@ -402,7 +437,7 @@ impl Command { tool_log.push(log); }, Command::Html(args) => { - let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; let log = match inner_html(tab.instance.get_url()).await { Ok(html) => format!("innerHtml of {}:\n\n{}", tab.state_string(), html), Err(e) => format!("can't fetch innerHtml of {}: {}", tab.state_string(), e.to_string()), @@ -410,7 +445,7 @@ impl Command { tool_log.push(log); }, Command::Reload(args) => { - let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; let log = match tab.instance.reload(false, None) { Ok(_) => format!("reload of {} successful", tab.state_string()), Err(e) => format!("reload of {} failed: {}", tab.state_string(), e.to_string()), @@ -418,15 +453,15 @@ impl Command { tool_log.push(log); }, Command::Click(args) => { - let tab = session_get_tab(chrome_session, &args.tab_id).await?; - let log = match click_on_point(&tab.instance, &args.point).await { + let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; + let log = match click_on_point(&tab, &args.point).await { Ok(_) => format!("clicked on `{} {}` at {}", args.point.x, args.point.y, tab.state_string()), Err(e) => format!("clicked on `{} {}` failed at {}: {}", args.point.x, args.point.y, tab.state_string(), e.to_string()), }; tool_log.push(log); }, Command::InsertText(args) => { - let tab = session_get_tab(chrome_session, &args.tab_id).await?; + let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; let log = match tab.instance.type_str(args.text.as_str()) { Ok(_) => format!("insert_text `{}` to {}", args.text, tab.state_string()), Err(e) => format!("insert_text failed to {}: {}", tab.state_string(), e.to_string()), From 6f0b68474e57a84ddabc2e763d9ead9f2dfe1a14 Mon Sep 17 00:00:00 2001 From: mitya Date: Fri, 8 Nov 2024 17:42:16 +0300 Subject: [PATCH 04/14] click -> click_at, insert_text -> insert_text_at --- src/integrations/integr_chrome.rs | 36 +++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 9316391c6..ee4149de3 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -199,8 +199,8 @@ impl Tool for ToolChrome { ]; if self.supports_clicks { supported_commands.extend(vec![ - "click ", - "insert_text ", + "click_at ", + "type_text_at ", ]); } let description = format!( @@ -333,7 +333,7 @@ async fn inner_html(url: String) -> Result { } } -async fn click_on_point(tab: &ChromeTab, point: &Point) -> Result<(), String> { +async fn click_point(tab: &ChromeTab, point: &Point) -> Result<(), String> { let mapped_point = Point { x: point.x / tab.screenshot_scale_factor, y: point.y / tab.screenshot_scale_factor, @@ -400,8 +400,8 @@ enum Command { Screenshot(ScreenshotArgs), Html(HtmlArgs), Reload(ReloadArgs), - Click(ClickArgs), - InsertText(InsertTextArgs), + ClickAt(ClickAtArgs), + TypeTextAt(TypeTextAtArgs), } impl Command { @@ -452,19 +452,19 @@ impl Command { }; tool_log.push(log); }, - Command::Click(args) => { + Command::ClickAt(args) => { let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; - let log = match click_on_point(&tab, &args.point).await { - Ok(_) => format!("clicked on `{} {}` at {}", args.point.x, args.point.y, tab.state_string()), - Err(e) => format!("clicked on `{} {}` failed at {}: {}", args.point.x, args.point.y, tab.state_string(), e.to_string()), + let log = match click_point(&tab, &args.point).await { + Ok(_) => format!("clicked `{} {}` at {}", args.point.x, args.point.y, tab.state_string()), + Err(e) => format!("clicked `{} {}` failed at {}: {}", args.point.x, args.point.y, tab.state_string(), e.to_string()), }; tool_log.push(log); }, - Command::InsertText(args) => { + Command::TypeTextAt(args) => { let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; let log = match tab.instance.type_str(args.text.as_str()) { - Ok(_) => format!("insert_text `{}` to {}", args.text, tab.state_string()), - Err(e) => format!("insert_text failed to {}: {}", tab.state_string(), e.to_string()), + Ok(_) => format!("type `{}` at {}", args.text, tab.state_string()), + Err(e) => format!("type text failed at {}: {}", tab.state_string(), e.to_string()), }; tool_log.push(log); }, @@ -502,13 +502,13 @@ struct ReloadArgs { } #[derive(Debug)] -struct ClickArgs { +struct ClickAtArgs { point: Point, tab_id: String, } #[derive(Debug)] -struct InsertTextArgs { +struct TypeTextAtArgs { text: String, tab_id: String, } @@ -569,13 +569,13 @@ fn parse_single_command(command: &String) -> Result { tab_id: parsed_args[0].clone(), })) }, - "click" => { + "click_at" => { match parsed_args.as_slice() { [x_str, y_str, tab_id] => { let x = x_str.parse::().map_err(|e| format!("Failed to parse x: {}", e))?; let y = y_str.parse::().map_err(|e| format!("Failed to parse y: {}", e))?; let point = Point { x, y }; - Ok(Command::Click(ClickArgs { + Ok(Command::ClickAt(ClickAtArgs { point, tab_id: tab_id.clone(), })) @@ -585,10 +585,10 @@ fn parse_single_command(command: &String) -> Result { } } }, - "insert_text" => { + "type_text_at" => { match parsed_args.as_slice() { [text, tab_id] => { - Ok(Command::InsertText(InsertTextArgs { + Ok(Command::TypeTextAt(TypeTextAtArgs { text: text.clone(), tab_id: tab_id.clone(), })) From 67edd79a1fcf8d5d43b626bacf321eff8657bb86 Mon Sep 17 00:00:00 2001 From: mitya Date: Fri, 8 Nov 2024 18:16:03 +0300 Subject: [PATCH 05/14] press_key_at command for chrome --- src/integrations/integr_chrome.rs | 55 +++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index ee4149de3..df4bb1187 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -201,6 +201,7 @@ impl Tool for ToolChrome { supported_commands.extend(vec![ "click_at ", "type_text_at ", + "press_key_at ", ]); } let description = format!( @@ -402,6 +403,7 @@ enum Command { Reload(ReloadArgs), ClickAt(ClickAtArgs), TypeTextAt(TypeTextAtArgs), + PressKeyAt(PressKeyAtArgs), } impl Command { @@ -468,6 +470,14 @@ impl Command { }; tool_log.push(log); }, + Command::PressKeyAt(args) => { + let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; + let log = match tab.instance.press_key(args.key.to_string().as_str()) { + Ok(_) => format!("press `{}` at {}", args.key, tab.state_string()), + Err(e) => format!("press `{}` failed at {}: {}", args.key, tab.state_string(), e.to_string()), + }; + tool_log.push(log); + } } Ok((tool_log, multimodal_els)) @@ -513,6 +523,31 @@ struct TypeTextAtArgs { tab_id: String, } +#[derive(Clone, Debug)] +enum Key { + ENTER, + ESC, + PAGEUP, + PAGEDOWN, +} + +impl fmt::Display for Key { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Key::ENTER => write!(f, "Enter"), + Key::ESC => write!(f, "Escape"), + Key::PAGEUP => write!(f, "PageUp"), + Key::PAGEDOWN => write!(f, "PageDown"), + } + } +} + +#[derive(Debug)] +struct PressKeyAtArgs { + key: Key, + tab_id: String, +} + fn parse_single_command(command: &String) -> Result { let args = shell_words::split(&command).map_err(|e| e.to_string())?; if args.is_empty() { @@ -598,6 +633,26 @@ fn parse_single_command(command: &String) -> Result { } } }, + "press_key_at" => { + match parsed_args.as_slice() { + [key_str, tab_id] => { + let key = match key_str.to_lowercase().as_str() { + "enter" => Key::ENTER, + "esc" => Key::ESC, + "pageup" => Key::PAGEUP, + "pagedown" => Key::PAGEDOWN, + _ => return Err(format!("Unknown key: {}", key_str)), + }; + Ok(Command::PressKeyAt(PressKeyAtArgs { + key, + tab_id: tab_id.clone(), + })) + }, + _ => { + Err("Missing one or several arguments 'key', 'tab_id'".to_string()) + } + } + }, _ => Err(format!("Unknown command: {:?}.", command_name)), } } From c9a5bbedb125c97943ecd758ac698cc71852e5ce Mon Sep 17 00:00:00 2001 From: Oleg Klimov Date: Tue, 12 Nov 2024 09:33:39 +0100 Subject: [PATCH 06/14] multithread fix 1 --- src/integrations/integr_chrome.rs | 299 +++++++++++++++++++----------- 1 file changed, 192 insertions(+), 107 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index df4bb1187..87cb75c60 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -16,7 +16,7 @@ use crate::tools::tools_description::{Tool, ToolDesc, ToolParam}; use reqwest::Client; use std::path::PathBuf; -use headless_chrome::{Browser, LaunchOptions, Tab}; +use headless_chrome::{Browser, LaunchOptions, Tab as HeadlessTab}; use headless_chrome::browser::tab::point::Point; use headless_chrome::protocol::cdp::Page; use headless_chrome::protocol::cdp::Emulation; @@ -61,29 +61,29 @@ impl fmt::Display for DeviceType { #[derive(Clone)] pub struct ChromeTab { - instance: Arc, + headless_tab: Arc, device: DeviceType, tab_id: String, screenshot_scale_factor: f64, } impl ChromeTab { - fn new(instance: Arc, device: &DeviceType, tab_id: &String) -> Self { + fn new(headless_tab: Arc, device: &DeviceType, tab_id: &String) -> Self { Self { - instance, + headless_tab, device: device.clone(), tab_id: tab_id.clone(), screenshot_scale_factor: 1.0, } } pub fn state_string(&self) -> String { - format!("tab_id `{}` device `{}` uri `{}`", self.tab_id.clone(), self.device, self.instance.get_url()) + format!("tab_id `{}` device `{}` uri `{}`", self.tab_id.clone(), self.device, self.headless_tab.get_url()) } } struct ChromeSession { browser: Browser, - tabs: HashMap, + tabs: HashMap>>, } impl ChromeSession { @@ -148,8 +148,6 @@ impl Tool for ToolChrome { .ok_or(format!("Error getting chrome session for chat: {}", chat_id))? .clone() }; - let mut command_session_locked = command_session.lock().await; - let chrome_session = command_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; let mut mutlimodal_els = vec![]; for command in commands_str.lines().map(|s| s.trim()).collect::>() { @@ -160,7 +158,7 @@ impl Tool for ToolChrome { break } }; - match parsed_command.execute(chrome_session).await { + match chrome_command_exec(&parsed_command, command_session.clone()).await { Ok((execute_log, command_multimodal_els)) => { tool_log.extend(execute_log); mutlimodal_els.extend(command_multimodal_els); @@ -283,17 +281,26 @@ async fn setup_chrome_session( Ok(setup_log) } -async fn navigate_to(instance: &Arc, url: &String) -> Result<(), String> { - instance.navigate_to(url.as_str()).map_err(|e| e.to_string())?; - instance.wait_until_navigated().map_err(|e| e.to_string())?; +async fn navigate_to(tab: Arc>, url: &String) -> Result<(), String> { + let tab_instance = { + let tab_lock = tab.lock().await; + tab_lock.headless_tab.clone() + }; + tab_instance.navigate_to(url.as_str()).map_err(|e| e.to_string())?; + tab_instance.wait_until_navigated().map_err(|e| e.to_string())?; Ok(()) } async fn screenshot_jpeg_base64( - tab: &mut ChromeTab, + tab: Arc>, capture_beyond_viewport: bool, ) -> Result { - let jpeg_base64_data = tab.instance.call_method(Page::CaptureScreenshot { + let chrome_tab = { + let tab_lock = tab.lock().await; + tab_lock.headless_tab.clone() + }; + + let jpeg_base64_data = chrome_tab.call_method(Page::CaptureScreenshot { format: Some(Page::CaptureScreenshotFormatOption::Jpeg), clip: None, quality: Some(75), @@ -312,7 +319,8 @@ async fn screenshot_jpeg_base64( // NOTE: the tool operates on resized image well without a special model notification let (nwidth, nheight) = (scale_factor * image.width() as f32, scale_factor * image.height() as f32); image = image.resize(nwidth as u32, nheight as u32, FilterType::Lanczos3); - tab.screenshot_scale_factor = scale_factor as f64; + let mut tab_lock = tab.lock().await; + tab_lock.screenshot_scale_factor = scale_factor as f64; } data = Vec::new(); @@ -334,13 +342,18 @@ async fn inner_html(url: String) -> Result { } } -async fn click_point(tab: &ChromeTab, point: &Point) -> Result<(), String> { - let mapped_point = Point { - x: point.x / tab.screenshot_scale_factor, - y: point.y / tab.screenshot_scale_factor, +async fn click_point(tab: Arc>, point: &Point) -> Result<(), String> { + let (mapped_point, headless_tab) = { + let tab_lock = tab.lock().await; + let mapped_point = Point { + x: point.x / tab_lock.screenshot_scale_factor, + y: point.y / tab_lock.screenshot_scale_factor, + }; + let headless_tab = tab_lock.headless_tab.clone(); + (mapped_point, headless_tab) }; - tab.instance.click_point(mapped_point).map_err(|e| e.to_string())?; - tab.instance.wait_until_navigated().map_err(|e| e.to_string())?; + headless_tab.click_point(mapped_point).map_err(|e| e.to_string())?; + headless_tab.wait_until_navigated().map_err(|e| e.to_string())?; Ok(()) } @@ -351,13 +364,14 @@ async fn session_open_tab( ) -> Result { match chrome_session.tabs.get(tab_id) { Some(tab) => { - Err(format!("Tab is already opened: {}\n", tab.state_string())) + let tab_lock = tab.lock().await; + Err(format!("Tab is already opened: {}\n", tab_lock.state_string())) }, None => { - let instance = chrome_session.browser.new_tab().map_err(|e| e.to_string())?; + let headless_tab = chrome_session.browser.new_tab().map_err(|e| e.to_string())?; match device { DeviceType::MOBILE => { - instance.call_method(Emulation::SetDeviceMetricsOverride { + headless_tab.call_method(Emulation::SetDeviceMetricsOverride { width: 375, height: 812, device_scale_factor: 0.0, @@ -374,22 +388,23 @@ async fn session_open_tab( }).map_err(|e| e.to_string())?; }, DeviceType::DESKTOP => { - instance.call_method(Emulation::ClearDeviceMetricsOverride(None)).map_err(|e| e.to_string())?; + headless_tab.call_method(Emulation::ClearDeviceMetricsOverride(None)).map_err(|e| e.to_string())?; } } - let tab = ChromeTab::new(instance, device, tab_id); - chrome_session.tabs.insert(tab.tab_id.clone(), tab.clone()); - Ok(format!("opened a new tab: {}\n", tab.state_string())) + let tab = Arc::new(AMutex::new(ChromeTab::new(headless_tab, device, tab_id))); + let tab_lock = tab.lock().await; + chrome_session.tabs.insert(tab_id.clone(), tab.clone()); + Ok(format!("opened a new tab: {}\n", tab_lock.state_string())) } } } -async fn session_get_tab_mut<'a>( - chrome_session: &'a mut ChromeSession, +async fn session_get_tab_arc( + chrome_session: &ChromeSession, tab_id: &String, -) -> Result<&'a mut ChromeTab, String> { - match chrome_session.tabs.get_mut(tab_id) { - Some(tab) => Ok(tab), +) -> Result>, String> { + match chrome_session.tabs.get(tab_id) { + Some(tab) => Ok(tab.clone()), None => Err(format!("tab_id {} is not opened", tab_id)), } } @@ -406,82 +421,152 @@ enum Command { PressKeyAt(PressKeyAtArgs), } -impl Command { - pub async fn execute( - &self, - chrome_session: &mut ChromeSession, - ) -> Result<(Vec, Vec), String> { - let mut tool_log = vec![]; - let mut multimodal_els = vec![]; - - match self { - Command::OpenTab(args) => { - let log = session_open_tab(chrome_session, &args.tab_id, &args.device).await?; - tool_log.push(log); - }, - Command::NavigateTo(args) => { - let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; - let log = match navigate_to(&tab.instance, &args.uri).await { - Ok(_) => format!("navigate_to successful: {}", tab.state_string()), - Err(e) => format!("navigate_to `{}` failed: {}. If you're trying to open a local file, add a file:// prefix.", args.uri, e.to_string()), - }; - tool_log.push(log); - }, - Command::Screenshot(args) => { - let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; - let log = match screenshot_jpeg_base64(tab, false).await { - Ok(multimodal_el) => { - multimodal_els.push(multimodal_el); - format!("made a screenshot of {}", tab.state_string()) - }, - Err(e) => format!("screenshot failed for {}: {}", tab.state_string(), e.to_string()), - }; - tool_log.push(log); - }, - Command::Html(args) => { - let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; - let log = match inner_html(tab.instance.get_url()).await { - Ok(html) => format!("innerHtml of {}:\n\n{}", tab.state_string(), html), - Err(e) => format!("can't fetch innerHtml of {}: {}", tab.state_string(), e.to_string()), - }; - tool_log.push(log); - }, - Command::Reload(args) => { - let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; - let log = match tab.instance.reload(false, None) { - Ok(_) => format!("reload of {} successful", tab.state_string()), - Err(e) => format!("reload of {} failed: {}", tab.state_string(), e.to_string()), - }; - tool_log.push(log); - }, - Command::ClickAt(args) => { - let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; - let log = match click_point(&tab, &args.point).await { - Ok(_) => format!("clicked `{} {}` at {}", args.point.x, args.point.y, tab.state_string()), - Err(e) => format!("clicked `{} {}` failed at {}: {}", args.point.x, args.point.y, tab.state_string(), e.to_string()), - }; - tool_log.push(log); - }, - Command::TypeTextAt(args) => { - let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; - let log = match tab.instance.type_str(args.text.as_str()) { - Ok(_) => format!("type `{}` at {}", args.text, tab.state_string()), - Err(e) => format!("type text failed at {}: {}", tab.state_string(), e.to_string()), - }; - tool_log.push(log); - }, - Command::PressKeyAt(args) => { - let tab = session_get_tab_mut(chrome_session, &args.tab_id).await?; - let log = match tab.instance.press_key(args.key.to_string().as_str()) { - Ok(_) => format!("press `{}` at {}", args.key, tab.state_string()), - Err(e) => format!("press `{}` failed at {}: {}", args.key, tab.state_string(), e.to_string()), - }; - tool_log.push(log); - } +async fn chrome_command_exec( + cmd: &Command, + chrome_session: Arc>>, +) -> Result<(Vec, Vec), String> { + let mut tool_log = vec![]; + let mut multimodal_els = vec![]; + + match cmd { + Command::OpenTab(args) => { + let log = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_open_tab(chrome_session, &args.tab_id, &args.device).await? + }; + tool_log.push(log); + }, + Command::NavigateTo(args) => { + let tab: Arc> = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = match navigate_to(tab.clone(), &args.uri).await { + Ok(_) => { + let tab_lock = tab.lock().await; + format!("navigate_to successful: {}", tab_lock.state_string()) + }, + Err(e) => { + // let tab_lock = tab.lock().await; + format!("navigate_to `{}` failed: {}. If you're trying to open a local file, add a file:// prefix.", args.uri, e.to_string()) + }, + }; + tool_log.push(log); + }, + Command::Screenshot(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = match screenshot_jpeg_base64(tab.clone(), false).await { + Ok(multimodal_el) => { + multimodal_els.push(multimodal_el); + let tab_lock = tab.lock().await; + format!("made a screenshot of {}", tab_lock.state_string()) + }, + Err(e) => { + let tab_lock = tab.lock().await; + format!("screenshot failed for {}: {}", tab_lock.state_string(), e.to_string()) + }, + }; + tool_log.push(log); + }, + Command::Html(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = match inner_html(tab.lock().await.headless_tab.get_url()).await { + Ok(html) => { + let tab_lock = tab.lock().await; + format!("innerHtml of {}:\n\n{}", tab_lock.state_string(), html) + }, + Err(e) => { + let tab_lock = tab.lock().await; + format!("can't fetch innerHtml of {}: {}", tab_lock.state_string(), e.to_string()) + }, + }; + tool_log.push(log); + }, + Command::Reload(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = match tab.lock().await.headless_tab.reload(false, None) { + Ok(_) => { + let tab_lock = tab.lock().await; + format!("reload of {} successful", tab_lock.state_string()) + }, + Err(e) => { + let tab_lock = tab.lock().await; + format!("reload of {} failed: {}", tab_lock.state_string(), e.to_string()) + }, + }; + tool_log.push(log); + }, + Command::ClickAt(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = match click_point(tab.clone(), &args.point).await { + Ok(_) => { + let tab_lock = tab.lock().await; + format!("clicked `{} {}` at {}", args.point.x, args.point.y, tab_lock.state_string()) + }, + Err(e) => { + let tab_lock = tab.lock().await; + format!("clicked `{} {}` failed at {}: {}", args.point.x, args.point.y, tab_lock.state_string(), e.to_string()) + }, + }; + tool_log.push(log); + }, + Command::TypeTextAt(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = match tab.lock().await.headless_tab.type_str(args.text.as_str()) { + Ok(_) => { + let tab_lock = tab.lock().await; + format!("type `{}` at {}", args.text, tab_lock.state_string()) + }, + Err(e) => { + let tab_lock = tab.lock().await; + format!("type text failed at {}: {}", tab_lock.state_string(), e.to_string()) + }, + }; + tool_log.push(log); + }, + Command::PressKeyAt(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let log = match tab.lock().await.headless_tab.press_key(args.key.to_string().as_str()) { + Ok(_) => { + let tab_lock = tab.lock().await; + format!("press `{}` at {}", args.key, tab_lock.state_string()) + }, + Err(e) => { + let tab_lock = tab.lock().await; + format!("press `{}` failed at {}: {}", args.key, tab_lock.state_string(), e.to_string()) + }, + }; + tool_log.push(log); } - - Ok((tool_log, multimodal_els)) } + + Ok((tool_log, multimodal_els)) } #[derive(Debug)] From e9311efaa2f01471bf185c08deba8bd99e5edaf1 Mon Sep 17 00:00:00 2001 From: Oleg Klimov Date: Tue, 12 Nov 2024 09:38:22 +0100 Subject: [PATCH 07/14] multithreaded fix 2 --- src/integrations/integr_chrome.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 87cb75c60..91bdf3b51 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -480,7 +480,8 @@ async fn chrome_command_exec( let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; session_get_tab_arc(chrome_session, &args.tab_id).await? }; - let log = match inner_html(tab.lock().await.headless_tab.get_url()).await { + let url = tab.lock().await.headless_tab.get_url(); + let log = match inner_html(url).await { Ok(html) => { let tab_lock = tab.lock().await; format!("innerHtml of {}:\n\n{}", tab_lock.state_string(), html) From e165368949eff426990206cf5e19ed6d95ed2889 Mon Sep 17 00:00:00 2001 From: Oleg Klimov Date: Tue, 12 Nov 2024 09:38:29 +0100 Subject: [PATCH 08/14] warning --- src/tools/tool_patch_aux/tickets_parsing.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tools/tool_patch_aux/tickets_parsing.rs b/src/tools/tool_patch_aux/tickets_parsing.rs index ac3271901..bdf596fcf 100644 --- a/src/tools/tool_patch_aux/tickets_parsing.rs +++ b/src/tools/tool_patch_aux/tickets_parsing.rs @@ -71,7 +71,7 @@ pub struct TicketToApply { } pub fn good_error_text(reason: &str, tickets: &Vec, resolution: Option) -> (String, Option) { - let mut text = format!("Couldn't create patch for tickets: '{}'.\nReason: {reason}", tickets.join(", ")); + let text = format!("Couldn't create patch for tickets: '{}'.\nReason: {reason}", tickets.join(", ")); if let Some(resolution) = resolution { let cd_format = format!("💿 {resolution}"); return (text, Some(cd_format)) From 131e29e88ee35eed33f8381af5073cf61d8c4e13 Mon Sep 17 00:00:00 2001 From: mitya Date: Fri, 15 Nov 2024 21:46:18 +0300 Subject: [PATCH 09/14] fix deadlocks --- src/integrations/integr_chrome.rs | 220 +++++++++++++++--------------- 1 file changed, 107 insertions(+), 113 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 91bdf3b51..d4bb14b0d 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -281,33 +281,21 @@ async fn setup_chrome_session( Ok(setup_log) } -async fn navigate_to(tab: Arc>, url: &String) -> Result<(), String> { - let tab_instance = { - let tab_lock = tab.lock().await; - tab_lock.headless_tab.clone() - }; - tab_instance.navigate_to(url.as_str()).map_err(|e| e.to_string())?; - tab_instance.wait_until_navigated().map_err(|e| e.to_string())?; - Ok(()) -} - async fn screenshot_jpeg_base64( tab: Arc>, capture_beyond_viewport: bool, ) -> Result { - let chrome_tab = { + let jpeg_base64_data = { let tab_lock = tab.lock().await; - tab_lock.headless_tab.clone() + tab_lock.headless_tab.call_method(Page::CaptureScreenshot { + format: Some(Page::CaptureScreenshotFormatOption::Jpeg), + clip: None, + quality: Some(75), + from_surface: Some(true), + capture_beyond_viewport: Some(capture_beyond_viewport), + }).map_err(|e| e.to_string())?.data }; - let jpeg_base64_data = chrome_tab.call_method(Page::CaptureScreenshot { - format: Some(Page::CaptureScreenshotFormatOption::Jpeg), - clip: None, - quality: Some(75), - from_surface: Some(true), - capture_beyond_viewport: Some(capture_beyond_viewport), - }).map_err(|e| e.to_string())?.data; - let mut data = base64::prelude::BASE64_STANDARD .decode(jpeg_base64_data).map_err(|e| e.to_string())?; let reader = ImageReader::with_format(Cursor::new(data), ImageFormat::Jpeg); @@ -319,6 +307,7 @@ async fn screenshot_jpeg_base64( // NOTE: the tool operates on resized image well without a special model notification let (nwidth, nheight) = (scale_factor * image.width() as f32, scale_factor * image.height() as f32); image = image.resize(nwidth as u32, nheight as u32, FilterType::Lanczos3); + // NOTE: we should store screenshot_scale_factor for every resized screenshot, not for a tab! let mut tab_lock = tab.lock().await; tab_lock.screenshot_scale_factor = scale_factor as f64; } @@ -329,34 +318,6 @@ async fn screenshot_jpeg_base64( MultimodalElement::new("image/jpeg".to_string(), base64::prelude::BASE64_STANDARD.encode(data)) } -async fn inner_html(url: String) -> Result { - let client = Client::builder() - .build() - .map_err(|e| e.to_string())?; - let response = client.get(url.clone()).send().await.map_err(|e| e.to_string())?; - if response.status().is_success() { - let html = response.text().await.map_err(|e| e.to_string())?; - Ok(html) - } else { - Err(format!("status: {}", response.status())) - } -} - -async fn click_point(tab: Arc>, point: &Point) -> Result<(), String> { - let (mapped_point, headless_tab) = { - let tab_lock = tab.lock().await; - let mapped_point = Point { - x: point.x / tab_lock.screenshot_scale_factor, - y: point.y / tab_lock.screenshot_scale_factor, - }; - let headless_tab = tab_lock.headless_tab.clone(); - (mapped_point, headless_tab) - }; - headless_tab.click_point(mapped_point).map_err(|e| e.to_string())?; - headless_tab.wait_until_navigated().map_err(|e| e.to_string())?; - Ok(()) -} - async fn session_open_tab( chrome_session: &mut ChromeSession, tab_id: &String, @@ -443,15 +404,20 @@ async fn chrome_command_exec( let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; session_get_tab_arc(chrome_session, &args.tab_id).await? }; - let log = match navigate_to(tab.clone(), &args.uri).await { - Ok(_) => { - let tab_lock = tab.lock().await; - format!("navigate_to successful: {}", tab_lock.state_string()) - }, - Err(e) => { - // let tab_lock = tab.lock().await; - format!("navigate_to `{}` failed: {}. If you're trying to open a local file, add a file:// prefix.", args.uri, e.to_string()) - }, + let log = { + let tab_lock = tab.lock().await; + match { + tab_lock.headless_tab.navigate_to(args.uri.as_str()).map_err(|e| e.to_string())?; + tab_lock.headless_tab.wait_until_navigated().map_err(|e| e.to_string())?; + Ok::<(), String>(()) + } { + Ok(_) => { + format!("navigate_to successful: {}", tab_lock.state_string()) + }, + Err(e) => { + format!("navigate_to `{}` failed: {}. If you're trying to open a local file, add a file:// prefix.", args.uri, e.to_string()) + }, + } }; tool_log.push(log); }, @@ -461,16 +427,19 @@ async fn chrome_command_exec( let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; session_get_tab_arc(chrome_session, &args.tab_id).await? }; - let log = match screenshot_jpeg_base64(tab.clone(), false).await { - Ok(multimodal_el) => { - multimodal_els.push(multimodal_el); - let tab_lock = tab.lock().await; - format!("made a screenshot of {}", tab_lock.state_string()) - }, - Err(e) => { - let tab_lock = tab.lock().await; - format!("screenshot failed for {}: {}", tab_lock.state_string(), e.to_string()) - }, + let log = { + // NOTE: this operation is not atomic, unfortunately + match screenshot_jpeg_base64(tab.clone(), false).await { + Ok(multimodal_el) => { + multimodal_els.push(multimodal_el); + let tab_lock = tab.lock().await; + format!("made a screenshot of {}", tab_lock.state_string()) + }, + Err(e) => { + let tab_lock = tab.lock().await; + format!("screenshot failed for {}: {}", tab_lock.state_string(), e.to_string()) + }, + } }; tool_log.push(log); }, @@ -480,16 +449,28 @@ async fn chrome_command_exec( let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; session_get_tab_arc(chrome_session, &args.tab_id).await? }; - let url = tab.lock().await.headless_tab.get_url(); - let log = match inner_html(url).await { - Ok(html) => { - let tab_lock = tab.lock().await; - format!("innerHtml of {}:\n\n{}", tab_lock.state_string(), html) - }, - Err(e) => { - let tab_lock = tab.lock().await; - format!("can't fetch innerHtml of {}: {}", tab_lock.state_string(), e.to_string()) - }, + let log = { + let tab_lock = tab.lock().await; + let url = tab_lock.headless_tab.get_url(); + match { + let client = Client::builder() + .build() + .map_err(|e| e.to_string())?; + let response = client.get(url.clone()).send().await.map_err(|e| e.to_string())?; + if response.status().is_success() { + let html = response.text().await.map_err(|e| e.to_string())?; + Ok(html) + } else { + Err(format!("status: {}", response.status())) + } + } { + Ok(html) => { + format!("innerHtml of {}:\n\n{}", tab_lock.state_string(), html) + }, + Err(e) => { + format!("can't fetch innerHtml of {}: {}", tab_lock.state_string(), e.to_string()) + }, + } }; tool_log.push(log); }, @@ -499,15 +480,17 @@ async fn chrome_command_exec( let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; session_get_tab_arc(chrome_session, &args.tab_id).await? }; - let log = match tab.lock().await.headless_tab.reload(false, None) { - Ok(_) => { - let tab_lock = tab.lock().await; - format!("reload of {} successful", tab_lock.state_string()) - }, - Err(e) => { - let tab_lock = tab.lock().await; - format!("reload of {} failed: {}", tab_lock.state_string(), e.to_string()) - }, + let log = { + let tab_lock = tab.lock().await; + let chrome_tab = tab_lock.headless_tab.clone(); + match chrome_tab.reload(false, None) { + Ok(_) => { + format!("reload of {} successful", tab_lock.state_string()) + }, + Err(e) => { + format!("reload of {} failed: {}", tab_lock.state_string(), e.to_string()) + }, + } }; tool_log.push(log); }, @@ -517,15 +500,24 @@ async fn chrome_command_exec( let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; session_get_tab_arc(chrome_session, &args.tab_id).await? }; - let log = match click_point(tab.clone(), &args.point).await { - Ok(_) => { - let tab_lock = tab.lock().await; - format!("clicked `{} {}` at {}", args.point.x, args.point.y, tab_lock.state_string()) - }, - Err(e) => { - let tab_lock = tab.lock().await; - format!("clicked `{} {}` failed at {}: {}", args.point.x, args.point.y, tab_lock.state_string(), e.to_string()) - }, + let log = { + let tab_lock = tab.lock().await; + match { + let mapped_point = Point { + x: args.point.x / tab_lock.screenshot_scale_factor, + y: args.point.y / tab_lock.screenshot_scale_factor, + }; + tab_lock.headless_tab.click_point(mapped_point).map_err(|e| e.to_string())?; + tab_lock.headless_tab.wait_until_navigated().map_err(|e| e.to_string())?; + Ok::<(), String>(()) + } { + Ok(_) => { + format!("clicked `{} {}` at {}", args.point.x, args.point.y, tab_lock.state_string()) + }, + Err(e) => { + format!("clicked `{} {}` failed at {}: {}", args.point.x, args.point.y, tab_lock.state_string(), e.to_string()) + }, + } }; tool_log.push(log); }, @@ -535,15 +527,16 @@ async fn chrome_command_exec( let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; session_get_tab_arc(chrome_session, &args.tab_id).await? }; - let log = match tab.lock().await.headless_tab.type_str(args.text.as_str()) { - Ok(_) => { - let tab_lock = tab.lock().await; - format!("type `{}` at {}", args.text, tab_lock.state_string()) - }, - Err(e) => { - let tab_lock = tab.lock().await; - format!("type text failed at {}: {}", tab_lock.state_string(), e.to_string()) - }, + let log = { + let tab_lock = tab.lock().await; + match tab_lock.headless_tab.type_str(args.text.as_str()) { + Ok(_) => { + format!("type `{}` at {}", args.text, tab_lock.state_string()) + }, + Err(e) => { + format!("type text failed at {}: {}", tab_lock.state_string(), e.to_string()) + }, + } }; tool_log.push(log); }, @@ -553,15 +546,16 @@ async fn chrome_command_exec( let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; session_get_tab_arc(chrome_session, &args.tab_id).await? }; - let log = match tab.lock().await.headless_tab.press_key(args.key.to_string().as_str()) { - Ok(_) => { - let tab_lock = tab.lock().await; - format!("press `{}` at {}", args.key, tab_lock.state_string()) - }, - Err(e) => { - let tab_lock = tab.lock().await; - format!("press `{}` failed at {}: {}", args.key, tab_lock.state_string(), e.to_string()) - }, + let log = { + let tab_lock = tab.lock().await; + match tab_lock.headless_tab.press_key(args.key.to_string().as_str()) { + Ok(_) => { + format!("press `{}` at {}", args.key, tab_lock.state_string()) + }, + Err(e) => { + format!("press `{}` failed at {}: {}", args.key, tab_lock.state_string(), e.to_string()) + }, + } }; tool_log.push(log); } From db152956921e3216918fb91f08dd2fc04e5c9944 Mon Sep 17 00:00:00 2001 From: mitya Date: Fri, 15 Nov 2024 21:55:03 +0300 Subject: [PATCH 10/14] disable html command for now --- src/integrations/integr_chrome.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index d4bb14b0d..afad179f8 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -192,7 +192,7 @@ impl Tool for ToolChrome { "open_tab ", "navigate_to ", "screenshot ", - "html ", + // "html ", "reload ", ]; if self.supports_clicks { @@ -444,6 +444,7 @@ async fn chrome_command_exec( tool_log.push(log); }, Command::Html(args) => { + // NOTE: removed from commands list, please rewrite me... let tab = { let mut chrome_session_locked = chrome_session.lock().await; let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; From 626cfed32b067f13c32bd3abe06c25a5aada5a9c Mon Sep 17 00:00:00 2001 From: mitya Date: Fri, 15 Nov 2024 21:56:27 +0300 Subject: [PATCH 11/14] default window size 800x600 --- src/integrations/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/integrations/mod.rs b/src/integrations/mod.rs index d004f0247..8ac7254a2 100644 --- a/src/integrations/mod.rs +++ b/src/integrations/mod.rs @@ -54,7 +54,7 @@ chrome: # Or you can give it ws:// path, read more here https://developer.chrome.com/docs/devtools/remote-debugging/local-server/ # In that case start chrome with --remote-debugging-port chrome_path: "ws://127.0.0.1:6006/" - window_size: [1024, 768] + window_size: [800, 600] idle_browser_timeout: 600 From c6be37348c5be5a18f061d85086899587f005b00 Mon Sep 17 00:00:00 2001 From: mitya Date: Tue, 19 Nov 2024 20:37:16 +0100 Subject: [PATCH 12/14] collect all log entries and return using new tab_log command --- src/integrations/integr_chrome.rs | 51 +++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index afad179f8..c98a9fd95 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -20,8 +20,12 @@ use headless_chrome::{Browser, LaunchOptions, Tab as HeadlessTab}; use headless_chrome::browser::tab::point::Point; use headless_chrome::protocol::cdp::Page; use headless_chrome::protocol::cdp::Emulation; +use headless_chrome::protocol::cdp::types::Event; use serde::{Deserialize, Serialize}; +use std::sync::Mutex; use std::fmt; +use tokio::time::sleep; +use chrono::DateTime; use base64::Engine; use std::io::Cursor; @@ -65,6 +69,8 @@ pub struct ChromeTab { device: DeviceType, tab_id: String, screenshot_scale_factor: f64, + // NOTE: logs vector should be at least limited + tab_log: Arc>>, } impl ChromeTab { @@ -74,6 +80,7 @@ impl ChromeTab { device: device.clone(), tab_id: tab_id.clone(), screenshot_scale_factor: 1.0, + tab_log: Arc::new(Mutex::new(Vec::new())), } } pub fn state_string(&self) -> String { @@ -194,6 +201,7 @@ impl Tool for ToolChrome { "screenshot ", // "html ", "reload ", + "tab_log ", ]; if self.supports_clicks { supported_commands.extend(vec![ @@ -354,6 +362,17 @@ async fn session_open_tab( } let tab = Arc::new(AMutex::new(ChromeTab::new(headless_tab, device, tab_id))); let tab_lock = tab.lock().await; + let tab_log = Arc::clone(&tab_lock.tab_log); + tab_lock.headless_tab.enable_log().map_err(|e| e.to_string())?; + tab_lock.headless_tab.add_event_listener(Arc::new(move |event: &Event| { + if let Event::LogEntryAdded(e) = event { + let formatted_ts = { + let dt = DateTime::from_timestamp(e.params.entry.timestamp as i64, 0).unwrap(); + dt.format("%Y-%m-%d %H:%M:%S").to_string() + }; + tab_log.lock().unwrap().push(format!("{} [{:?}]: {}", formatted_ts, e.params.entry.level, e.params.entry.text)); + } + })).map_err(|e| e.to_string())?; chrome_session.tabs.insert(tab_id.clone(), tab.clone()); Ok(format!("opened a new tab: {}\n", tab_lock.state_string())) } @@ -380,6 +399,7 @@ enum Command { ClickAt(ClickAtArgs), TypeTextAt(TypeTextAtArgs), PressKeyAt(PressKeyAtArgs), + TabLog(TabLogArgs), } async fn chrome_command_exec( @@ -559,6 +579,19 @@ async fn chrome_command_exec( } }; tool_log.push(log); + }, + Command::TabLog(args) => { + let tab = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + session_get_tab_arc(chrome_session, &args.tab_id).await? + }; + let tab_lock = tab.lock().await; + // NOTE: we're waiting for log to be collected for 3 seconds + sleep(Duration::from_secs(3)).await; + let mut tab_log_lock = tab_lock.tab_log.lock().unwrap(); + tool_log.extend(tab_log_lock.clone()); + tab_log_lock.clear(); } } @@ -629,6 +662,12 @@ struct PressKeyAtArgs { tab_id: String, } +#[derive(Debug)] +struct TabLogArgs { + // wait_secs: u32, + tab_id: String, +} + fn parse_single_command(command: &String) -> Result { let args = shell_words::split(&command).map_err(|e| e.to_string())?; if args.is_empty() { @@ -734,6 +773,18 @@ fn parse_single_command(command: &String) -> Result { } } }, + "tab_log" => { + match parsed_args.as_slice() { + [tab_id] => { + Ok(Command::TabLog(TabLogArgs { + tab_id: tab_id.clone(), + })) + }, + _ => { + Err("Missing one or several arguments 'tab_id'".to_string()) + } + } + }, _ => Err(format!("Unknown command: {:?}.", command_name)), } } From adef358cd52eb3abc2ba509a3b4c5e535acc0fac Mon Sep 17 00:00:00 2001 From: mitya Date: Wed, 20 Nov 2024 11:21:48 +0100 Subject: [PATCH 13/14] home, end keys + 1 second wait after press key --- src/integrations/integr_chrome.rs | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index c98a9fd95..9d206d079 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -201,13 +201,13 @@ impl Tool for ToolChrome { "screenshot ", // "html ", "reload ", + "press_key_at ", + "type_text_at ", "tab_log ", ]; if self.supports_clicks { supported_commands.extend(vec![ "click_at ", - "type_text_at ", - "press_key_at ", ]); } let description = format!( @@ -569,7 +569,13 @@ async fn chrome_command_exec( }; let log = { let tab_lock = tab.lock().await; - match tab_lock.headless_tab.press_key(args.key.to_string().as_str()) { + match { + tab_lock.headless_tab.press_key(args.key.to_string().as_str()).map_err(|e| e.to_string())?; + tab_lock.headless_tab.wait_until_navigated().map_err(|e| e.to_string())?; + // TODO: sometimes page isn't ready for next step + sleep(Duration::from_secs(1)).await; + Ok::<(), String>(()) + } { Ok(_) => { format!("press `{}` at {}", args.key, tab_lock.state_string()) }, @@ -643,6 +649,8 @@ enum Key { ESC, PAGEUP, PAGEDOWN, + HOME, + END, } impl fmt::Display for Key { @@ -652,6 +660,8 @@ impl fmt::Display for Key { Key::ESC => write!(f, "Escape"), Key::PAGEUP => write!(f, "PageUp"), Key::PAGEDOWN => write!(f, "PageDown"), + Key::HOME => write!(f, "Home"), + Key::END => write!(f, "End"), } } } @@ -761,6 +771,8 @@ fn parse_single_command(command: &String) -> Result { "esc" => Key::ESC, "pageup" => Key::PAGEUP, "pagedown" => Key::PAGEDOWN, + "home" => Key::HOME, + "end" => Key::END, _ => return Err(format!("Unknown key: {}", key_str)), }; Ok(Command::PressKeyAt(PressKeyAtArgs { From e39b538aa28cba4dbba0bfe5bea5d62cad0daadf Mon Sep 17 00:00:00 2001 From: mitya Date: Fri, 22 Nov 2024 17:44:58 +0100 Subject: [PATCH 14/14] raw cdp method calls on a tab --- Cargo.toml | 4 +- src/integrations/integr_chrome.rs | 123 +++++++++++++++++++++++++++++- 2 files changed, 123 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 129ca9966..eca79df6a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -107,4 +107,6 @@ image = "0.25.2" headless_chrome = "1.0.15" nix = { version = "0.29.0", features = ["signal"] } resvg = "0.44.0" -async-tar = "0.5.0" \ No newline at end of file +async-tar = "0.5.0" +tokio-tungstenite = "0.24.0" +tungstenite = "0.24.0" diff --git a/src/integrations/integr_chrome.rs b/src/integrations/integr_chrome.rs index 9d206d079..8c233cdb8 100644 --- a/src/integrations/integr_chrome.rs +++ b/src/integrations/integr_chrome.rs @@ -27,6 +27,13 @@ use std::fmt; use tokio::time::sleep; use chrono::DateTime; +use headless_chrome::protocol::cdp::Target; +use headless_chrome::protocol::cdp::types::Method; + +use std::net::TcpStream; +use tungstenite::protocol::WebSocketConfig; +use tungstenite::stream::MaybeTlsStream; + use base64::Engine; use std::io::Cursor; use image::imageops::FilterType; @@ -91,6 +98,7 @@ impl ChromeTab { struct ChromeSession { browser: Browser, tabs: HashMap>>, + web_socket_addr: Option, } impl ChromeSession { @@ -204,6 +212,7 @@ impl Tool for ToolChrome { "press_key_at ", "type_text_at ", "tab_log ", + "chrome_dev_tools_protocol ", ]; if self.supports_clicks { supported_commands.extend(vec![ @@ -282,7 +291,11 @@ async fn setup_chrome_session( // NOTE: we're not register any tabs because they can be used by another chat setup_log.push("No opened tabs.".to_string()); - let command_session: Box = Box::new(ChromeSession { browser, tabs: HashMap::new() }); + let command_session: Box = Box::new(ChromeSession { + browser, + tabs: HashMap::new(), + web_socket_addr: args.chrome_path.clone(), + }); gcx.write().await.integration_sessions.insert( session_hashmap_key.clone(), Arc::new(AMutex::new(command_session)) ); @@ -389,6 +402,72 @@ async fn session_get_tab_arc( } } +pub fn websocket_connection( + ws_url: &String, +) -> Result>, String> { + let mut client = tungstenite::client::connect_with_config( + ws_url.as_str(), + Some(WebSocketConfig { + max_message_size: None, + max_frame_size: None, + accept_unmasked_frames: true, + ..Default::default() + }), + u8::MAX - 1, + ).map_err(|e| e.to_string())?; + let stream = client.0.get_mut(); + let stream = match stream { + MaybeTlsStream::Plain(s) => s, + _ => todo!(), + }; + stream.set_read_timeout(Some(Duration::from_millis(100))).map_err(|e| e.to_string())?; + Ok(client.0) +} + +async fn raw_cdp_call_method( + web_socket_addr: &Option, + target_id: &String, + method: &Value, +) -> Result { + let mut connection = { + if let Some(url) = web_socket_addr { + websocket_connection(&url).map_err(|e| e.to_string())? + } else { + todo!("we can't get ws address directly from browser") + } + }; + let session_id = { + let target_method = Target::AttachToTarget { + target_id: target_id.clone(), + flatten: None, + }; + let message_text = serde_json::to_string(&target_method.to_method_call(9001)).map_err(|e| e.to_string())?; + let message = tungstenite::protocol::Message::text(message_text); + connection.send(message).map_err(|e| e.to_string())?; + let result = connection.read().map_err(|e| e.to_string())?; + let json_result = result.to_string().parse::().map_err(|e| e.to_string())?; + if let Value::String(session_id) = json_result["params"]["sessionId"].clone() { + session_id + } else { + return Err(format!("Failed to get session_id for {}", target_id)); + } + }; + let result = { + let mut target_message = method.clone(); + target_message["id"] = Value::Number(serde_json::Number::from(9002)); + let target_method = Target::SendMessageToTarget { + message: target_message.to_string(), + target_id: None, + session_id: Some(session_id), + }; + let message_text = serde_json::to_string(&target_method.to_method_call(9003)).map_err(|e| e.to_string())?; + let message = tungstenite::protocol::Message::text(message_text); + connection.send(message).map_err(|e| e.to_string())?; + connection.read().map_err(|e| e.to_string())? + }; + Ok(result.to_string()) +} + #[derive(Debug)] enum Command { OpenTab(OpenTabArgs), @@ -400,6 +479,7 @@ enum Command { TypeTextAt(TypeTextAtArgs), PressKeyAt(PressKeyAtArgs), TabLog(TabLogArgs), + TabCDP(TabCDPArgs), } async fn chrome_command_exec( @@ -598,7 +678,26 @@ async fn chrome_command_exec( let mut tab_log_lock = tab_lock.tab_log.lock().unwrap(); tool_log.extend(tab_log_lock.clone()); tab_log_lock.clear(); - } + }, + Command::TabCDP(args) => { + let (tab, maybe_web_socket_addr) = { + let mut chrome_session_locked = chrome_session.lock().await; + let chrome_session = chrome_session_locked.as_any_mut().downcast_mut::().ok_or("Failed to downcast to ChromeSession")?; + (session_get_tab_arc(chrome_session, &args.tab_id).await?, chrome_session.web_socket_addr.clone()) + }; + let tab_lock = tab.lock().await; + let target_id = tab_lock.headless_tab.get_target_id(); + let log = match raw_cdp_call_method(&maybe_web_socket_addr, &target_id, &args.method).await { + Ok(result) => { + format!("CDP method `{}` called for {}: {}", args.method, tab_lock.state_string(), result) + }, + Err(e) => { + format!("failed to execute CDP method `{}` at {}: {}", args.method, tab_lock.state_string(), e.to_string()) + } + }; + sleep(Duration::from_millis(100)).await; + tool_log.push(log); + }, } Ok((tool_log, multimodal_els)) @@ -678,8 +777,14 @@ struct TabLogArgs { tab_id: String, } +#[derive(Debug)] +struct TabCDPArgs { + tab_id: String, + method: Value, +} + fn parse_single_command(command: &String) -> Result { - let args = shell_words::split(&command).map_err(|e| e.to_string())?; + let args = command.split(" ").map(|e| e.to_string()).collect::>(); if args.is_empty() { return Err("Command is empty".to_string()); } @@ -797,6 +902,18 @@ fn parse_single_command(command: &String) -> Result { } } }, + "chrome_dev_tools_protocol" => { + if parsed_args.len() < 1 { + return Err("Missing 'tab_id'".to_string()); + } + let tab_id = parsed_args[0].clone(); + let method = serde_json::from_str(parsed_args[1..].join(" ").as_str()) + .map_err(|e| format!("Can't parse cdp method: {}", e.to_string()))?; + Ok(Command::TabCDP(TabCDPArgs { + tab_id, + method, + })) + } _ => Err(format!("Unknown command: {:?}.", command_name)), } }