From 20b1002925feac12891988333e84b4c88684a7b5 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 26 Mar 2026 12:06:03 +0800 Subject: [PATCH 1/7] Simplify confirm actions and improve stale snapshot errors --- extension/src/background/index.ts | 9 ++- extension/src/commands/element-actions.ts | 16 ++++- .../big_model/element_interaction_tool.j2 | 8 +-- .../small_model/element_interaction_tool.j2 | 5 +- server/agent/tools/base.py | 10 +-- server/agent/tools/browser_executor.py | 60 ++++++++-------- .../agent/tools/element_interaction_tool.py | 8 +-- .../tests/unit/test_agent_browser_executor.py | 68 ++++++++++++++++--- server/tests/unit/test_base_classes.py | 12 ++-- server/tests/unit/test_prompt_contracts.py | 8 +++ 10 files changed, 139 insertions(+), 65 deletions(-) diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts index 921fd5c..965ad65 100644 --- a/extension/src/background/index.ts +++ b/extension/src/background/index.ts @@ -126,8 +126,9 @@ function buildStoredHighlightPages(options: { function buildSnapshotPageRefreshScript(options: { elements: InteractiveElement[]; expectedDocumentId?: string; + highlightSnapshotId?: number; }): string { - const { elements, expectedDocumentId } = options; + const { elements, expectedDocumentId, highlightSnapshotId } = options; const refreshTargets = elements.map((element) => ({ id: element.id, selector: element.selector, @@ -137,6 +138,7 @@ function buildSnapshotPageRefreshScript(options: { return ` (() => { const expectedDocumentId = ${JSON.stringify(expectedDocumentId || '')}; + const highlightSnapshotId = ${highlightSnapshotId ?? "null"}; const refreshTargets = ${JSON.stringify(refreshTargets)}; function normalizeIdentityWhitespace(value, maxLength = 240) { @@ -259,7 +261,7 @@ function buildSnapshotPageRefreshScript(options: { ok: false, stale: true, error: - 'Highlight snapshot is stale because the document changed. Call highlight_elements() again.', + \`Highlight snapshot \${highlightSnapshotId} is stale because the document changed. Call highlight_elements() again.\`, }; } @@ -350,6 +352,7 @@ async function renderHighlightSnapshotPage(options: { buildSnapshotPageRefreshScript({ elements, expectedDocumentId, + highlightSnapshotId, }), true, false, @@ -2830,7 +2833,7 @@ async function handleCommand(command: Command): Promise { ok: false, stale: true, error: - "Highlight snapshot is stale because the document changed. Call highlight_elements() again." + "Highlight snapshot ${highlightSnapshotId} is stale because the document changed. Call highlight_elements() again." }; } if (!el) { diff --git a/extension/src/commands/element-actions.ts b/extension/src/commands/element-actions.ts index 869acb6..05082e3 100644 --- a/extension/src/commands/element-actions.ts +++ b/extension/src/commands/element-actions.ts @@ -145,14 +145,14 @@ function buildSnapshotIdentityHelpersScript(): string { return overlap >= Math.max(2, Math.min(4, Math.ceil(expectedTokens.length * 0.5))); } - function validateSnapshotElement(expectedDocumentId, expectedFingerprint, el) { + function validateSnapshotElement(expectedHighlightSnapshotId, expectedDocumentId, expectedFingerprint, el) { const currentDocumentId = getCurrentDocumentId(); if (expectedDocumentId && currentDocumentId !== expectedDocumentId) { return { ok: false, stale: true, error: - 'Highlight snapshot is stale because the document changed. Call highlight_elements() again.', + \`Highlight snapshot \${expectedHighlightSnapshotId} is stale because the document changed. Call highlight_elements() again.\`, }; } @@ -419,6 +419,7 @@ export async function performElementClick( const script = ` (async function() { const selector = "${escapedSelector}"; + const expectedHighlightSnapshotId = ${highlightSnapshotId}; const expectedDocumentId = "${escapedDocumentId}"; const expectedFingerprint = "${escapedFingerprint}"; ${buildEditableActivationHelpersScript()} @@ -429,6 +430,7 @@ export async function performElementClick( } const snapshotValidation = validateSnapshotElement( + expectedHighlightSnapshotId, expectedDocumentId, expectedFingerprint, el, @@ -705,6 +707,7 @@ export async function performElementHover( const script = ` (function() { const selector = "${escapedSelector}"; + const expectedHighlightSnapshotId = ${highlightSnapshotId}; const expectedDocumentId = "${escapedDocumentId}"; const expectedFingerprint = "${escapedFingerprint}"; ${buildSnapshotIdentityHelpersScript()} @@ -715,6 +718,7 @@ export async function performElementHover( } const snapshotValidation = validateSnapshotElement( + expectedHighlightSnapshotId, expectedDocumentId, expectedFingerprint, el, @@ -1002,6 +1006,7 @@ export async function performElementScroll( script = ` (function() { const selector = "${escapedSelector}"; + const expectedHighlightSnapshotId = ${highlightSnapshotId}; const expectedDocumentId = "${escapedDocumentId}"; const expectedFingerprint = "${escapedFingerprint}"; const el = document.querySelector(selector); @@ -1014,6 +1019,7 @@ export async function performElementScroll( } const snapshotValidation = validateSnapshotElement( + expectedHighlightSnapshotId, expectedDocumentId, expectedFingerprint, el, @@ -1336,6 +1342,7 @@ export async function performElementSwipe( const script = ` (async function() { const selector = "${escapedSelector}"; + const expectedHighlightSnapshotId = ${highlightSnapshotId}; const expectedDocumentId = "${escapedDocumentId}"; const expectedFingerprint = "${escapedFingerprint}"; const direction = "${direction}"; @@ -1348,6 +1355,7 @@ export async function performElementSwipe( } const snapshotValidation = validateSnapshotElement( + expectedHighlightSnapshotId, expectedDocumentId, expectedFingerprint, el, @@ -2512,6 +2520,7 @@ export async function performKeyboardInput( const script = ` (function() { const selector = "${escapedSelector}"; + const expectedHighlightSnapshotId = ${highlightSnapshotId}; const expectedDocumentId = "${escapedDocumentId}"; const expectedFingerprint = "${escapedFingerprint}"; const text = "${escapedText}"; @@ -2523,6 +2532,7 @@ export async function performKeyboardInput( } const snapshotValidation = validateSnapshotElement( + expectedHighlightSnapshotId, expectedDocumentId, expectedFingerprint, el, @@ -2813,6 +2823,7 @@ export async function performElementSelect( const script = ` (function() { const selector = "${escapedSelector}"; + const expectedHighlightSnapshotId = ${highlightSnapshotId}; const expectedDocumentId = "${escapedDocumentId}"; const expectedFingerprint = "${escapedFingerprint}"; const value = ${valueJson}; @@ -2825,6 +2836,7 @@ export async function performElementSelect( } const snapshotValidation = validateSnapshotElement( + expectedHighlightSnapshotId, expectedDocumentId, expectedFingerprint, el, diff --git a/server/agent/prompts/big_model/element_interaction_tool.j2 b/server/agent/prompts/big_model/element_interaction_tool.j2 index 18496b2..4a2e901 100644 --- a/server/agent/prompts/big_model/element_interaction_tool.j2 +++ b/server/agent/prompts/big_model/element_interaction_tool.j2 @@ -43,7 +43,7 @@ Direct-execution actions **Practical rule**: Expect a confirmation step only for `click` and `keyboard_input`. **Hard rule for inputable targets**: Always `click` the target first and complete that confirmation before `keyboard_input`. -**Snapshot rule**: Every element-targeted action must include both `highlight_snapshot_id` and `element_id`. `element_id` is page-local within one highlight snapshot response and is not valid by itself. +**Snapshot rule**: Every initiating or direct element-targeted action must include both `highlight_snapshot_id` and `element_id`. `element_id` is page-local within one highlight snapshot response and is not valid by itself. `confirm_click` and `confirm_keyboard_input` operate on the current pending confirmation and do not need those fields repeated. **Discovery rule**: If you do not yet have the correct `element_id`, continue discovery with highlight pagination instead of replacing the missing target with guessed keywords. On the same unchanged page state, your default next step is another `highlight` call with `element_type: "any"`, the next page, and the previous `highlight_snapshot_id`. After any significant page-state change caused by your last action, restart discovery with `highlight` on `element_type: "any"` page 1 without reusing the old snapshot, because it exposes extension-derived page insight you cannot infer reliably from intent alone. Do not jump straight to `keywords` or another narrower type on that changed page. Use `keywords` only when you already see the target's exact literal text on the target itself in the current screenshot and can copy it verbatim. If a control itself shows an icon plus `52`, the literal keyword is `52`, not guessed icon words like `star`, `favorite`, or `bookmark`. @@ -57,7 +57,7 @@ Initiate a click on an element by its visual ID. // → Returns an ORANGE preview screenshot // → Review the orange highlight + HTML // → Then confirm: -{ "action": "confirm_click", "highlight_snapshot_id": 17, "element_id": "3", "tab_id": 123 } +{ "action": "confirm_click" } ``` Use this for buttons, links, and other trigger controls you identified from the highlight tool. @@ -148,7 +148,7 @@ Type text into an input element by its visual ID. // → Returns an ORANGE preview screenshot // → Review the orange highlight + HTML // → Then confirm: -{ "action": "confirm_keyboard_input", "highlight_snapshot_id": 17, "element_id": "1", "tab_id": 123 } +{ "action": "confirm_keyboard_input" } ``` Use this for text inputs, textareas, and search boxes, but only after you already clicked the same input target and completed that click confirmation. @@ -204,7 +204,7 @@ Each action has a corresponding confirm action (use the same `element_interactio - `confirm_click` (for click action) - `confirm_keyboard_input` (for keyboard_input action) -**Parameters**: Use the same `highlight_snapshot_id` and `element_id`. `tab_id` is optional. For `confirm_keyboard_input`, the `text` comes from the pending confirmation created by the initiating action. +**Parameters**: No `element_id`, `highlight_snapshot_id`, or `tab_id` is required. The system executes the current pending confirmation. For `confirm_keyboard_input`, the `text` also comes from that pending confirmation. **Behavior**: Executes the action after visual verification. diff --git a/server/agent/prompts/small_model/element_interaction_tool.j2 b/server/agent/prompts/small_model/element_interaction_tool.j2 index 5dbf2c4..18f68cf 100644 --- a/server/agent/prompts/small_model/element_interaction_tool.j2 +++ b/server/agent/prompts/small_model/element_interaction_tool.j2 @@ -27,14 +27,14 @@ If the page did not significantly change, stay on the current `any` inventory an ```json { "action": "click", "highlight_snapshot_id": 17, "element_id": "3", "tab_id": 123 } -{ "action": "confirm_click", "highlight_snapshot_id": 17, "element_id": "3", "tab_id": 123 } +{ "action": "confirm_click" } ``` ### keyboard_input ```json { "action": "keyboard_input", "highlight_snapshot_id": 17, "element_id": "1", "text": "hello@example.com", "tab_id": 123 } -{ "action": "confirm_keyboard_input", "highlight_snapshot_id": 17, "element_id": "1", "tab_id": 123 } +{ "action": "confirm_keyboard_input" } ``` Use `keyboard_input` only after you already clicked the same input target and completed that click confirmation. @@ -91,6 +91,7 @@ These are content directions, not hand or finger movement directions. Do not rei - `click` and `keyboard_input` require confirmation - For any `inputable` target, the required sequence is `click` first, then `keyboard_input` +- `confirm_click` and `confirm_keyboard_input` operate on the current pending confirmation; do not repeat `element_id` or `highlight_snapshot_id` - If ORANGE preview appears, confirm only when the box position and HTML both match your target - If they do not match, do not confirm; highlight again diff --git a/server/agent/tools/base.py b/server/agent/tools/base.py index 8e01f03..7a04083 100644 --- a/server/agent/tools/base.py +++ b/server/agent/tools/base.py @@ -115,20 +115,14 @@ def _pending_confirmation_llm_content( highlight_snapshot_id = str( pending.get("highlight_snapshot_id", self.highlight_snapshot_id or "unknown") ) - snapshot_json_value = ( - highlight_snapshot_id - if highlight_snapshot_id.isdigit() - else f'"{highlight_snapshot_id}"' - ) - confirm_cmd = ( - f'{{"action": "confirm_{action_type}", "highlight_snapshot_id": {snapshot_json_value}, "element_id": "{element_id}"}}' - ) + confirm_cmd = f'{{"action": "confirm_{action_type}"}}' text_parts = [ "## Pending Confirmation", "", "Inspect the screenshot first.", "Confirm only if the single highlighted element is exactly the intended target.", + "The pending element and highlight snapshot are already stored by the system.", "", f"**Highlight Snapshot ID**: {highlight_snapshot_id}", f"**Element ID**: {element_id}", diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py index 60c57f1..dac0f4b 100644 --- a/server/agent/tools/browser_executor.py +++ b/server/agent/tools/browser_executor.py @@ -515,35 +515,37 @@ def _execute_element_interaction_action( raise ValueError( "No pending click confirmation found. Please call click first." ) - if action.highlight_snapshot_id is None: - raise ValueError("confirm_click requires highlight_snapshot_id parameter") - if pending["element_id"] != action.element_id: + pending_element_id = pending.get("element_id") + pending_snapshot_id = pending.get("highlight_snapshot_id") + pending_extra_data = pending.get("extra_data", {}) + if pending_snapshot_id is None: + pending_snapshot_id = pending_extra_data.get("highlight_snapshot_id") + if not pending_element_id: raise ValueError( - f"Element ID mismatch. Expected {pending['element_id']}, got {action.element_id}" + "Pending click confirmation is missing element_id state." ) - if pending["highlight_snapshot_id"] != action.highlight_snapshot_id: + if pending_snapshot_id is None: raise ValueError( - f"Highlight snapshot mismatch. Expected {pending['highlight_snapshot_id']}, got {action.highlight_snapshot_id}" + "Pending click confirmation is missing highlight_snapshot_id state." ) # Execute actual click command = ClickElementCommand( - element_id=action.element_id, - highlight_snapshot_id=action.highlight_snapshot_id - or pending["extra_data"].get("highlight_snapshot_id"), + element_id=pending_element_id, + highlight_snapshot_id=pending_snapshot_id, conversation_id=self.conversation_id, - tab_id=action.tab_id or pending["extra_data"].get("tab_id"), + tab_id=pending_extra_data.get("tab_id"), ) result_dict = self._execute_command_sync(command) if not result_dict or not result_dict.get("success"): ext_error = self._extract_result_error(result_dict) raise RuntimeError(f"Failed to click element: {ext_error}") - message = f"Confirmed and clicked element: {action.element_id}" + message = f"Confirmed and clicked element: {pending_element_id}" self._clear_pending_confirmation() return self._build_observation_from_result( result_dict, message, - element_id=action.element_id, - highlight_snapshot_id=action.highlight_snapshot_id, + element_id=pending_element_id, + highlight_snapshot_id=pending_snapshot_id, ) elif action_type == "confirm_keyboard_input": @@ -552,37 +554,37 @@ def _execute_element_interaction_action( raise ValueError( "No pending keyboard_input confirmation found. Please call keyboard_input first." ) - if action.highlight_snapshot_id is None: - raise ValueError( - "confirm_keyboard_input requires highlight_snapshot_id parameter" - ) - if pending["element_id"] != action.element_id: + pending_element_id = pending.get("element_id") + pending_snapshot_id = pending.get("highlight_snapshot_id") + pending_extra_data = pending.get("extra_data", {}) + if pending_snapshot_id is None: + pending_snapshot_id = pending_extra_data.get("highlight_snapshot_id") + if not pending_element_id: raise ValueError( - f"Element ID mismatch. Expected {pending['element_id']}, got {action.element_id}" + "Pending keyboard_input confirmation is missing element_id state." ) - if pending["highlight_snapshot_id"] != action.highlight_snapshot_id: + if pending_snapshot_id is None: raise ValueError( - f"Highlight snapshot mismatch. Expected {pending['highlight_snapshot_id']}, got {action.highlight_snapshot_id}" + "Pending keyboard_input confirmation is missing highlight_snapshot_id state." ) command = KeyboardInputCommand( - element_id=action.element_id, - highlight_snapshot_id=action.highlight_snapshot_id - or pending["extra_data"].get("highlight_snapshot_id"), - text=pending["extra_data"].get("text", ""), + element_id=pending_element_id, + highlight_snapshot_id=pending_snapshot_id, + text=pending_extra_data.get("text", ""), conversation_id=self.conversation_id, - tab_id=action.tab_id or pending["extra_data"].get("tab_id"), + tab_id=pending_extra_data.get("tab_id"), ) result_dict = self._execute_command_sync(command) if not result_dict or not result_dict.get("success"): ext_error = self._extract_result_error(result_dict) raise RuntimeError(f"Failed to input text: {ext_error}") - message = f"Confirmed and input text to element: {action.element_id}" + message = f"Confirmed and input text to element: {pending_element_id}" self._clear_pending_confirmation() return self._build_observation_from_result( result_dict, message, - element_id=action.element_id, - highlight_snapshot_id=action.highlight_snapshot_id, + element_id=pending_element_id, + highlight_snapshot_id=pending_snapshot_id, ) else: diff --git a/server/agent/tools/element_interaction_tool.py b/server/agent/tools/element_interaction_tool.py index 57fac02..f68b83c 100644 --- a/server/agent/tools/element_interaction_tool.py +++ b/server/agent/tools/element_interaction_tool.py @@ -42,15 +42,15 @@ class ElementInteractionAction(OpenBrowserAction): "confirm_click", "confirm_keyboard_input", ] = Field( - description="Element interaction action (click and keyboard_input require confirm_* follow-up; hover/scroll/swipe/select execute directly)" + description="Element interaction action (click and keyboard_input require confirm_* follow-up; confirm_* executes the current pending confirmation; hover/scroll/swipe/select execute directly)" ) element_id: Optional[str] = Field( default=None, - description="Element ID (page-local numeric string) from a specific highlight_elements snapshot", + description="Element ID (page-local numeric string) from a specific highlight_elements snapshot. Required for click, hover, element scroll/swipe, keyboard_input, and select. Ignored for confirm_* actions.", ) highlight_snapshot_id: Optional[int] = Field( default=None, - description="Highlight snapshot ID returned by highlight_elements. Required for all element-targeted actions.", + description="Highlight snapshot ID returned by highlight_elements. Required for click, hover, element scroll/swipe, keyboard_input, and select. Ignored for confirm_* actions.", ) direction: Optional[Literal["up", "down", "left", "right", "next", "prev"]] = Field( default="down", @@ -83,7 +83,7 @@ class ElementInteractionAction(OpenBrowserAction): ) tab_id: Optional[int] = Field( default=None, - description="Tab ID (optional, uses active tab if not specified)", + description="Tab ID (optional, uses active tab if not specified). confirm_* actions use the tab stored in the pending confirmation.", ) diff --git a/server/tests/unit/test_agent_browser_executor.py b/server/tests/unit/test_agent_browser_executor.py index 20edab1..ed9d3fb 100644 --- a/server/tests/unit/test_agent_browser_executor.py +++ b/server/tests/unit/test_agent_browser_executor.py @@ -37,6 +37,7 @@ def test_execute_command_sync_promotes_nested_error_to_top_level(monkeypatch) -> result = executor._execute_command_sync( SwipeElementCommand( element_id="swp123", + highlight_snapshot_id=17, direction="next", conversation_id="conv-swipe-error", ) @@ -61,6 +62,7 @@ def fake_execute(command): ElementInteractionAction( action="hover", element_id="hov123", + highlight_snapshot_id=17, conversation_id="conv-hover-direct", ) ) @@ -79,7 +81,10 @@ def test_keyboard_input_sets_pending_confirmation(monkeypatch) -> None: monkeypatch.setattr( executor, "_get_element_full_html", - lambda element_id: ('', "data:image/png;base64,pending"), + lambda element_id, highlight_snapshot_id: ( + '', + "data:image/png;base64,pending", + ), ) monkeypatch.setattr( executor, @@ -91,6 +96,7 @@ def test_keyboard_input_sets_pending_confirmation(monkeypatch) -> None: ElementInteractionAction( action="keyboard_input", element_id="inp123", + highlight_snapshot_id=17, text="hello", conversation_id="conv-input-pending", ) @@ -101,6 +107,41 @@ def test_keyboard_input_sets_pending_confirmation(monkeypatch) -> None: assert observation.pending_confirmation is not None assert observation.pending_confirmation["action_type"] == "keyboard_input" assert observation.pending_confirmation["element_id"] == "inp123" + assert observation.pending_confirmation["highlight_snapshot_id"] == 17 + + +def test_confirm_click_uses_pending_confirmation_state(monkeypatch) -> None: + executor = BrowserExecutor() + executor.conversation_id = "conv-click-confirm" + executor._set_pending_confirmation( + element_id="btn13", + highlight_snapshot_id=39, + action_type="click", + full_html="", + extra_data={"tab_id": 456, "highlight_snapshot_id": 39}, + ) + + captured = {} + + def fake_execute(command): + captured["command"] = command + return {"success": True, "data": {"screenshot": "data:image/png;base64,clicked"}} + + monkeypatch.setattr(executor, "_execute_command_sync", fake_execute) + + observation = executor._execute_action_sync( + ElementInteractionAction( + action="confirm_click", + conversation_id="conv-click-confirm", + ) + ) + + assert observation.success is True + assert observation.message == "Confirmed and clicked element: btn13" + assert captured["command"].element_id == "btn13" + assert captured["command"].highlight_snapshot_id == 39 + assert captured["command"].tab_id == 456 + assert executor._get_pending_confirmation() is None def test_confirm_keyboard_input_reports_nested_extension_error(monkeypatch) -> None: @@ -108,25 +149,31 @@ def test_confirm_keyboard_input_reports_nested_extension_error(monkeypatch) -> N executor.conversation_id = "conv-input-error" executor._set_pending_confirmation( element_id="inp123", + highlight_snapshot_id=17, action_type="keyboard_input", full_html='', - extra_data={"text": "hello world"}, + extra_data={ + "text": "hello world", + "tab_id": 321, + "highlight_snapshot_id": 17, + }, ) - monkeypatch.setattr( - executor, - "_execute_command_sync", - lambda command: { + captured = {} + + def fake_execute(command): + captured["command"] = command + return { "success": False, "error": None, "data": {"error": "Input element is detached"}, - }, - ) + } + + monkeypatch.setattr(executor, "_execute_command_sync", fake_execute) observation = executor._execute_action_sync( ElementInteractionAction( action="confirm_keyboard_input", - element_id="inp123", conversation_id="conv-input-error", ) ) @@ -136,4 +183,7 @@ def test_confirm_keyboard_input_reports_nested_extension_error(monkeypatch) -> N observation.error == "Failed to input text: Input element is detached" ) + assert captured["command"].element_id == "inp123" + assert captured["command"].highlight_snapshot_id == 17 + assert captured["command"].tab_id == 321 assert "None" not in observation.message diff --git a/server/tests/unit/test_base_classes.py b/server/tests/unit/test_base_classes.py index 559225d..0793763 100644 --- a/server/tests/unit/test_base_classes.py +++ b/server/tests/unit/test_base_classes.py @@ -176,8 +176,10 @@ def test_pending_confirmation_includes_follow_up_command(self) -> None: assert "## Pending Confirmation" in text assert "Inspect the screenshot first." in text - assert '"action": "confirm_click"' in text - assert '"element_id": "a1b2c3"' in text + assert '{"action": "confirm_click"}' in text + assert '"highlight_snapshot_id"' not in text + assert '"element_id"' not in text + assert "**Element ID**: a1b2c3" in text def test_pending_keyboard_confirmation_uses_matching_follow_up_command(self) -> None: observation = OpenBrowserObservation( @@ -191,8 +193,10 @@ def test_pending_keyboard_confirmation_uses_matching_follow_up_command(self) -> text = _text_content(observation) - assert '"action": "confirm_keyboard_input"' in text - assert '"element_id": "inp789"' in text + assert '{"action": "confirm_keyboard_input"}' in text + assert '"highlight_snapshot_id"' not in text + assert '"element_id"' not in text + assert "**Element ID**: inp789" in text def test_pending_confirmation_with_screenshot_is_image_first_and_text_minimal(self) -> None: observation = OpenBrowserObservation( diff --git a/server/tests/unit/test_prompt_contracts.py b/server/tests/unit/test_prompt_contracts.py index 09cfe2d..e26a17d 100644 --- a/server/tests/unit/test_prompt_contracts.py +++ b/server/tests/unit/test_prompt_contracts.py @@ -152,6 +152,14 @@ def test_element_interaction_prompt_explains_swipe_semantics(self) -> None: assert '`direction: "prev"` means show the previous picture' in description assert "not finger or gesture directions" in description + def test_element_interaction_confirm_examples_use_pending_state_only(self) -> None: + description = get_element_interaction_tool_description() + + assert '{ "action": "confirm_click" }' in description + assert '{ "action": "confirm_keyboard_input" }' in description + assert '{ "action": "confirm_click", "highlight_snapshot_id": 17, "element_id": "3", "tab_id": 123 }' not in description + assert '{ "action": "confirm_keyboard_input", "highlight_snapshot_id": 17, "element_id": "1", "tab_id": 123 }' not in description + def test_element_interaction_action_schema_explains_swipe_semantics(self) -> None: description = ElementInteractionAction.model_fields["direction"].description From 4e7f088f9ae5ca65581d71917b39bf57f23b0d87 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 26 Mar 2026 13:00:17 +0800 Subject: [PATCH 2/7] Add please_help_me user assistance flow Co-authored-by: openhands --- frontend/index.html | 503 ++++++++++++++++-- pyproject.toml | 4 +- server/agent/api.py | 9 +- server/agent/manager.py | 21 +- server/agent/tools/help_tool.py | 111 ++++ server/agent/user_help.py | 54 ++ server/agent/visualizer.py | 16 + server/core/browser_executor_bundle.py | 9 +- .../tests/unit/test_agent_manager_process.py | 12 +- server/tests/unit/test_user_help.py | 80 +++ uv.lock | 8 +- 11 files changed, 765 insertions(+), 62 deletions(-) create mode 100644 server/agent/tools/help_tool.py create mode 100644 server/agent/user_help.py create mode 100644 server/tests/unit/test_user_help.py diff --git a/frontend/index.html b/frontend/index.html index 9c31966..9af5c57 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -1350,6 +1350,129 @@ visibility: visible; } + .help-request-modal { + position: fixed; + inset: 0; + display: flex; + align-items: center; + justify-content: center; + padding: 24px; + background: rgba(8, 6, 4, 0.72); + backdrop-filter: blur(10px); + z-index: 3200; + opacity: 0; + visibility: hidden; + transition: opacity 0.2s ease, visibility 0.2s ease; + } + + .help-request-modal.show { + opacity: 1; + visibility: visible; + } + + .help-request-card { + width: min(640px, 100%); + display: flex; + flex-direction: column; + gap: 16px; + padding: 24px; + border-radius: 24px; + border: 1px solid rgba(255, 210, 140, 0.32); + background: linear-gradient(180deg, rgba(42, 29, 17, 0.97), rgba(29, 21, 15, 0.98)); + box-shadow: 0 32px 90px rgba(0, 0, 0, 0.42); + } + + .help-request-kicker { + color: #ffd28c; + font-size: 11px; + font-weight: 700; + letter-spacing: 0.16em; + text-transform: uppercase; + } + + .help-request-title { + font-size: 28px; + line-height: 1.05; + letter-spacing: -0.04em; + } + + .help-request-copy, + .help-request-conversation { + color: var(--text-muted); + font-size: 14px; + line-height: 1.6; + } + + .help-request-message { + padding: 16px 18px; + border-radius: 18px; + border: 1px solid rgba(255, 210, 140, 0.18); + background: rgba(255, 210, 140, 0.08); + color: var(--text-primary); + font-size: 15px; + line-height: 1.7; + white-space: pre-wrap; + } + + .help-request-textarea { + min-height: 112px; + width: 100%; + padding: 14px 16px; + border-radius: 16px; + border: 1px solid var(--panel-border); + background: rgba(21, 17, 13, 0.92); + color: var(--text-primary); + font: inherit; + font-size: 14px; + line-height: 1.6; + resize: vertical; + } + + .help-request-textarea:focus { + outline: none; + border-color: rgba(255, 210, 140, 0.42); + box-shadow: 0 0 0 4px rgba(255, 210, 140, 0.08); + } + + .help-request-actions { + display: flex; + flex-wrap: wrap; + gap: 10px; + } + + .help-request-primary, + .help-request-secondary { + border: none; + border-radius: 999px; + padding: 12px 18px; + cursor: pointer; + font: inherit; + font-size: 14px; + font-weight: 700; + transition: transform 0.18s ease, opacity 0.18s ease; + } + + .help-request-primary { + background: #ffd28c; + color: #2b1b0c; + } + + .help-request-secondary { + background: rgba(255, 255, 255, 0.08); + color: var(--text-secondary); + border: 1px solid var(--panel-border); + } + + .help-request-primary:hover, + .help-request-secondary:hover { + transform: translateY(-1px); + } + + body.help-needed .terminal-header { + border-color: rgba(255, 210, 140, 0.34); + box-shadow: 0 28px 90px rgba(0, 0, 0, 0.38), 0 0 0 1px rgba(255, 210, 140, 0.14); + } + .empty-state { text-align: center; padding: 40px 20px; @@ -2635,6 +2758,37 @@
+ +
+
+
Human Help Needed
+
The agent is waiting for you
+
Complete the manual browser step below, then resume the same conversation.
+
Conversation: Current conversation
+
The agent needs your help before it can continue.
+ +
+ + +
+
+
@@ -3132,6 +3286,9 @@

Sisyphus

let latestAgentStep = 'Waiting for a task.'; let latestAgentThinking = 'Waiting for agent reasoning.'; let latestAgentTimestamp = null; + let pendingHelpRequest = null; + let helpAttentionTimer = null; + let defaultDocumentTitle = document.title; // Sisyphus mode variables let sisyphusEnabled = false; @@ -3149,6 +3306,8 @@

Sisyphus

const MODEL_ALIAS_STORAGE_KEY = 'openbrowser_model_alias'; const AGENT_STEP_PLACEHOLDER = 'Waiting for a task.'; const AGENT_THINKING_PLACEHOLDER = 'Waiting for agent reasoning.'; + const DEFAULT_HELP_RESUME_MESSAGE = 'I have completed the human step you needed. Please continue based on the current page state.'; + const HELP_REQUEST_TITLE = 'Human Help Needed · OpenBrowser'; // Helper function to build API URL function apiUrl(path) { @@ -3173,6 +3332,214 @@

Sisyphus

return `${value.substring(0, 8)}...${value.substring(value.length - 4)}`; } + function sleep(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); + } + + async function waitForProcessingToFinish(timeoutMs = 5000) { + const start = Date.now(); + while (isProcessing && Date.now() - start < timeoutMs) { + await sleep(50); + } + } + + function getHelpRequestText(event) { + return ( + event.help_request || + event.message || + event.text || + 'The agent needs your help before it can continue.' + ); + } + + function clearHelpAttention() { + if (helpAttentionTimer) { + clearInterval(helpAttentionTimer); + helpAttentionTimer = null; + } + document.title = defaultDocumentTitle; + document.body.classList.remove('help-needed'); + } + + function startHelpAttention() { + clearHelpAttention(); + document.body.classList.add('help-needed'); + + let highlighted = false; + document.title = HELP_REQUEST_TITLE; + helpAttentionTimer = setInterval(() => { + highlighted = !highlighted; + document.title = highlighted ? HELP_REQUEST_TITLE : defaultDocumentTitle; + }, 1200); + } + + function focusHelpRequestUi() { + try { + window.focus(); + } catch (error) { + console.debug('Unable to focus window:', error); + } + + setTimeout(() => { + try { + window.focus(); + } catch (error) { + console.debug('Unable to refocus window:', error); + } + const customInput = document.getElementById('help-request-input'); + if (customInput) { + customInput.focus(); + } + }, 120); + } + + function hideHelpRequestModal() { + const modal = document.getElementById('help-request-modal'); + if (modal) { + modal.classList.remove('show'); + } + } + + function clearPendingHelpRequest() { + pendingHelpRequest = null; + clearHelpAttention(); + hideHelpRequestModal(); + } + + function showHelpRequestModal(request, options = {}) { + if ( + pendingHelpRequest && + pendingHelpRequest.toolCallId && + request.toolCallId && + pendingHelpRequest.toolCallId === request.toolCallId + ) { + if (!options.silent) { + focusHelpRequestUi(); + } + return; + } + + pendingHelpRequest = request; + + const modal = document.getElementById('help-request-modal'); + const messageNode = document.getElementById('help-request-message'); + const conversationNode = document.getElementById('help-request-conversation'); + const inputNode = document.getElementById('help-request-input'); + + if (!modal || !messageNode || !conversationNode || !inputNode) { + return; + } + + messageNode.textContent = request.message; + conversationNode.textContent = shortDisplay( + request.conversationId || currentConversationId, + 'Current conversation' + ); + inputNode.value = request.customReply || ''; + modal.classList.add('show'); + startHelpAttention(); + + if (!options.silent) { + focusHelpRequestUi(); + } + + if (sisyphusRunning) { + stopSisyphus(); + addEvent({ + type: 'SystemEvent', + text: 'Sisyphus stopped because the agent requested human help.', + timestamp: new Date().toISOString() + }); + } + } + + async function resolvePendingHelpRequest(messageText) { + if (!pendingHelpRequest) { + return; + } + + const trimmed = (messageText || '').trim(); + if (!trimmed) { + addEvent({ + type: 'ErrorEvent', + text: 'Please enter a reply for the agent, or use “I\'ve resolved it”.', + timestamp: new Date().toISOString() + }); + return; + } + + const request = pendingHelpRequest; + clearPendingHelpRequest(); + await waitForProcessingToFinish(); + + if (isProcessing) { + showHelpRequestModal(request, { silent: true }); + addEvent({ + type: 'ErrorEvent', + text: 'The previous turn is still closing. Please try again in a moment.', + timestamp: new Date().toISOString() + }); + return; + } + + if (request.conversationId && currentConversationId && request.conversationId !== currentConversationId) { + addEvent({ + type: 'ErrorEvent', + text: 'The pending help request belongs to a different conversation.', + timestamp: new Date().toISOString() + }); + return; + } + + const input = document.getElementById('command-input'); + if (input) { + input.value = trimmed; + syncCommandInputHeight(); + } + + await sendCommandText(trimmed, { preserveInput: false }); + } + + async function resolveHelpWithDefaultReply() { + await resolvePendingHelpRequest(DEFAULT_HELP_RESUME_MESSAGE); + } + + function isHelpRequestEvent(event) { + return ( + event.type === 'ActionEvent' && + event.tool_name === 'please_help_me' + ); + } + + function normalizeFrontendEvent(data, fallback = {}) { + const eventType = data.type || fallback.type || 'SystemEvent'; + let source = data.source || fallback.source || 'agent'; + + if (eventType === 'MessageEvent') { + source = data.source || fallback.source || (data.role === 'user' ? 'user' : 'agent'); + } + + return { + id: fallback.id || `event-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`, + type: eventType, + text: data.text || fallback.text || '', + timestamp: data.timestamp || fallback.timestamp || new Date().toISOString(), + image: data.image || fallback.image || null, + success: data.success, + message: data.message, + error: data.error, + role: data.role || fallback.role || null, + activated_skills: data.activated_skills || fallback.activated_skills || null, + sender: data.sender || fallback.sender || null, + source: source, + summary: data.summary || fallback.summary || null, + tool_name: data.tool_name || fallback.tool_name || null, + tool_call_id: data.tool_call_id || fallback.tool_call_id || null, + help_request: data.help_request || fallback.help_request || null, + awaiting_user_help: Boolean(data.awaiting_user_help || fallback.awaiting_user_help) + }; + } + function truncateForSummary(value, maxLength = 140) { const normalized = (value || '').replace(/\s+/g, ' ').trim(); if (!normalized) { @@ -3217,6 +3584,10 @@

Sisyphus

} function getAgentVoiceFallback() { + if (pendingHelpRequest) { + return pendingHelpRequest.message; + } + if (isProcessing) { return 'The agent is working. Follow the browser stage and the step feed below.'; } @@ -3390,6 +3761,20 @@

Sisyphus

latestAgentTimestamp = timestamp; shouldResetStageScroll = true; } else if (event.type === 'ActionEvent') { + if (event.tool_name === 'please_help_me') { + const helpText = getHelpRequestText(event); + latestAgentStep = 'Needs human help'; + latestAgentThinking = truncateForSummary(helpText, 220); + latestAgentMessage = helpText; + latestAgentTimestamp = timestamp; + shouldResetStageScroll = true; + renderAgentVoice(); + if (shouldResetStageScroll) { + resetStageConversationScroll(); + } + return; + } + const parsed = parseActionSections(content); latestAgentStep = truncateForSummary( event.summary || parsed.summary || content, @@ -3483,6 +3868,8 @@

Sisyphus

if (event.type === 'MessageEvent' && event.source === 'user') { label = 'User task'; + } else if (event.tool_name === 'please_help_me') { + label = 'Help'; } else if (event.type === 'SystemEvent') { label = 'System'; } @@ -3512,6 +3899,8 @@

Sisyphus

let nextStep = 'Paste the browser UUID to connect this browser window.'; if (!appConfig?.is_configured) { nextStep = 'Open Settings and add a model before running tasks.'; + } else if (pendingHelpRequest) { + nextStep = 'Human help requested. Complete the manual step and resume the same conversation.'; } else if (sisyphusEnabled && sisyphusRunning) { nextStep = 'Sisyphus is running. Watch the browser stage or stop the loop.'; } else if (sisyphusEnabled) { @@ -3590,6 +3979,7 @@

Sisyphus

currentConversationId ) { const oldConversationId = currentConversationId; + clearPendingHelpRequest(); currentConversationId = null; sisyphusCurrentConversationId = null; updateConversationIdDisplay(); @@ -3697,6 +4087,7 @@

Sisyphus

} const oldConversationId = currentConversationId; + clearPendingHelpRequest(); currentConversationId = null; sisyphusCurrentConversationId = null; updateConversationIdDisplay(); @@ -4305,6 +4696,7 @@

Sisyphus

// If deleted session was current, clear it if (sessionId === currentConversationId) { + clearPendingHelpRequest(); currentConversationId = null; localStorage.removeItem('openbrowser_conversation_id'); updateConversationIdDisplay(); @@ -4391,6 +4783,9 @@

Sisyphus

resetEventCounters(); resetRecentActivity(); resetAgentVoice(); + clearPendingHelpRequest(); + + let unresolvedHelpRequest = null; // Replay events for (const event of data.events) { @@ -4402,15 +4797,27 @@

Sisyphus

} // Convert to frontend event format - const frontendEvent = { - type: eventData.type || event.event_type, - text: eventData.text || eventData.action || eventData.observation || '', - timestamp: eventData.timestamp || event.created_at, - role: eventData.role - }; + const frontendEvent = normalizeFrontendEvent(eventData, { + type: event.event_type, + timestamp: event.created_at + }); updateEventCounters(frontendEvent.type); addEvent(frontendEvent); + + if (frontendEvent.type === 'MessageEvent' && frontendEvent.source === 'user') { + unresolvedHelpRequest = null; + } else if (isHelpRequestEvent(frontendEvent)) { + unresolvedHelpRequest = { + conversationId: conversationId, + toolCallId: frontendEvent.tool_call_id, + message: getHelpRequestText(frontendEvent) + }; + } + } + + if (unresolvedHelpRequest) { + showHelpRequestModal(unresolvedHelpRequest, { silent: true }); } console.log(`Loaded ${data.events.length} events for session ${conversationId}`); @@ -4573,6 +4980,7 @@

Sisyphus

// When cwd changes, clear current conversation to force new creation if (currentConversationId) { console.log(`[Frontend] CWD changed to "${this.value}", clearing current conversation`); + clearPendingHelpRequest(); currentConversationId = null; localStorage.removeItem('openbrowser_conversation_id'); updateConversationIdDisplay(); @@ -4640,15 +5048,25 @@

Sisyphus

// Send command to agent async function sendCommand() { const input = document.getElementById('command-input'); - const command = input.value.trim(); - + return sendCommandText(input.value.trim(), { inputElement: input }); + } + + async function sendCommandText(command, options = {}) { + const input = options.inputElement || document.getElementById('command-input'); + if (!command || isProcessing) { return; } - + + if (pendingHelpRequest) { + clearPendingHelpRequest(); + } + // Clear input - input.value = ''; - syncCommandInputHeight(); + if (!options.preserveInput && input) { + input.value = ''; + syncCommandInputHeight(); + } // Add user command to terminal (always show user message) console.log(`[Frontend] Sending command: "${command}"`); @@ -4828,9 +5246,10 @@

Sisyphus

processEvent(data); } else if (eventType === 'complete') { let message = 'Conversation completed'; + let completionData = null; try { - const data = JSON.parse(eventData); - message = data.message || message; + completionData = JSON.parse(eventData); + message = completionData.message || message; } catch (e) { // Use default message } @@ -4851,6 +5270,14 @@

Sisyphus

isProcessing = false; isConversationRunning = false; updateGuidedPanels(); + + if (completionData && completionData.needs_user_help) { + showHelpRequestModal({ + conversationId: completionData.conversation_id || currentConversationId, + toolCallId: completionData.tool_call_id || null, + message: completionData.help_request || message + }); + } } else if (eventType === 'error') { let errorMessage = 'SSE connection error'; try { @@ -5108,32 +5535,7 @@

Sisyphus

return; } - // Determine source based on role or event type - let source = 'agent'; // default - if (eventType === 'MessageEvent') { - // Use role field for MessageEvent - source = data.role === 'user' ? 'user' : 'agent'; - } else if (data.text && data.text.includes('user:')) { - // Fallback for other event types that might have user text - source = 'user'; - } - - // Create event object - const event = { - id: `event-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`, - type: eventType, - text: data.text || '', - timestamp: data.timestamp || new Date().toISOString(), - image: data.image || null, - success: data.success, - message: data.message, - error: data.error, - role: data.role || null, // Include role field - activated_skills: data.activated_skills || null, - sender: data.sender || null, - source: source, - summary: data.summary || null // Include summary field for collapsed display - }; + const event = normalizeFrontendEvent(data); console.log(`[Frontend] Created event:`, event); @@ -5150,6 +5552,16 @@

Sisyphus

// Render event (without image in the event line) addEvent(event); + + if (isHelpRequestEvent(event)) { + if (!pendingHelpRequest || pendingHelpRequest.toolCallId !== event.tool_call_id) { + showHelpRequestModal({ + conversationId: currentConversationId, + toolCallId: event.tool_call_id, + message: getHelpRequestText(event) + }); + } + } } // Update event counters and display @@ -5216,6 +5628,11 @@

Sisyphus

typeColor = '#ffb8b8'; typeLabel = 'ERROR'; } + + if (event.tool_name === 'please_help_me') { + typeColor = '#ffd28c'; + typeLabel = 'HELP'; + } // Format timestamp const timestamp = new Date(event.timestamp); @@ -5225,9 +5642,13 @@

Sisyphus

let content = event.text || event.message || ''; if (event.error) { content = `ERROR: ${event.error}`; + } else if (event.tool_name === 'please_help_me') { + content = getHelpRequestText(event); } const headerSummary = event.summary || ( - event.type === 'MessageEvent' + event.tool_name === 'please_help_me' + ? truncateForSummary(getHelpRequestText(event), 96) + : event.type === 'MessageEvent' ? truncateForSummary(content, 72) : null ); @@ -5446,6 +5867,7 @@

Sisyphus

async function refreshConversationId() { // Clear current conversation ID to force creation of a new one on next command const oldConversationId = currentConversationId; + clearPendingHelpRequest(); currentConversationId = null; localStorage.removeItem('openbrowser_conversation_id'); updateConversationIdDisplay(); @@ -5708,6 +6130,7 @@

Sisyphus

if (currentConversationId) { const oldConversationId = currentConversationId; + clearPendingHelpRequest(); currentConversationId = null; sisyphusCurrentConversationId = null; updateConversationIdDisplay(); diff --git a/pyproject.toml b/pyproject.toml index 8399707..cc66190 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,8 @@ dependencies = [ "pillow>=10.0.0", "numpy>=1.24.0", "requests>=2.31.0", - "openhands-sdk @ git+https://github.com/softpudding/agent-sdk.git@4e6dab44f253b3c03c46f7469dc58aba97540c88#subdirectory=openhands-sdk", - "openhands-tools @ git+https://github.com/softpudding/agent-sdk.git@4e6dab44f253b3c03c46f7469dc58aba97540c88#subdirectory=openhands-tools", + "openhands-sdk @ git+https://github.com/softpudding/agent-sdk.git@e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5#subdirectory=openhands-sdk", + "openhands-tools @ git+https://github.com/softpudding/agent-sdk.git@e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5#subdirectory=openhands-tools", "litellm @ git+https://github.com/softpudding/litellm.git@bfba5e3889829067baeab3b12d38008360913771", ] diff --git a/server/agent/api.py b/server/agent/api.py index 5a67545..6e07fc6 100644 --- a/server/agent/api.py +++ b/server/agent/api.py @@ -16,6 +16,7 @@ from server.api.sse import SSEEvent from server.agent.manager import agent_manager from server.agent.conversation import ConversationState +from server.agent.user_help import build_completion_event_payload from server.core.session_manager import session_manager, SessionStatus logger = logging.getLogger(__name__) @@ -280,10 +281,10 @@ def run_conversation(): event_queue.put( SSEEvent( "complete", - { - "conversation_id": conversation_id, - "message": "Conversation completed", - }, + build_completion_event_payload( + conversation_id, + conv_state.conversation.state.events, + ), ) ) logger.debug(f"DEBUG: Complete event put into queue") diff --git a/server/agent/manager.py b/server/agent/manager.py index c05d140..b046cec 100644 --- a/server/agent/manager.py +++ b/server/agent/manager.py @@ -30,6 +30,8 @@ from server.api.sse import SSEEvent from server.agent.visualizer import QueueVisualizer from server.agent.conversation import ConversationState +from server.agent.user_help import PLEASE_HELP_ME_TOOL_NAME +import server.agent.tools.help_tool # noqa: F401 from server.agent.tools.browser_executor import remove_browser_executor from server.core.ipc_router import IPCRouter from server.core.ipc_types import BrowserCommandMessage @@ -97,11 +99,26 @@ def __init__(self, multi_process_mode: bool = False): Tool(name="javascript"), # JavaScript execution fallback ] self.general_tools = [ + Tool(name=PLEASE_HELP_ME_TOOL_NAME), Tool(name=TerminalTool.name), # Terminal access Tool(name=FileEditorTool.name), # File editing Tool(name=TaskTrackerTool.name), # Task tracking ] + def _build_agent_context(self) -> AgentContext: + """Create the shared AgentContext used for OpenBrowser conversations.""" + help_suffix = ( + "When you need a human to complete a manual step, do not only say it in " + "assistant text. Call `please_help_me` with a concise, actionable " + "message, then stop and wait for the user's next message. Use this for " + "CAPTCHA, MFA, approvals, account-specific choices, or any step that " + "requires the real human user." + ) + return AgentContext( + current_datetime=datetime.now(), + system_message_suffix=help_suffix, + ) + def _resolve_llm_settings( self, model: Optional[str] = None, @@ -282,7 +299,7 @@ def _create_conversation_in_process( The conversation ID """ # Create agent with tools - agent_context = AgentContext(current_datetime=datetime.now()) + agent_context = self._build_agent_context() llm_instance = self._create_llm_from_config(model, base_url, model_alias) tools = self._get_tools_for_model(model, model_alias) agent = Agent( @@ -508,7 +525,7 @@ def get_or_create_conversation( # Create new conversation with the given ID # Create agent with tools - agent_context = AgentContext(current_datetime=datetime.now()) + agent_context = self._build_agent_context() llm_instance = self._create_llm_from_config(model, base_url, model_alias) tools = self._get_tools_for_model(model, model_alias) agent = Agent( diff --git a/server/agent/tools/help_tool.py b/server/agent/tools/help_tool.py new file mode 100644 index 0000000..1a87881 --- /dev/null +++ b/server/agent/tools/help_tool.py @@ -0,0 +1,111 @@ +"""Tool for explicit agent-to-human handoff requests.""" + +from collections.abc import Sequence +from typing import Any + +from pydantic import Field +from rich.text import Text + +from openhands.sdk import Action, TextContent +from openhands.sdk.conversation.state import ConversationExecutionStatus +from openhands.sdk.tool import ( + ToolAnnotations, + ToolDefinition, + ToolExecutor, + register_tool, +) + +from server.agent.user_help import PLEASE_HELP_ME_TOOL_NAME +from server.agent.tools.base import OpenBrowserObservation + + +class PleaseHelpMeAction(Action): + """Action asking the human user to complete a manual step.""" + + message: str = Field( + description=( + "The exact message to show the human user, describing what they need " + "to do manually before you can continue." + ) + ) + + @property + def visualize(self) -> Text: + content = Text() + content.append("Please help me:\n", style="bold yellow") + content.append(self.message) + return content + + +class PleaseHelpMeObservation(OpenBrowserObservation): + """Observation shown after a help request is emitted.""" + + help_message: str = Field( + description="The help request shown to the user for this pending turn." + ) + pending_user_decision: bool = Field( + default=True, + description="Whether the agent is waiting for the user's manual help.", + ) + + @property + def visualize(self) -> Text: + return Text("Pending user's decision") + + @property + def to_llm_content(self) -> Sequence[TextContent]: + return [TextContent(text="Pending user's decision")] + + +class PleaseHelpMeExecutor( + ToolExecutor[PleaseHelpMeAction, PleaseHelpMeObservation] +): + """Executor that pauses the current turn and surfaces a user-help request.""" + + def __call__( + self, action: PleaseHelpMeAction, conversation: Any = None + ) -> PleaseHelpMeObservation: + if conversation is not None: + conversation.state.execution_status = ConversationExecutionStatus.FINISHED + + return PleaseHelpMeObservation( + success=True, + message="Pending user's decision", + help_message=action.message, + pending_user_decision=True, + ) + + +class PleaseHelpMeTool( + ToolDefinition[PleaseHelpMeAction, PleaseHelpMeObservation] +): + """Tool for requesting human intervention.""" + + name = PLEASE_HELP_ME_TOOL_NAME + + @classmethod + def create(cls, conv_state=None, **params) -> Sequence["PleaseHelpMeTool"]: + if params: + raise ValueError("PleaseHelpMeTool doesn't accept parameters") + + return [ + cls( + description=( + "Ask the human user to complete a manual step such as CAPTCHA, " + "MFA, approval, or any browser action you cannot safely do." + ), + action_type=PleaseHelpMeAction, + observation_type=PleaseHelpMeObservation, + executor=PleaseHelpMeExecutor(), + annotations=ToolAnnotations( + title=PLEASE_HELP_ME_TOOL_NAME, + readOnlyHint=True, + destructiveHint=False, + idempotentHint=True, + openWorldHint=False, + ), + ) + ] + + +register_tool(PLEASE_HELP_ME_TOOL_NAME, PleaseHelpMeTool.create) diff --git a/server/agent/user_help.py b/server/agent/user_help.py new file mode 100644 index 0000000..7e3c29d --- /dev/null +++ b/server/agent/user_help.py @@ -0,0 +1,54 @@ +"""Helpers for explicit agent-to-human help requests.""" + +from collections.abc import Sequence +from typing import Any + +from openhands.sdk.event import ActionEvent, MessageEvent +from openhands.sdk.event.base import Event + + +PLEASE_HELP_ME_TOOL_NAME = "please_help_me" + + +def get_pending_user_help_request(events: Sequence[Event]) -> dict[str, Any] | None: + """Return the latest unresolved help request for the current turn, if any.""" + for event in reversed(events): + if isinstance(event, MessageEvent) and event.source == "user": + return None + + if ( + isinstance(event, ActionEvent) + and event.source == "agent" + and event.tool_name == PLEASE_HELP_ME_TOOL_NAME + ): + action = event.action + message = getattr(action, "message", None) + if not isinstance(message, str) or not message.strip(): + message = "The agent needs your help before it can continue." + + return { + "message": message, + "tool_call_id": event.tool_call_id, + } + + return None + + +def build_completion_event_payload( + conversation_id: str, events: Sequence[Event] +) -> dict[str, Any]: + """Build the SSE completion payload for a finished run.""" + help_request = get_pending_user_help_request(events) + if help_request is not None: + return { + "conversation_id": conversation_id, + "message": "Agent is waiting for user help", + "needs_user_help": True, + "help_request": help_request["message"], + "tool_call_id": help_request["tool_call_id"], + } + + return { + "conversation_id": conversation_id, + "message": "Conversation completed", + } diff --git a/server/agent/visualizer.py b/server/agent/visualizer.py index a037fc9..d5671d6 100644 --- a/server/agent/visualizer.py +++ b/server/agent/visualizer.py @@ -19,6 +19,7 @@ ) from server.api.sse import SSEEvent +from server.agent.user_help import PLEASE_HELP_ME_TOOL_NAME from server.core.session_manager import session_manager logger = logging.getLogger(__name__) @@ -88,6 +89,7 @@ def on_event(self, event: Event) -> None: "type": event_type, "text": text_content, "timestamp": getattr(event, "timestamp", None), + "source": getattr(event, "source", None), } # Handle different event types using isinstance for clarity @@ -97,13 +99,22 @@ def on_event(self, event: Event) -> None: # Process specific event types (mutually exclusive) if isinstance(event, ActionEvent): + sse_data["tool_name"] = event.tool_name + sse_data["tool_call_id"] = event.tool_call_id # ActionEvent has action attribute if event.action: sse_data["action"] = str(event.action) if event.summary: sse_data["summary"] = str(event.summary) + if event.tool_name == PLEASE_HELP_ME_TOOL_NAME and event.action: + help_request = getattr(event.action, "message", None) + if isinstance(help_request, str) and help_request.strip(): + sse_data["help_request"] = help_request + sse_data["awaiting_user_help"] = True elif isinstance(event, ObservationEvent): + sse_data["tool_name"] = event.tool_name + sse_data["tool_call_id"] = event.tool_call_id # ObservationEvent has observation attribute with possible image content obs = event.observation # Extract observation properties (same as original hasattr checks) @@ -121,6 +132,11 @@ def on_event(self, event: Event) -> None: sse_data["image"] = obs.image_url elif hasattr(obs, "image") and obs.image: sse_data["image"] = obs.image + if event.tool_name == PLEASE_HELP_ME_TOOL_NAME: + sse_data["awaiting_user_help"] = True + help_request = getattr(obs, "help_message", None) + if isinstance(help_request, str) and help_request.strip(): + sse_data["help_request"] = help_request elif isinstance(event, MessageEvent): # MessageEvent has llm_message with role information diff --git a/server/core/browser_executor_bundle.py b/server/core/browser_executor_bundle.py index 9391570..da1b78b 100644 --- a/server/core/browser_executor_bundle.py +++ b/server/core/browser_executor_bundle.py @@ -19,6 +19,7 @@ from server.agent.manager import OpenBrowserAgentManager from server.api.sse import SSEEvent +from server.agent.user_help import build_completion_event_payload from server.core.processor import CommandProcessor from server.models.commands import parse_command, CommandResponse @@ -300,10 +301,10 @@ async def execute_agent_message( event_queue.put( SSEEvent( "complete", - { - "conversation_id": self.conversation_id, - "message": "Conversation completed", - }, + build_completion_event_payload( + self.conversation_id, + conv_state.conversation.state.events, + ), ) ) diff --git a/server/tests/unit/test_agent_manager_process.py b/server/tests/unit/test_agent_manager_process.py index 3af806f..6a0166e 100644 --- a/server/tests/unit/test_agent_manager_process.py +++ b/server/tests/unit/test_agent_manager_process.py @@ -65,9 +65,8 @@ def test_multi_process_mode_initializes_infrastructure(self) -> None: def test_large_models_keep_full_browser_toolset(self) -> None: """Large models should keep javascript plus general tools.""" - manager = OpenBrowserAgentManager() - with patch("server.agent.manager.llm_config_manager") as mock_llm_config: + manager = OpenBrowserAgentManager() mock_llm_config.reload_config.return_value = MagicMock() mock_llm_config.get_llm_config.return_value = MagicMock( model="dashscope/qwen3.5-plus", @@ -86,6 +85,7 @@ def test_large_models_keep_full_browser_toolset(self) -> None: "element_interaction", "dialog", "javascript", + "please_help_me", "terminal", "file_editor", "task_tracker", @@ -93,9 +93,8 @@ def test_large_models_keep_full_browser_toolset(self) -> None: def test_small_models_drop_javascript_only(self) -> None: """Small models keep general tools but lose javascript.""" - manager = OpenBrowserAgentManager() - with patch("server.agent.manager.llm_config_manager") as mock_llm_config: + manager = OpenBrowserAgentManager() mock_llm_config.reload_config.return_value = MagicMock() mock_llm_config.get_llm_config.return_value = MagicMock( model="dashscope/qwen3.5-flash", @@ -113,6 +112,7 @@ def test_small_models_drop_javascript_only(self) -> None: "highlight", "element_interaction", "dialog", + "please_help_me", "terminal", "file_editor", "task_tracker", @@ -120,9 +120,8 @@ def test_small_models_drop_javascript_only(self) -> None: def test_unknown_models_default_to_large_profile(self) -> None: """Unconfigured models should keep the large-model toolset.""" - manager = OpenBrowserAgentManager() - with patch("server.agent.manager.llm_config_manager") as mock_llm_config: + manager = OpenBrowserAgentManager() mock_llm_config.reload_config.return_value = MagicMock() mock_llm_config.get_llm_config.return_value = MagicMock( model="dashscope/qwen3.5-plus", @@ -135,6 +134,7 @@ def test_unknown_models_default_to_large_profile(self) -> None: ] assert "javascript" in tool_names + assert "please_help_me" in tool_names def test_system_prompt_kwargs_follow_large_model_profile(self) -> None: """Large models should advertise full browser freedom in system prompt.""" diff --git a/server/tests/unit/test_user_help.py b/server/tests/unit/test_user_help.py new file mode 100644 index 0000000..c4c185a --- /dev/null +++ b/server/tests/unit/test_user_help.py @@ -0,0 +1,80 @@ +"""Tests for explicit user-help requests.""" + +import queue +from unittest.mock import patch + +from openhands.sdk.event import ActionEvent, MessageEvent +from openhands.sdk.llm import Message, MessageToolCall, TextContent + +from server.agent.user_help import build_completion_event_payload +from server.agent.visualizer import QueueVisualizer +from server.agent.tools.help_tool import PleaseHelpMeAction + + +def _help_action_event(message: str) -> ActionEvent: + tool_call = MessageToolCall( + id="call-help-1", + name="please_help_me", + arguments='{"message": "ignored"}', + origin="completion", + ) + return ActionEvent( + source="agent", + thought=[TextContent(text="Need help")], + action=PleaseHelpMeAction(message=message), + tool_name="please_help_me", + tool_call_id="call-help-1", + tool_call=tool_call, + llm_response_id="resp-help-1", + ) + + +def test_build_completion_event_payload_marks_pending_user_help() -> None: + payload = build_completion_event_payload( + "conv-help-1", + [_help_action_event("Please solve the CAPTCHA, then reply ok.")], + ) + + assert payload == { + "conversation_id": "conv-help-1", + "message": "Agent is waiting for user help", + "needs_user_help": True, + "help_request": "Please solve the CAPTCHA, then reply ok.", + "tool_call_id": "call-help-1", + } + + +def test_build_completion_event_payload_clears_after_user_reply() -> None: + payload = build_completion_event_payload( + "conv-help-2", + [ + _help_action_event("Please approve the login."), + MessageEvent( + source="user", + llm_message=Message(role="user", content=[TextContent(text="done")]), + ), + ], + ) + + assert payload == { + "conversation_id": "conv-help-2", + "message": "Conversation completed", + } + + +def test_visualizer_surfaces_structured_help_request_fields() -> None: + event_queue: queue.Queue = queue.Queue() + visualizer = QueueVisualizer(event_queue=event_queue, conversation_id="conv-help-3") + + with patch("server.agent.visualizer.session_manager.save_event", return_value=True): + visualizer.on_event(_help_action_event("Please complete MFA and reply done.")) + + sse_event = event_queue.get_nowait() + + assert sse_event.event_type == "agent_event" + assert sse_event.data["type"] == "ActionEvent" + assert sse_event.data["tool_name"] == "please_help_me" + assert sse_event.data["tool_call_id"] == "call-help-1" + assert sse_event.data["help_request"] == "Please complete MFA and reply done." + assert sse_event.data["awaiting_user_help"] is True + assert sse_event.data["source"] == "agent" diff --git a/uv.lock b/uv.lock index 9eeff06..61dda0c 100644 --- a/uv.lock +++ b/uv.lock @@ -1675,8 +1675,8 @@ requires-dist = [ { name = "litellm", git = "https://github.com/softpudding/litellm.git?rev=bfba5e3889829067baeab3b12d38008360913771" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.7.0" }, { name = "numpy", specifier = ">=1.24.0" }, - { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=4e6dab44f253b3c03c46f7469dc58aba97540c88" }, - { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=4e6dab44f253b3c03c46f7469dc58aba97540c88" }, + { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5" }, + { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5" }, { name = "pillow", specifier = ">=10.0.0" }, { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.0" }, { name = "pydantic", specifier = ">=2.5.0" }, @@ -2221,7 +2221,7 @@ wheels = [ [[package]] name = "openhands-sdk" version = "1.12.0" -source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=4e6dab44f253b3c03c46f7469dc58aba97540c88#4e6dab44f253b3c03c46f7469dc58aba97540c88" } +source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5#e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5" } dependencies = [ { name = "agent-client-protocol" }, { name = "deprecation" }, @@ -2241,7 +2241,7 @@ dependencies = [ [[package]] name = "openhands-tools" version = "1.12.0" -source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=4e6dab44f253b3c03c46f7469dc58aba97540c88#4e6dab44f253b3c03c46f7469dc58aba97540c88" } +source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5#e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5" } dependencies = [ { name = "bashlex" }, { name = "binaryornot" }, From 66249e0384d2120982401d99e2ba0af020525f79 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 26 Mar 2026 13:30:43 +0800 Subject: [PATCH 3/7] fix(extension): keep semantic controls clickable in swipe regions --- .../src/__tests__/highlight-detection.test.ts | 6 +++- .../commands/highlight-detection.injected.js | 31 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/extension/src/__tests__/highlight-detection.test.ts b/extension/src/__tests__/highlight-detection.test.ts index 4c16134..eb0c6c6 100644 --- a/extension/src/__tests__/highlight-detection.test.ts +++ b/extension/src/__tests__/highlight-detection.test.ts @@ -213,15 +213,19 @@ describe('highlight-detection helpers', () => { expect(script).toContain('interactionHints'); }); - test('buildHighlightDetectionScript hides clickable when swipe or scroll affordances are present', () => { + test('buildHighlightDetectionScript keeps semantic controls clickable in swipe regions', () => { const script = buildHighlightDetectionScript({ elementType: 'any' }); const start = script.indexOf('function toInteractiveElement'); const end = script.indexOf('function countVisibleClickableCandidates', start); const toInteractiveElementSource = script.slice(start, end); + expect(script).toContain('function isSemanticControlElement'); expect(toInteractiveElementSource).toContain( "candidate.type === 'clickable'", ); + expect(toInteractiveElementSource).toContain( + '!isSemanticControlElement(candidate.element)', + ); expect(toInteractiveElementSource).toContain( "interactionHints.includes('swipable')", ); diff --git a/extension/src/commands/highlight-detection.injected.js b/extension/src/commands/highlight-detection.injected.js index 80d16f7..a499e71 100644 --- a/extension/src/commands/highlight-detection.injected.js +++ b/extension/src/commands/highlight-detection.injected.js @@ -450,6 +450,36 @@ function getSemanticClickableSignal(el) { return null; } +function isSemanticControlElement(el) { + if (!(el instanceof HTMLElement)) { + return false; + } + + const tag = el.tagName.toLowerCase(); + const role = (el.getAttribute('role') || '').toLowerCase(); + + if (tag === 'button' || tag === 'summary') { + return true; + } + + if (tag === 'a' && (el.getAttribute('href') || el.hasAttribute('target'))) { + return true; + } + + if (tag === 'input') { + const inputType = (el.getAttribute('type') || 'text').toLowerCase(); + if ( + ['button', 'submit', 'reset', 'image', 'checkbox', 'radio'].includes( + inputType, + ) + ) { + return true; + } + } + + return POINTER_ROLE_SET.has(role); +} + function isMeaningfulPointerCandidate(el) { if (!(el instanceof HTMLElement)) { return false; @@ -1439,6 +1469,7 @@ function toInteractiveElement(candidate) { const interactionHints = getInteractionHints(candidate.element); const displayType = candidate.type === 'clickable' && + !isSemanticControlElement(candidate.element) && (interactionHints.includes('swipable') || isScrollableCandidate(candidate.element)) ? 'scrollable' From 69c8c38b9ec31fb0e69ce0d4b79562581bf4eee2 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 26 Mar 2026 21:20:14 +0800 Subject: [PATCH 4/7] Remove JavaScript browser tool and pin agent-sdk --- AGENTS.md | 38 +----- pyproject.toml | 4 +- server/AGENTS.md | 16 +-- server/agent/api.py | 4 +- server/agent/manager.py | 21 +--- .../big_model/element_interaction_tool.j2 | 6 +- .../prompts/big_model/javascript_tool.j2 | 60 ---------- .../prompts/small_model/javascript_tool.j2 | 60 ---------- server/agent/tools/browser_executor.py | 52 +------- server/agent/tools/javascript_tool.py | 111 ------------------ server/agent/tools/prompt_context.py | 6 - server/agent/tools/toolset.py | 21 ++-- .../integration/test_toolset_integration.py | 33 ++++-- .../tests/unit/test_agent_manager_process.py | 19 ++- server/tests/unit/test_prompt_context.py | 1 - server/tests/unit/test_toolset.py | 9 +- uv.lock | 8 +- 17 files changed, 64 insertions(+), 405 deletions(-) delete mode 100644 server/agent/prompts/big_model/javascript_tool.j2 delete mode 100644 server/agent/prompts/small_model/javascript_tool.j2 delete mode 100644 server/agent/tools/javascript_tool.py diff --git a/AGENTS.md b/AGENTS.md index f7226fe..ad00f55 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -39,8 +39,7 @@ OpenBrowser/ | Highlight tool | `server/agent/tools/highlight_tool.py` | HighlightTool for element discovery | | Element interaction | `server/agent/tools/element_interaction_tool.py` | ElementInteractionTool with 2PC flow | | Dialog tool | `server/agent/tools/dialog_tool.py` | DialogTool for dialog handling | -| JavaScript tool | `server/agent/tools/javascript_tool.py` | JavaScriptTool for fallback execution | -| ToolSet aggregator | `server/agent/tools/toolset.py` | OpenBrowserToolSet aggregates all 5 tools | +| ToolSet aggregator | `server/agent/tools/toolset.py` | OpenBrowserToolSet aggregates all 4 tools | | Extension entry | `extension/src/background/index.ts` | Command handler, dialog processing | | Dialog manager | `extension/src/commands/dialog.ts` | CDP dialog events, cascading | | JavaScript execution | `extension/src/commands/javascript.ts` | CDP Runtime.evaluate, dialog race | @@ -154,26 +153,15 @@ OpenBrowser uses Jinja2 templates for agent prompts, enabling dynamic content in ### Template Structure - **Location**: `server/agent/prompts/` directory - **Format**: `.j2` extension with Jinja2 syntax -- **5 Tool Templates**: Each of the 5 focused tools has its own template: +- **4 Tool Templates**: Each of the 4 focused tools has its own template: - `tab_tool.j2` - Tab management documentation - `highlight_tool.j2` - Element discovery with color coding - `element_interaction_tool.j2` - 2PC flow with orange confirmations - `dialog_tool.j2` - Dialog handling - - `javascript_tool.j2` - JavaScript fallback - -### Dynamic JavaScript Control -The `javascript_execute` command can be disabled via environment variable: -```bash -export OPEN_BROWSER_DISABLE_JAVASCRIPT_EXECUTE=1 -``` -When disabled: -- Template removes all `javascript_execute` references using `{% if not disable_javascript %}` conditionals -- `OpenBrowserAction.type` description excludes `'javascript_execute'` -- Command execution returns error if attempted ### Template Features - **Conditional rendering**: Use `{% if %}` blocks for configurable sections -- **Variable injection**: Pass context variables like `disable_javascript` at render time +- **Variable injection**: Pass context variables like model profile flags at render time - **Clean output**: `trim_blocks=True` and `lstrip_blocks=True` remove extra whitespace - **Caching**: Templates are cached after first load for performance @@ -246,8 +234,8 @@ Elements are identified by a 6-character hash string: | `scroll_element` | Scroll by element ID | `{element_id: "m5k2p8", direction: "down"}` | | `keyboard_input` | Type into element | `{element_id: "j4n7q1", text: "hello"}` | -### Tool Mapping (5-Tool Architecture) -The visual interaction workflow is implemented across 5 focused tools: +### Tool Mapping (4-Tool Architecture) +The visual interaction workflow is implemented across 4 focused tools: | Tool | Commands | Purpose | |------|----------|---------| @@ -255,25 +243,9 @@ The visual interaction workflow is implemented across 5 focused tools: | `highlight` | `highlight_elements` | Element discovery with blue overlays | | `element_interaction` | `click_element`, `confirm_click_element`, `hover_element`, `scroll_element`, `keyboard_input`, `confirm_keyboard_input`, `select_element` | Element interaction with 2PC only for click and keyboard input | | `dialog` | `handle_dialog` | Dialog handling (accept/dismiss) | -| `javascript` | `javascript_execute` | JavaScript fallback execution | ## UNIQUE PATTERNS -### JavaScript-First Automation (Fallback) -For complex interactions not covered by visual commands: -```javascript -// Click by visible text (universal pattern) -(() => { - const text = 'YOUR_TEXT'; - const leaf = Array.from(document.querySelectorAll('*')) - .find(el => el.children.length === 0 && el.textContent.includes(text)); - if (!leaf) return 'not found'; - const target = leaf.closest('a, button, [role="button"]') || leaf; - target.click(); - return 'clicked: ' + target.tagName; -})() -``` - ### Multi-Session Tab Isolation - `tab init ` creates managed session with tab group - `conversation_id` ties all commands to session diff --git a/pyproject.toml b/pyproject.toml index cc66190..b955a1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,8 @@ dependencies = [ "pillow>=10.0.0", "numpy>=1.24.0", "requests>=2.31.0", - "openhands-sdk @ git+https://github.com/softpudding/agent-sdk.git@e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5#subdirectory=openhands-sdk", - "openhands-tools @ git+https://github.com/softpudding/agent-sdk.git@e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5#subdirectory=openhands-tools", + "openhands-sdk @ git+https://github.com/softpudding/agent-sdk.git@56c315b34a8d3f0b3d1c89cba57115dd859edbb0#subdirectory=openhands-sdk", + "openhands-tools @ git+https://github.com/softpudding/agent-sdk.git@56c315b34a8d3f0b3d1c89cba57115dd859edbb0#subdirectory=openhands-tools", "litellm @ git+https://github.com/softpudding/litellm.git@bfba5e3889829067baeab3b12d38008360913771", ] diff --git a/server/AGENTS.md b/server/AGENTS.md index 9f085fe..5e620ca 100644 --- a/server/AGENTS.md +++ b/server/AGENTS.md @@ -22,7 +22,6 @@ FastAPI backend handling REST API, WebSocket communication with Chrome extension | Highlight tool | `agent/tools/highlight_tool.py` | `HighlightTool`, `HighlightAction` | | Element interaction | `agent/tools/element_interaction_tool.py` | `ElementInteractionTool`, `ElementInteractionAction` | | Dialog tool | `agent/tools/dialog_tool.py` | `DialogTool`, `DialogHandleAction` | -| JavaScript tool | `agent/tools/javascript_tool.py` | `JavaScriptTool`, `JavaScriptAction` | | ToolSet aggregator | `agent/tools/toolset.py` | `OpenBrowserToolSet` | ## STRUCTURE @@ -209,9 +208,9 @@ After the screenshot refactor, the server layer no longer proactively triggers s **Best Practice:** Use `tab view` when you need a clean screenshot after navigation or JavaScript execution. -## 5-TOOL ARCHITECTURE +## 4-TOOL ARCHITECTURE -OpenBrowser now uses 5 focused tools instead of a single monolithic tool: +OpenBrowser now uses 4 focused tools instead of a single monolithic tool: ### 1. Tab Tool (`tab`) - **Purpose**: Browser tab management with session isolation @@ -235,22 +234,17 @@ OpenBrowser now uses 5 focused tools instead of a single monolithic tool: - **Dialog types**: `alert` (auto-accepted), `confirm`, `prompt`, `beforeunload` - **Required**: Handle dialogs before continuing browser operations -### 5. JavaScript Tool (`javascript`) -- **Purpose**: Custom JavaScript execution as fallback mechanism -- **When to use**: When visual commands fail (2-Strike Rule) or for complex DOM manipulation -- **Guidelines**: Return JSON-serializable values, 30-second timeout - ### Shared Architecture -- **Shared executor**: All 5 tools share executor for 2PC state management +- **Shared executor**: All 4 tools share executor for 2PC state management - **Conversation isolation**: Each conversation has isolated state - **Backward compatibility**: `OpenBrowserTool` still available with deprecation warning -- **ToolSet**: `OpenBrowserToolSet` aggregates all 5 tools for registration +- **ToolSet**: `OpenBrowserToolSet` aggregates all 4 tools for registration ## NOTES - WebSocket runs on port 8766, HTTP on 8765 - `conversation_id` links all commands to session context -- Agent uses OpenHands SDK with 5 focused tools (tab, highlight, element_interaction, dialog, javascript) +- Agent uses OpenHands SDK with 4 focused tools (tab, highlight, element_interaction, dialog) - Dialogs block screenshot/JS until handled ## ANTI-PATTERNS diff --git a/server/agent/api.py b/server/agent/api.py index 6e07fc6..33c7d92 100644 --- a/server/agent/api.py +++ b/server/agent/api.py @@ -437,10 +437,8 @@ def initialize_agent(): from .tools.highlight_tool import HighlightTool from .tools.element_interaction_tool import ElementInteractionTool from .tools.dialog_tool import DialogTool - from .tools.javascript_tool import JavaScriptTool - logger.info( - "5 focused OpenBrowser tools registered: tab, highlight, element_interaction, dialog, javascript" + "4 focused OpenBrowser tools registered: tab, highlight, element_interaction, dialog" ) except Exception as e: diff --git a/server/agent/manager.py b/server/agent/manager.py index b046cec..eff2b1d 100644 --- a/server/agent/manager.py +++ b/server/agent/manager.py @@ -96,7 +96,6 @@ def __init__(self, multi_process_mode: bool = False): Tool(name="highlight"), # Element discovery with visual overlays Tool(name="element_interaction"), # Click/input with 2PC, others direct Tool(name="dialog"), # Browser dialog handling - Tool(name="javascript"), # JavaScript execution fallback ] self.general_tools = [ Tool(name=PLEASE_HELP_ME_TOOL_NAME), @@ -135,22 +134,9 @@ def _resolve_llm_settings( def _get_tools_for_model( self, model: Optional[str] = None, model_alias: Optional[str] = None ) -> list[Tool]: - """Return the tool list for a model tier. - - Small models keep the general agentic tools but use a narrower browser - toolset to reduce risky action branching. - """ - resolved_model, _, _ = self._resolve_llm_settings( - model=model, model_alias=model_alias - ) - - browser_tools = list(self.browser_tools) - if is_small_model(resolved_model): - browser_tools = [ - tool for tool in browser_tools if tool.name != "javascript" - ] - - return browser_tools + list(self.general_tools) + """Return the tool list for a model tier.""" + self._resolve_llm_settings(model=model, model_alias=model_alias) + return list(self.browser_tools) + list(self.general_tools) def _create_llm_from_config( self, @@ -196,7 +182,6 @@ def _get_system_prompt_kwargs( return { "model_profile": get_model_profile(resolved_model), "small_model": small_model, - "javascript_available": not small_model, } def _build_session_metadata( diff --git a/server/agent/prompts/big_model/element_interaction_tool.j2 b/server/agent/prompts/big_model/element_interaction_tool.j2 index 4a2e901..92ecf66 100644 --- a/server/agent/prompts/big_model/element_interaction_tool.j2 +++ b/server/agent/prompts/big_model/element_interaction_tool.j2 @@ -116,11 +116,7 @@ Use this to: - If highlight shows a target `scrollable` region, use `scroll` on that region rather than clicking nearby paging controls - If highlight shows the `swipable` hint, treat the region as a carousel/slider and use `swipe` over `scroll` - Do not compensate for a missing highlight target by guessing `next`, `prev`, `close`, or similar keywords -{% if not disable_javascript %} -- If scroll/swipe fails twice and the task still requires progress, use the JavaScript tool only as a last-resort fallback for this system -{% else %} -- If scroll/swipe fails repeatedly, re-orient around currently visible items because JavaScript fallback is disabled in this environment -{% endif %} +- If scroll/swipe fails repeatedly, re-orient around currently visible items and keep recovery inside the visual browser tools ### swipe Swipe a carousel / gallery / swiper region by its visual ID. diff --git a/server/agent/prompts/big_model/javascript_tool.j2 b/server/agent/prompts/big_model/javascript_tool.j2 deleted file mode 100644 index 3431ab0..0000000 --- a/server/agent/prompts/big_model/javascript_tool.j2 +++ /dev/null @@ -1,60 +0,0 @@ -# JavaScript Tool - -Custom JavaScript execution for system-specific fallback only. - -## Overview - -The JavaScript Tool executes arbitrary JavaScript code in the browser context. Use this only as a **fallback** when OpenBrowser's visual tools cannot accomplish the task. The prompt should teach system-specific constraints, not generic JavaScript techniques you already know. - -## When to Use JavaScript - -**Use visual commands first**: -1. `highlight` to identify the target -2. If the page state is unchanged and the target is still missing, your first reaction should be to continue `element_type: "any"` pagination -3. `element_interaction` to click / hover / scroll / swipe / input / select -4. `dialog` immediately if JavaScript or interaction opens a blocking dialog - -**Switch to JavaScript only when**: -- Complex DOM manipulation needed -- Visual commands fail twice (2-Strike Rule) -- Special browser APIs or framework APIs are required -- You need to inspect page state that is not exposed by visual tools -- You have already exhausted the reliable highlight-driven discovery flow rather than abandoning it after the first page - -## Commands - -### javascript -Execute arbitrary JavaScript. - -```json -{ "script": "(() => { return document.title; })()" } -``` - -**Parameters**: -- `script`: JavaScript code to execute - -**System-Specific Guidelines**: -- **30-second timeout**: Scripts timing out will return error -- **Return JSON-serializable values only**: No DOM nodes, functions, or circular structures -- **Use IIFE**: Wrap in `(() => { ... })()` for explicit return statements -- **Console output captured**: `console.log()` output is included in response -- **Dialog detection**: Dialogs triggered during execution are automatically detected and require the dialog tool -- **Prefer framework-safe events**: If a page action must be simulated, dispatch the event sequence the app expects rather than relying on a brittle shortcut -- **Rebuild discovery after state-changing JS**: JavaScript does not return a screenshot, so after any page-changing script the next discovery step must be `highlight` with `element_type: "any"` before choosing the next element -- **Use JavaScript to complement, not replace, the visual workflow**: Prefer reading state, calling a known page API, or performing a narrow fallback step rather than re-implementing the whole task in JavaScript -- **Do not use JavaScript just because highlight page 1 did not show the target**: On the same unchanged page state, continue `highlight` pagination with `element_type: "any"` first - -### Minimal example -```javascript -(() => { - return { title: document.title, url: location.href }; -})() -``` - -## Error Handling - -- **Syntax error**: Returns error with stack trace -- **Timeout**: Returns error after 30 seconds -- **Dialog opened**: Returns dialog info, requiring the dialog tool -- **Non-serializable return**: Returns error -- **Security violation**: Returns error (cross-origin access, etc.) diff --git a/server/agent/prompts/small_model/javascript_tool.j2 b/server/agent/prompts/small_model/javascript_tool.j2 deleted file mode 100644 index 3431ab0..0000000 --- a/server/agent/prompts/small_model/javascript_tool.j2 +++ /dev/null @@ -1,60 +0,0 @@ -# JavaScript Tool - -Custom JavaScript execution for system-specific fallback only. - -## Overview - -The JavaScript Tool executes arbitrary JavaScript code in the browser context. Use this only as a **fallback** when OpenBrowser's visual tools cannot accomplish the task. The prompt should teach system-specific constraints, not generic JavaScript techniques you already know. - -## When to Use JavaScript - -**Use visual commands first**: -1. `highlight` to identify the target -2. If the page state is unchanged and the target is still missing, your first reaction should be to continue `element_type: "any"` pagination -3. `element_interaction` to click / hover / scroll / swipe / input / select -4. `dialog` immediately if JavaScript or interaction opens a blocking dialog - -**Switch to JavaScript only when**: -- Complex DOM manipulation needed -- Visual commands fail twice (2-Strike Rule) -- Special browser APIs or framework APIs are required -- You need to inspect page state that is not exposed by visual tools -- You have already exhausted the reliable highlight-driven discovery flow rather than abandoning it after the first page - -## Commands - -### javascript -Execute arbitrary JavaScript. - -```json -{ "script": "(() => { return document.title; })()" } -``` - -**Parameters**: -- `script`: JavaScript code to execute - -**System-Specific Guidelines**: -- **30-second timeout**: Scripts timing out will return error -- **Return JSON-serializable values only**: No DOM nodes, functions, or circular structures -- **Use IIFE**: Wrap in `(() => { ... })()` for explicit return statements -- **Console output captured**: `console.log()` output is included in response -- **Dialog detection**: Dialogs triggered during execution are automatically detected and require the dialog tool -- **Prefer framework-safe events**: If a page action must be simulated, dispatch the event sequence the app expects rather than relying on a brittle shortcut -- **Rebuild discovery after state-changing JS**: JavaScript does not return a screenshot, so after any page-changing script the next discovery step must be `highlight` with `element_type: "any"` before choosing the next element -- **Use JavaScript to complement, not replace, the visual workflow**: Prefer reading state, calling a known page API, or performing a narrow fallback step rather than re-implementing the whole task in JavaScript -- **Do not use JavaScript just because highlight page 1 did not show the target**: On the same unchanged page state, continue `highlight` pagination with `element_type: "any"` first - -### Minimal example -```javascript -(() => { - return { title: document.title, url: location.href }; -})() -``` - -## Error Handling - -- **Syntax error**: Returns error with stack trace -- **Timeout**: Returns error after 30 seconds -- **Dialog opened**: Returns dialog info, requiring the dialog tool -- **Non-serializable return**: Returns error -- **Security violation**: Returns error (cross-origin access, etc.) diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py index dac0f4b..1927714 100644 --- a/server/agent/tools/browser_executor.py +++ b/server/agent/tools/browser_executor.py @@ -1,12 +1,11 @@ """ -BrowserExecutor - Unified executor for handling all 5 OpenBrowser tool actions. +BrowserExecutor - Unified executor for handling all 4 OpenBrowser tool actions. -This executor can handle actions from all 5 focused tools: +This executor can handle actions from all 4 focused tools: - TabAction (from tab_tool.py) - BaseHighlightAction (from highlight_tool.py) - ElementInteractionAction (from element_interaction_tool.py) - DialogHandleAction (from dialog_tool.py) -- JavaScriptAction (from javascript_tool.py) All actions inherit from OpenBrowserAction and share common conversation_id. This executor provides consistent command execution and pending confirmation state. @@ -24,7 +23,6 @@ from server.models.commands import ( TabCommand, GetTabsCommand, - JavascriptExecuteCommand, HandleDialogCommand, DialogAction, TabAction as TabActionEnum, @@ -45,10 +43,6 @@ from server.agent.tools.highlight_tool import BaseHighlightAction from server.agent.tools.element_interaction_tool import ElementInteractionAction from server.agent.tools.dialog_tool import DialogHandleAction -from server.agent.tools.javascript_tool import ( - JavaScriptAction, - DISABLE_JAVASCRIPT_EXECUTE, -) from server.agent.tools.base import OpenBrowserAction, OpenBrowserObservation @@ -85,7 +79,7 @@ def remove_browser_executor(conversation_id: str) -> None: class BrowserExecutor(ToolExecutor[OpenBrowserAction, OpenBrowserObservation]): - """Unified executor for all 5 OpenBrowser tool actions. + """Unified executor for all 4 OpenBrowser tool actions. This executor can handle any action that inherits from OpenBrowserAction, providing consistent pending confirmation state and command execution @@ -138,7 +132,7 @@ def _execute_action_sync(self, action: Any) -> OpenBrowserObservation: Args: action: Any action that inherits from OpenBrowserAction (TabAction, BaseHighlightAction, ElementInteractionAction, - DialogHandleAction, JavaScriptAction) + DialogHandleAction) Returns: OpenBrowserObservation with results of the operation @@ -174,8 +168,6 @@ def _execute_action_sync(self, action: Any) -> OpenBrowserObservation: return self._execute_element_interaction_action(action) elif isinstance(action, DialogHandleAction): return self._execute_dialog_action(action) - elif isinstance(action, JavaScriptAction): - return self._execute_javascript_action(action) else: raise ValueError(f"Unknown action type: {type(action).__name__}") @@ -620,42 +612,6 @@ def _execute_dialog_action( message = f"Dialog handled: {dialog_action_str}" return self._build_observation_from_result(result_dict, message) - def _execute_javascript_action( - self, action: JavaScriptAction - ) -> OpenBrowserObservation: - """Execute a JavaScript execution action.""" - logger.debug( - f"DEBUG: _execute_javascript_action called with script length={len(action.script)}" - ) - - # Check if javascript_execute is disabled via environment variable - if DISABLE_JAVASCRIPT_EXECUTE: - return OpenBrowserObservation( - success=False, - error="javascript_execute command is disabled via OPEN_BROWSER_DISABLE_JAVASCRIPT_EXECUTE environment variable", - tabs=[], - screenshot_data_url=None, - ) - - # Validate required parameters - if not action.script: - raise ValueError("javascript requires script parameter") - - command = JavascriptExecuteCommand( - script=action.script, conversation_id=self.conversation_id - ) - result_dict = self._execute_command_sync(command) - - # Truncate long scripts for message - script = action.script - if len(script) > 50: - message = f"Executed JavaScript: '{script[:50]}...'" - else: - message = f"Executed JavaScript: '{script}'" - - observation = self._build_observation_from_result(result_dict, message) - return observation - # ========== 2PC State Management Methods ========== def _clear_pending_confirmation(self): diff --git a/server/agent/tools/javascript_tool.py b/server/agent/tools/javascript_tool.py deleted file mode 100644 index d094116..0000000 --- a/server/agent/tools/javascript_tool.py +++ /dev/null @@ -1,111 +0,0 @@ -""" -JavaScriptTool - AI tool for executing JavaScript in the browser. - -This tool serves as a fallback mechanism for complex browser interactions -that are not covered by visual commands (highlight, click, hover, scroll, keyboard_input). - -Key characteristics: -- Results must be JSON-serializable (no DOM nodes) -- For React/Vue apps, dispatch full event sequence (not just .click()) -- Call `screenshot` command after if visual feedback is needed -- Can be disabled via OPEN_BROWSER_DISABLE_JAVASCRIPT_EXECUTE environment variable -""" - -import os -import logging -from collections.abc import Sequence - -from pydantic import Field -from openhands.sdk.tool import ( - ToolDefinition, - ToolAnnotations, - register_tool, -) - -from server.agent.tools.base import OpenBrowserAction, OpenBrowserObservation -from server.agent.tools.prompt_loader import render_tool_prompt - -logger = logging.getLogger(__name__) - -# Environment variable to disable javascript tool -DISABLE_JAVASCRIPT_EXECUTE = os.getenv( - "OPEN_BROWSER_DISABLE_JAVASCRIPT_EXECUTE", "" -).lower() in ("1", "true", "yes") - -def get_javascript_tool_description(conv_state=None) -> str: - """Get the JavaScriptTool description, rendered from Jinja2 template.""" - return render_tool_prompt( - "javascript_tool.j2", - conv_state, - disable_javascript=DISABLE_JAVASCRIPT_EXECUTE - ) - - -class JavaScriptAction(OpenBrowserAction): - """Action for executing JavaScript code in the browser.""" - - script: str = Field( - description="JavaScript code to execute. Must return a JSON-serializable value." - ) - - -class JavaScriptTool(ToolDefinition[JavaScriptAction, OpenBrowserObservation]): - """Tool for executing JavaScript in the browser. - - This tool provides a dedicated interface for JavaScript execution, - following the same pattern as other tools in the system. - """ - - name = "javascript" - - @classmethod - def create(cls, conv_state, terminal_executor=None) -> Sequence["JavaScriptTool"]: - """Create JavaScriptTool instance. - - Args: - conv_state: Conversation state for session isolation. - terminal_executor: Optional BrowserExecutor instance for handling commands. - If None, creates a new BrowserExecutor. - - Returns: - List containing the JavaScriptTool instance - """ - # Check if disabled via environment variable - if DISABLE_JAVASCRIPT_EXECUTE: - logger.info( - "JavaScriptTool is disabled via OPEN_BROWSER_DISABLE_JAVASCRIPT_EXECUTE" - ) - return [] - - # Use provided executor or get shared executor for conversation - if terminal_executor is not None: - executor = terminal_executor - else: - # Try to get conversation ID from conv_state to share executor across tools - # conv_state: openhands-sdk ConversationState - conversation_id = getattr(conv_state, "id", None) - - # Get shared executor for this conversation (or create new if no conversation_id) - from server.agent.tools.browser_executor import get_browser_executor - - executor = get_browser_executor(conversation_id) - - return [ - cls( - description=get_javascript_tool_description(conv_state), - action_type=JavaScriptAction, - observation_type=OpenBrowserObservation, - annotations=ToolAnnotations( - title="JavaScript", - readOnlyHint=False, - destructiveHint=False, - idempotentHint=False, - openWorldHint=True, - ), - executor=executor, - ) - ] - - -# Register the tool -register_tool("javascript", JavaScriptTool.create) diff --git a/server/agent/tools/prompt_context.py b/server/agent/tools/prompt_context.py index 0609bec..dd68b65 100644 --- a/server/agent/tools/prompt_context.py +++ b/server/agent/tools/prompt_context.py @@ -2,7 +2,6 @@ from __future__ import annotations -import os from typing import Any from server.core.llm_config import llm_config_manager @@ -41,13 +40,8 @@ def get_prompt_render_context(conv_state: Any = None) -> dict[str, Any]: model_name = None profile = get_model_profile(model_name) - disable_javascript = os.getenv( - "OPEN_BROWSER_DISABLE_JAVASCRIPT_EXECUTE", "" - ).lower() in ("1", "true", "yes") - return { "model_name": model_name, "model_profile": profile, "small_model": is_small_model(model_name), - "disable_javascript": disable_javascript, } diff --git a/server/agent/tools/toolset.py b/server/agent/tools/toolset.py index 81bea27..0b8694e 100644 --- a/server/agent/tools/toolset.py +++ b/server/agent/tools/toolset.py @@ -1,8 +1,8 @@ """ -OpenBrowserToolSet - Aggregates all 5 OpenBrowser tools into a unified toolset. +OpenBrowserToolSet - Aggregates all 4 OpenBrowser tools into a unified toolset. This module provides the OpenBrowserToolSet class that creates and manages -all 5 focused OpenBrowser tools with a shared executor for state consistency. +all 4 focused OpenBrowser tools with a shared executor for state consistency. Following the OpenHands SDK ToolSet pattern from browser_use.definition.BrowserToolSet. """ @@ -16,12 +16,11 @@ from server.agent.tools.dialog_tool import DialogTool from server.agent.tools.element_interaction_tool import ElementInteractionTool from server.agent.tools.highlight_tool import HighlightTool -from server.agent.tools.javascript_tool import JavaScriptTool from server.agent.tools.tab_tool import TabTool class OpenBrowserToolSet(ToolDefinition): - """Aggregates all 5 OpenBrowser tools with shared executor. + """Aggregates all 4 OpenBrowser tools with shared executor. This toolset provides a unified interface for registering all OpenBrowser tools while ensuring they share the same executor instance. This is critical @@ -33,14 +32,13 @@ class OpenBrowserToolSet(ToolDefinition): - HighlightTool: Element discovery with collision-free visual overlays - ElementInteractionTool: Click/input with 2PC, hover/scroll/swipe/select direct - DialogTool: Browser dialog (alert/confirm/prompt/beforeunload) handling - - JavaScriptTool: Custom JavaScript execution for complex interactions Example: >>> tools = OpenBrowserToolSet.create(None) >>> len(tools) - 5 + 4 >>> [tool.name for tool in tools] - ['tab', 'highlight', 'element_interaction', 'dialog', 'javascript'] + ['tab', 'highlight', 'element_interaction', 'dialog'] """ @classmethod @@ -49,7 +47,7 @@ def create( conv_state, executor: Optional[BrowserExecutor] = None, ) -> Sequence[ToolDefinition]: - """Create all 5 OpenBrowser tools with shared executor. + """Create all 4 OpenBrowser tools with shared executor. Args: executor: Optional BrowserExecutor instance for handling commands. @@ -57,9 +55,9 @@ def create( registration in production use). Returns: - List of 5 ToolDefinition instances (TabTool, HighlightTool, - ElementInteractionTool, DialogTool, JavaScriptTool), all sharing - the same executor instance. + List of 4 ToolDefinition instances (TabTool, HighlightTool, + ElementInteractionTool, DialogTool), all sharing the same executor + instance. Note: The executor must be shared across all tools to enable: @@ -80,7 +78,6 @@ def create( HighlightTool, ElementInteractionTool, DialogTool, - JavaScriptTool, ]: tools.extend(tool_class.create(conv_state, executor)) diff --git a/server/tests/integration/test_toolset_integration.py b/server/tests/integration/test_toolset_integration.py index b45a028..08f5726 100644 --- a/server/tests/integration/test_toolset_integration.py +++ b/server/tests/integration/test_toolset_integration.py @@ -48,7 +48,6 @@ class MockTaskTrackerTool: tools_module.__path__ = [str(tools_dir)] sys.modules.setdefault("server.agent.tools", tools_module) -from server.agent.tools import javascript_tool from server.agent.tools.toolset import OpenBrowserToolSet @@ -60,6 +59,7 @@ def test_pending_confirmation_is_shared_across_tool_instances(self) -> None: shared_executor._set_pending_confirmation( element_id="abc123", + highlight_snapshot_id=17, action_type="click", full_html="", ) @@ -72,29 +72,36 @@ def test_pending_confirmation_is_shared_across_tool_instances(self) -> None: tools["tab"].executor._get_pending_confirmation()["action_type"] == "click" ) - def test_confirmed_element_cache_is_isolated_by_conversation(self) -> None: + def test_pending_confirmation_is_isolated_by_conversation(self) -> None: tools = {tool.name: tool for tool in OpenBrowserToolSet.create(None)} executor = tools["element_interaction"].executor executor.conversation_id = "conv-1" - executor._add_confirmed_element("elem-1") + executor._set_pending_confirmation( + element_id="elem-1", + highlight_snapshot_id=11, + action_type="click", + full_html="", + ) executor.conversation_id = "conv-2" - assert executor._is_element_confirmed("elem-1") is False + assert executor._get_pending_confirmation() is None - executor._add_confirmed_element("elem-2") - assert executor._is_element_confirmed("elem-2") is True + executor._set_pending_confirmation( + element_id="elem-2", + highlight_snapshot_id=22, + action_type="keyboard_input", + full_html='', + ) + assert executor._get_pending_confirmation()["element_id"] == "elem-2" + assert executor._get_pending_confirmation()["highlight_snapshot_id"] == 22 executor.conversation_id = "conv-1" - assert executor._is_element_confirmed("elem-1") is True - assert executor._is_element_confirmed("elem-2") is False - - def test_disabling_javascript_keeps_core_workflow_tools_available( - self, monkeypatch - ) -> None: - monkeypatch.setattr(javascript_tool, "DISABLE_JAVASCRIPT_EXECUTE", True) + assert executor._get_pending_confirmation()["element_id"] == "elem-1" + assert executor._get_pending_confirmation()["highlight_snapshot_id"] == 11 + def test_toolset_keeps_core_workflow_tools_available(self) -> None: tools = OpenBrowserToolSet.create(None) assert [tool.name for tool in tools] == [ diff --git a/server/tests/unit/test_agent_manager_process.py b/server/tests/unit/test_agent_manager_process.py index 6a0166e..d031e78 100644 --- a/server/tests/unit/test_agent_manager_process.py +++ b/server/tests/unit/test_agent_manager_process.py @@ -63,8 +63,8 @@ def test_multi_process_mode_initializes_infrastructure(self) -> None: assert manager._process_manager is not None assert manager._ipc_router is not None - def test_large_models_keep_full_browser_toolset(self) -> None: - """Large models should keep javascript plus general tools.""" + def test_large_models_keep_core_browser_toolset(self) -> None: + """Large models should expose the four browser tools plus general tools.""" with patch("server.agent.manager.llm_config_manager") as mock_llm_config: manager = OpenBrowserAgentManager() mock_llm_config.reload_config.return_value = MagicMock() @@ -84,15 +84,14 @@ def test_large_models_keep_full_browser_toolset(self) -> None: "highlight", "element_interaction", "dialog", - "javascript", "please_help_me", "terminal", "file_editor", "task_tracker", ] - def test_small_models_drop_javascript_only(self) -> None: - """Small models keep general tools but lose javascript.""" + def test_small_models_keep_the_same_browser_toolset(self) -> None: + """Small models should use the same four browser tools.""" with patch("server.agent.manager.llm_config_manager") as mock_llm_config: manager = OpenBrowserAgentManager() mock_llm_config.reload_config.return_value = MagicMock() @@ -119,7 +118,7 @@ def test_small_models_drop_javascript_only(self) -> None: ] def test_unknown_models_default_to_large_profile(self) -> None: - """Unconfigured models should keep the large-model toolset.""" + """Unconfigured models should keep the standard browser toolset.""" with patch("server.agent.manager.llm_config_manager") as mock_llm_config: manager = OpenBrowserAgentManager() mock_llm_config.reload_config.return_value = MagicMock() @@ -133,11 +132,11 @@ def test_unknown_models_default_to_large_profile(self) -> None: tool.name for tool in manager._get_tools_for_model("some/new-model") ] - assert "javascript" in tool_names + assert "dialog" in tool_names assert "please_help_me" in tool_names def test_system_prompt_kwargs_follow_large_model_profile(self) -> None: - """Large models should advertise full browser freedom in system prompt.""" + """Large models should get the large-profile system prompt kwargs.""" manager = OpenBrowserAgentManager() with patch("server.agent.manager.llm_config_manager") as mock_llm_config: @@ -153,11 +152,10 @@ def test_system_prompt_kwargs_follow_large_model_profile(self) -> None: assert kwargs == { "model_profile": "large", "small_model": False, - "javascript_available": True, } def test_system_prompt_kwargs_follow_small_model_profile(self) -> None: - """Small models should get the constrained system prompt variant.""" + """Small models should get the small-profile system prompt kwargs.""" manager = OpenBrowserAgentManager() with patch("server.agent.manager.llm_config_manager") as mock_llm_config: @@ -173,7 +171,6 @@ def test_system_prompt_kwargs_follow_small_model_profile(self) -> None: assert kwargs == { "model_profile": "small", "small_model": True, - "javascript_available": False, } diff --git a/server/tests/unit/test_prompt_context.py b/server/tests/unit/test_prompt_context.py index 81580c8..f996258 100644 --- a/server/tests/unit/test_prompt_context.py +++ b/server/tests/unit/test_prompt_context.py @@ -31,7 +31,6 @@ def test_prompt_context_defaults_to_large_profile_without_conversation() -> None "model_name": None, "model_profile": "large", "small_model": False, - "disable_javascript": False, } diff --git a/server/tests/unit/test_toolset.py b/server/tests/unit/test_toolset.py index d236a59..40f8039 100644 --- a/server/tests/unit/test_toolset.py +++ b/server/tests/unit/test_toolset.py @@ -50,7 +50,6 @@ class MockTaskTrackerTool: sys.modules["openhands.tools.preset"] = types.ModuleType("openhands.tools.preset") from server.agent.tools.browser_executor import BrowserExecutor -from server.agent.tools import javascript_tool from server.agent.tools.toolset import OpenBrowserToolSet @@ -60,7 +59,7 @@ def test_create_shares_one_executor_within_single_toolset(self) -> None: executors = {tool.executor for tool in tools} - assert len(tools) == 5 + assert len(tools) == 4 assert len(executors) == 1 assert isinstance(next(iter(executors)), BrowserExecutor) @@ -80,13 +79,10 @@ def test_provided_executor_is_reused_for_all_tools_in_workflow_order(self) -> No "highlight", "element_interaction", "dialog", - "javascript", ] assert all(tool.executor is executor for tool in tools) - def test_disabling_javascript_removes_only_fallback_tool(self, monkeypatch) -> None: - monkeypatch.setattr(javascript_tool, "DISABLE_JAVASCRIPT_EXECUTE", True) - + def test_create_exposes_only_core_browser_tools(self) -> None: tools = OpenBrowserToolSet.create(None) assert [tool.name for tool in tools] == [ @@ -103,4 +99,3 @@ def test_capability_hints_match_actual_tool_behavior(self) -> None: assert tools["highlight"].annotations.readOnlyHint is True assert tools["element_interaction"].annotations.destructiveHint is True assert tools["dialog"].annotations.readOnlyHint is False - assert tools["javascript"].annotations.openWorldHint is True diff --git a/uv.lock b/uv.lock index 61dda0c..88ad2be 100644 --- a/uv.lock +++ b/uv.lock @@ -1675,8 +1675,8 @@ requires-dist = [ { name = "litellm", git = "https://github.com/softpudding/litellm.git?rev=bfba5e3889829067baeab3b12d38008360913771" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.7.0" }, { name = "numpy", specifier = ">=1.24.0" }, - { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5" }, - { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5" }, + { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=56c315b34a8d3f0b3d1c89cba57115dd859edbb0" }, + { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=56c315b34a8d3f0b3d1c89cba57115dd859edbb0" }, { name = "pillow", specifier = ">=10.0.0" }, { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.0" }, { name = "pydantic", specifier = ">=2.5.0" }, @@ -2221,7 +2221,7 @@ wheels = [ [[package]] name = "openhands-sdk" version = "1.12.0" -source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5#e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5" } +source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=56c315b34a8d3f0b3d1c89cba57115dd859edbb0#56c315b34a8d3f0b3d1c89cba57115dd859edbb0" } dependencies = [ { name = "agent-client-protocol" }, { name = "deprecation" }, @@ -2241,7 +2241,7 @@ dependencies = [ [[package]] name = "openhands-tools" version = "1.12.0" -source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5#e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5" } +source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=56c315b34a8d3f0b3d1c89cba57115dd859edbb0#56c315b34a8d3f0b3d1c89cba57115dd859edbb0" } dependencies = [ { name = "bashlex" }, { name = "binaryornot" }, From 18b76a94d193e31b8a1578c18a46d379139f23d6 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 26 Mar 2026 23:30:39 +0800 Subject: [PATCH 5/7] Update evaluation report --- eval/evaluation_report.json | 308 ++++++++++++++++++------------------ 1 file changed, 154 insertions(+), 154 deletions(-) diff --git a/eval/evaluation_report.json b/eval/evaluation_report.json index 7393560..e34b5d9 100644 --- a/eval/evaluation_report.json +++ b/eval/evaluation_report.json @@ -1,11 +1,11 @@ { "evaluation": { - "timestamp": "2026-03-26 01:06:07", - "unix_timestamp": 1774458367.956289, + "timestamp": "2026-03-26 23:28:03", + "unix_timestamp": 1774538883.639733, "summary": { "total_tests": 22, - "passed_tests": 21, - "pass_rate": 95.45, + "passed_tests": 20, + "pass_rate": 90.91, "models_tested": [ "dashscope/qwen3.5-plus", "dashscope/qwen3.5-flash" @@ -13,27 +13,27 @@ }, "model_performance": { "dashscope/qwen3.5-plus": { - "pass_rate": 90.91, - "task_score": 61.0, + "pass_rate": 100.0, + "task_score": 62.5, "task_max_score": 62.5, - "efficiency_score": 7.2697, - "usage_score": 4.256, - "composite_score": 0.755, - "avg_duration": 165.39, - "avg_cost": 0.663257, - "passed_count": 10, + "efficiency_score": 7.6698, + "usage_score": 4.4243, + "composite_score": 0.8199, + "avg_duration": 146.25, + "avg_cost": 0.639522, + "passed_count": 11, "total_tests": 11 }, "dashscope/qwen3.5-flash": { - "pass_rate": 100.0, - "task_score": 62.5, + "pass_rate": 81.82, + "task_score": 54.0, "task_max_score": 62.5, - "efficiency_score": 7.3059, - "usage_score": 8.7355, - "composite_score": 0.8917, - "avg_duration": 161.03, - "avg_cost": 0.19694, - "passed_count": 11, + "efficiency_score": 6.0924, + "usage_score": 9.0375, + "composite_score": 0.766, + "avg_duration": 220.62, + "avg_cost": 0.185729, + "passed_count": 9, "total_tests": 11 } }, @@ -42,26 +42,26 @@ "name": "BlueBook Search And Like Test", "results_by_model": { "dashscope/qwen3.5-plus": { - "passed": false, - "task_score": 4.5, + "passed": true, + "task_score": 6.0, "task_max_score": 6.0, - "efficiency_score": 0.6649, - "usage_score": 0.4188, - "composite_score": 0.2167, - "total_score": 5.58, - "duration": 100.53, - "cost": 0.348724 + "efficiency_score": 0.637, + "usage_score": 0.2891, + "composite_score": 0.7852, + "total_score": 6.93, + "duration": 108.9, + "cost": 0.426542 }, "dashscope/qwen3.5-flash": { "passed": true, "task_score": 6.0, "task_max_score": 6.0, - "efficiency_score": 0.7076, - "usage_score": 0.8345, - "composite_score": 0.9084, - "total_score": 7.54, - "duration": 87.71, - "cost": 0.099273 + "efficiency_score": 0.6236, + "usage_score": 0.7717, + "composite_score": 0.8791, + "total_score": 7.4, + "duration": 112.92, + "cost": 0.136968 } } }, @@ -72,23 +72,23 @@ "passed": true, "task_score": 3, "task_max_score": 3, - "efficiency_score": 0.7872, - "usage_score": 0.7718, - "composite_score": 0.9118, + "efficiency_score": 0.7897, + "usage_score": 0.773, + "composite_score": 0.9125, "total_score": 4.56, - "duration": 63.85, - "cost": 0.182561 + "duration": 63.1, + "cost": 0.181606 }, "dashscope/qwen3.5-flash": { "passed": true, "task_score": 3, "task_max_score": 3, - "efficiency_score": 0.8726, - "usage_score": 0.9595, - "composite_score": 0.9664, - "total_score": 4.83, - "duration": 38.22, - "cost": 0.032404 + "efficiency_score": 0.7887, + "usage_score": 0.9319, + "composite_score": 0.9441, + "total_score": 4.72, + "duration": 63.38, + "cost": 0.054464 } } }, @@ -99,23 +99,23 @@ "passed": true, "task_score": 9.0, "task_max_score": 9.0, - "efficiency_score": 0.404, - "usage_score": 0.2516, - "composite_score": 0.7311, - "total_score": 9.66, - "duration": 417.19, - "cost": 1.49673 + "efficiency_score": 0.5631, + "usage_score": 0.2639, + "composite_score": 0.7654, + "total_score": 9.83, + "duration": 305.82, + "cost": 1.472194 }, "dashscope/qwen3.5-flash": { "passed": true, "task_score": 9.0, "task_max_score": 9.0, - "efficiency_score": 0.6498, - "usage_score": 0.8601, - "composite_score": 0.902, - "total_score": 10.51, - "duration": 245.11, - "cost": 0.279726 + "efficiency_score": 0.6021, + "usage_score": 0.8314, + "composite_score": 0.8867, + "total_score": 10.43, + "duration": 278.5, + "cost": 0.337292 } } }, @@ -126,23 +126,23 @@ "passed": true, "task_score": 2.5, "task_max_score": 2.5, - "efficiency_score": 0.6196, - "usage_score": 0.5748, - "composite_score": 0.8389, - "total_score": 3.69, - "duration": 152.16, - "cost": 0.340137 + "efficiency_score": 0.758, + "usage_score": 0.5909, + "composite_score": 0.8698, + "total_score": 3.85, + "duration": 96.81, + "cost": 0.32725 }, "dashscope/qwen3.5-flash": { "passed": true, "task_score": 2.5, "task_max_score": 2.5, - "efficiency_score": 0.8196, - "usage_score": 0.9146, - "composite_score": 0.9468, - "total_score": 4.23, - "duration": 72.16, - "cost": 0.068325 + "efficiency_score": 0.7436, + "usage_score": 0.9038, + "composite_score": 0.9295, + "total_score": 4.15, + "duration": 102.54, + "cost": 0.076921 } } }, @@ -153,23 +153,23 @@ "passed": true, "task_score": 9.5, "task_max_score": 9.5, - "efficiency_score": 0.6174, - "usage_score": 0.0907, - "composite_score": 0.7416, - "total_score": 10.21, - "duration": 191.31, - "cost": 0.90927 + "efficiency_score": 0.5944, + "usage_score": 0.0415, + "composite_score": 0.7272, + "total_score": 10.14, + "duration": 202.82, + "cost": 0.958494 }, "dashscope/qwen3.5-flash": { - "passed": true, - "task_score": 9.5, + "passed": false, + "task_score": 6.0, "task_max_score": 9.5, - "efficiency_score": 0.6108, - "usage_score": 0.7116, - "composite_score": 0.8645, - "total_score": 10.82, - "duration": 194.61, - "cost": 0.288375 + "efficiency_score": 0, + "usage_score": 0.9935, + "composite_score": 0.1987, + "total_score": 6.99, + "duration": 500.0, + "cost": 0.006517 } } }, @@ -180,23 +180,23 @@ "passed": true, "task_score": 12.0, "task_max_score": 12.0, - "efficiency_score": 0.697, - "usage_score": 0.3845, - "composite_score": 0.8163, + "efficiency_score": 0.6687, + "usage_score": 0.4162, + "composite_score": 0.817, "total_score": 13.08, - "duration": 151.48, - "cost": 0.738602 + "duration": 165.67, + "cost": 0.700563 }, "dashscope/qwen3.5-flash": { "passed": true, "task_score": 12.0, "task_max_score": 12.0, - "efficiency_score": 0.6493, - "usage_score": 0.8553, - "composite_score": 0.9009, - "total_score": 13.5, - "duration": 175.36, - "cost": 0.173623 + "efficiency_score": 0.688, + "usage_score": 0.8486, + "composite_score": 0.9073, + "total_score": 13.54, + "duration": 156.0, + "cost": 0.181736 } } }, @@ -207,23 +207,23 @@ "passed": true, "task_score": 2, "task_max_score": 2, - "efficiency_score": 0.8385, - "usage_score": 0.762, - "composite_score": 0.9201, - "total_score": 3.6, - "duration": 48.45, - "cost": 0.119009 + "efficiency_score": 0.8682, + "usage_score": 0.7719, + "composite_score": 0.928, + "total_score": 3.64, + "duration": 39.55, + "cost": 0.114047 }, "dashscope/qwen3.5-flash": { "passed": true, "task_score": 2, "task_max_score": 2, - "efficiency_score": 0.8944, - "usage_score": 0.9556, - "composite_score": 0.97, + "efficiency_score": 0.8999, + "usage_score": 0.9453, + "composite_score": 0.969, "total_score": 3.85, - "duration": 31.69, - "cost": 0.022211 + "duration": 30.03, + "cost": 0.027328 } } }, @@ -234,23 +234,23 @@ "passed": true, "task_score": 5.0, "task_max_score": 5.0, - "efficiency_score": 0.5267, - "usage_score": 0.1531, - "composite_score": 0.736, - "total_score": 5.68, - "duration": 189.34, - "cost": 0.84689 + "efficiency_score": 0.6309, + "usage_score": 0.3874, + "composite_score": 0.8037, + "total_score": 6.02, + "duration": 147.63, + "cost": 0.612606 }, "dashscope/qwen3.5-flash": { "passed": true, "task_score": 5.0, "task_max_score": 5.0, - "efficiency_score": 0.0609, - "usage_score": 0.5408, - "composite_score": 0.7204, - "total_score": 5.6, - "duration": 375.62, - "cost": 0.459171 + "efficiency_score": 0.622, + "usage_score": 0.835, + "composite_score": 0.8914, + "total_score": 6.46, + "duration": 151.2, + "cost": 0.164994 } } }, @@ -261,23 +261,23 @@ "passed": true, "task_score": 3.5, "task_max_score": 3.5, - "efficiency_score": 0.737, - "usage_score": 0.4947, - "composite_score": 0.8463, - "total_score": 4.73, - "duration": 131.52, - "cost": 0.606343 + "efficiency_score": 0.7444, + "usage_score": 0.51, + "composite_score": 0.8509, + "total_score": 4.75, + "duration": 127.79, + "cost": 0.587982 }, "dashscope/qwen3.5-flash": { "passed": true, "task_score": 3.5, "task_max_score": 3.5, - "efficiency_score": 0.7542, - "usage_score": 0.8895, - "composite_score": 0.9287, - "total_score": 5.14, - "duration": 122.9, - "cost": 0.13258 + "efficiency_score": 0.0687, + "usage_score": 0.5945, + "composite_score": 0.7326, + "total_score": 4.16, + "duration": 465.64, + "cost": 0.486634 } } }, @@ -288,23 +288,23 @@ "passed": true, "task_score": 3, "task_max_score": 3, - "efficiency_score": 0.7219, + "efficiency_score": 0.7553, "usage_score": 0, - "composite_score": 0.7444, - "total_score": 3.72, - "duration": 166.87, - "cost": 0.73836 + "composite_score": 0.7511, + "total_score": 3.76, + "duration": 146.83, + "cost": 0.723918 }, "dashscope/qwen3.5-flash": { "passed": true, "task_score": 3, "task_max_score": 3, - "efficiency_score": 0.6795, - "usage_score": 0.4314, - "composite_score": 0.8222, - "total_score": 4.11, - "duration": 192.3, - "cost": 0.284306 + "efficiency_score": 0.7302, + "usage_score": 0.6429, + "composite_score": 0.8746, + "total_score": 4.37, + "duration": 161.88, + "cost": 0.178565 } } }, @@ -315,23 +315,23 @@ "passed": true, "task_score": 7.0, "task_max_score": 7.0, - "efficiency_score": 0.6556, - "usage_score": 0.3539, - "composite_score": 0.8019, - "total_score": 8.01, - "duration": 206.63, - "cost": 0.969205 + "efficiency_score": 0.6602, + "usage_score": 0.3803, + "composite_score": 0.8081, + "total_score": 8.04, + "duration": 203.86, + "cost": 0.929538 }, "dashscope/qwen3.5-flash": { - "passed": true, - "task_score": 7.0, + "passed": false, + "task_score": 2.0, "task_max_score": 7.0, - "efficiency_score": 0.6072, - "usage_score": 0.7824, - "composite_score": 0.8779, - "total_score": 8.39, - "duration": 235.67, - "cost": 0.326348 + "efficiency_score": 0.3255, + "usage_score": 0.7389, + "composite_score": 0.2129, + "total_score": 3.06, + "duration": 404.7, + "cost": 0.3916 } } } From 566a9ba1a1149af4c60102717586d242f516a65a Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Fri, 27 Mar 2026 12:26:01 +0800 Subject: [PATCH 6/7] Improve confirmation shortcuts and activation handling --- eval/evaluation_report.json | 300 +++++++++--------- .../element-actions-regression.test.ts | 15 + extension/src/commands/element-actions.ts | 37 ++- server/agent/tools/base.py | 6 - server/agent/tools/browser_executor.py | 96 ++++++ .../tests/unit/test_agent_browser_executor.py | 194 ++++++++++- server/tests/unit/test_base_classes.py | 3 +- 7 files changed, 491 insertions(+), 160 deletions(-) diff --git a/eval/evaluation_report.json b/eval/evaluation_report.json index e34b5d9..c59c71c 100644 --- a/eval/evaluation_report.json +++ b/eval/evaluation_report.json @@ -1,11 +1,11 @@ { "evaluation": { - "timestamp": "2026-03-26 23:28:03", - "unix_timestamp": 1774538883.639733, + "timestamp": "2026-03-27 00:45:37", + "unix_timestamp": 1774543537.602308, "summary": { "total_tests": 22, - "passed_tests": 20, - "pass_rate": 90.91, + "passed_tests": 21, + "pass_rate": 95.45, "models_tested": [ "dashscope/qwen3.5-plus", "dashscope/qwen3.5-flash" @@ -16,24 +16,24 @@ "pass_rate": 100.0, "task_score": 62.5, "task_max_score": 62.5, - "efficiency_score": 7.6698, - "usage_score": 4.4243, - "composite_score": 0.8199, - "avg_duration": 146.25, - "avg_cost": 0.639522, + "efficiency_score": 7.9068, + "usage_score": 4.7439, + "composite_score": 0.83, + "avg_duration": 135.49, + "avg_cost": 0.588661, "passed_count": 11, "total_tests": 11 }, "dashscope/qwen3.5-flash": { - "pass_rate": 81.82, - "task_score": 54.0, + "pass_rate": 90.91, + "task_score": 56.0, "task_max_score": 62.5, - "efficiency_score": 6.0924, - "usage_score": 9.0375, - "composite_score": 0.766, - "avg_duration": 220.62, - "avg_cost": 0.185729, - "passed_count": 9, + "efficiency_score": 7.8906, + "usage_score": 9.2849, + "composite_score": 0.8577, + "avg_duration": 133.22, + "avg_cost": 0.152816, + "passed_count": 10, "total_tests": 11 } }, @@ -45,23 +45,23 @@ "passed": true, "task_score": 6.0, "task_max_score": 6.0, - "efficiency_score": 0.637, - "usage_score": 0.2891, - "composite_score": 0.7852, - "total_score": 6.93, - "duration": 108.9, - "cost": 0.426542 + "efficiency_score": 0.6587, + "usage_score": 0.2884, + "composite_score": 0.7894, + "total_score": 6.95, + "duration": 102.4, + "cost": 0.42695 }, "dashscope/qwen3.5-flash": { "passed": true, "task_score": 6.0, "task_max_score": 6.0, - "efficiency_score": 0.6236, - "usage_score": 0.7717, - "composite_score": 0.8791, - "total_score": 7.4, - "duration": 112.92, - "cost": 0.136968 + "efficiency_score": 0.6927, + "usage_score": 0.8331, + "composite_score": 0.9052, + "total_score": 7.53, + "duration": 92.2, + "cost": 0.100139 } } }, @@ -72,23 +72,23 @@ "passed": true, "task_score": 3, "task_max_score": 3, - "efficiency_score": 0.7897, - "usage_score": 0.773, - "composite_score": 0.9125, - "total_score": 4.56, - "duration": 63.1, - "cost": 0.181606 + "efficiency_score": 0.7931, + "usage_score": 0.7763, + "composite_score": 0.9139, + "total_score": 4.57, + "duration": 62.06, + "cost": 0.178958 }, "dashscope/qwen3.5-flash": { "passed": true, "task_score": 3, "task_max_score": 3, - "efficiency_score": 0.7887, - "usage_score": 0.9319, - "composite_score": 0.9441, - "total_score": 4.72, - "duration": 63.38, - "cost": 0.054464 + "efficiency_score": 0.733, + "usage_score": 0.8868, + "composite_score": 0.9239, + "total_score": 4.62, + "duration": 80.11, + "cost": 0.090585 } } }, @@ -99,23 +99,23 @@ "passed": true, "task_score": 9.0, "task_max_score": 9.0, - "efficiency_score": 0.5631, - "usage_score": 0.2639, - "composite_score": 0.7654, - "total_score": 9.83, - "duration": 305.82, - "cost": 1.472194 + "efficiency_score": 0.5473, + "usage_score": 0.2619, + "composite_score": 0.7618, + "total_score": 9.81, + "duration": 316.88, + "cost": 1.47621 }, "dashscope/qwen3.5-flash": { - "passed": true, - "task_score": 9.0, + "passed": false, + "task_score": 2.5, "task_max_score": 9.0, - "efficiency_score": 0.6021, - "usage_score": 0.8314, - "composite_score": 0.8867, - "total_score": 10.43, - "duration": 278.5, - "cost": 0.337292 + "efficiency_score": 0.7064, + "usage_score": 0.8895, + "composite_score": 0.3192, + "total_score": 4.1, + "duration": 205.55, + "cost": 0.221029 } } }, @@ -126,23 +126,23 @@ "passed": true, "task_score": 2.5, "task_max_score": 2.5, - "efficiency_score": 0.758, - "usage_score": 0.5909, - "composite_score": 0.8698, - "total_score": 3.85, - "duration": 96.81, - "cost": 0.32725 + "efficiency_score": 0.7329, + "usage_score": 0.5884, + "composite_score": 0.8643, + "total_score": 3.82, + "duration": 106.82, + "cost": 0.329246 }, "dashscope/qwen3.5-flash": { "passed": true, "task_score": 2.5, "task_max_score": 2.5, - "efficiency_score": 0.7436, - "usage_score": 0.9038, - "composite_score": 0.9295, - "total_score": 4.15, - "duration": 102.54, - "cost": 0.076921 + "efficiency_score": 0.7885, + "usage_score": 0.8918, + "composite_score": 0.9361, + "total_score": 4.18, + "duration": 84.61, + "cost": 0.08655 } } }, @@ -153,23 +153,23 @@ "passed": true, "task_score": 9.5, "task_max_score": 9.5, - "efficiency_score": 0.5944, - "usage_score": 0.0415, - "composite_score": 0.7272, - "total_score": 10.14, - "duration": 202.82, - "cost": 0.958494 + "efficiency_score": 0.676, + "usage_score": 0.1585, + "composite_score": 0.7669, + "total_score": 10.33, + "duration": 162.0, + "cost": 0.841473 }, "dashscope/qwen3.5-flash": { - "passed": false, - "task_score": 6.0, + "passed": true, + "task_score": 9.5, "task_max_score": 9.5, - "efficiency_score": 0, - "usage_score": 0.9935, - "composite_score": 0.1987, - "total_score": 6.99, - "duration": 500.0, - "cost": 0.006517 + "efficiency_score": 0.6157, + "usage_score": 0.7408, + "composite_score": 0.8713, + "total_score": 10.86, + "duration": 192.13, + "cost": 0.259165 } } }, @@ -180,23 +180,23 @@ "passed": true, "task_score": 12.0, "task_max_score": 12.0, - "efficiency_score": 0.6687, - "usage_score": 0.4162, - "composite_score": 0.817, - "total_score": 13.08, - "duration": 165.67, - "cost": 0.700563 + "efficiency_score": 0.6307, + "usage_score": 0.3382, + "composite_score": 0.7938, + "total_score": 12.97, + "duration": 184.64, + "cost": 0.79412 }, "dashscope/qwen3.5-flash": { "passed": true, "task_score": 12.0, "task_max_score": 12.0, - "efficiency_score": 0.688, - "usage_score": 0.8486, - "composite_score": 0.9073, + "efficiency_score": 0.7021, + "usage_score": 0.8414, + "composite_score": 0.9087, "total_score": 13.54, - "duration": 156.0, - "cost": 0.181736 + "duration": 148.94, + "cost": 0.190283 } } }, @@ -207,23 +207,23 @@ "passed": true, "task_score": 2, "task_max_score": 2, - "efficiency_score": 0.8682, - "usage_score": 0.7719, - "composite_score": 0.928, - "total_score": 3.64, - "duration": 39.55, - "cost": 0.114047 + "efficiency_score": 0.8652, + "usage_score": 0.7683, + "composite_score": 0.9267, + "total_score": 3.63, + "duration": 40.44, + "cost": 0.115867 }, "dashscope/qwen3.5-flash": { "passed": true, "task_score": 2, "task_max_score": 2, "efficiency_score": 0.8999, - "usage_score": 0.9453, - "composite_score": 0.969, + "usage_score": 0.946, + "composite_score": 0.9692, "total_score": 3.85, "duration": 30.03, - "cost": 0.027328 + "cost": 0.027012 } } }, @@ -234,23 +234,23 @@ "passed": true, "task_score": 5.0, "task_max_score": 5.0, - "efficiency_score": 0.6309, - "usage_score": 0.3874, - "composite_score": 0.8037, - "total_score": 6.02, - "duration": 147.63, - "cost": 0.612606 + "efficiency_score": 0.6882, + "usage_score": 0.4682, + "composite_score": 0.8313, + "total_score": 6.16, + "duration": 124.71, + "cost": 0.531767 }, "dashscope/qwen3.5-flash": { "passed": true, "task_score": 5.0, "task_max_score": 5.0, - "efficiency_score": 0.622, - "usage_score": 0.835, - "composite_score": 0.8914, - "total_score": 6.46, - "duration": 151.2, - "cost": 0.164994 + "efficiency_score": 0.535, + "usage_score": 0.7822, + "composite_score": 0.8634, + "total_score": 6.32, + "duration": 186.0, + "cost": 0.217815 } } }, @@ -261,23 +261,23 @@ "passed": true, "task_score": 3.5, "task_max_score": 3.5, - "efficiency_score": 0.7444, - "usage_score": 0.51, - "composite_score": 0.8509, - "total_score": 4.75, - "duration": 127.79, - "cost": 0.587982 + "efficiency_score": 0.7916, + "usage_score": 0.621, + "composite_score": 0.8825, + "total_score": 4.91, + "duration": 104.2, + "cost": 0.454825 }, "dashscope/qwen3.5-flash": { "passed": true, "task_score": 3.5, "task_max_score": 3.5, - "efficiency_score": 0.0687, - "usage_score": 0.5945, - "composite_score": 0.7326, - "total_score": 4.16, - "duration": 465.64, - "cost": 0.486634 + "efficiency_score": 0.7624, + "usage_score": 0.8874, + "composite_score": 0.9299, + "total_score": 5.15, + "duration": 118.82, + "cost": 0.135177 } } }, @@ -288,23 +288,23 @@ "passed": true, "task_score": 3, "task_max_score": 3, - "efficiency_score": 0.7553, + "efficiency_score": 0.792, "usage_score": 0, - "composite_score": 0.7511, - "total_score": 3.76, - "duration": 146.83, - "cost": 0.723918 + "composite_score": 0.7584, + "total_score": 3.79, + "duration": 124.81, + "cost": 0.537748 }, "dashscope/qwen3.5-flash": { "passed": true, "task_score": 3, "task_max_score": 3, - "efficiency_score": 0.7302, - "usage_score": 0.6429, - "composite_score": 0.8746, - "total_score": 4.37, - "duration": 161.88, - "cost": 0.178565 + "efficiency_score": 0.781, + "usage_score": 0.7322, + "composite_score": 0.9026, + "total_score": 4.51, + "duration": 131.41, + "cost": 0.133914 } } }, @@ -315,23 +315,23 @@ "passed": true, "task_score": 7.0, "task_max_score": 7.0, - "efficiency_score": 0.6602, - "usage_score": 0.3803, - "composite_score": 0.8081, - "total_score": 8.04, - "duration": 203.86, - "cost": 0.929538 + "efficiency_score": 0.731, + "usage_score": 0.4746, + "composite_score": 0.8411, + "total_score": 8.21, + "duration": 161.37, + "cost": 0.78811 }, "dashscope/qwen3.5-flash": { - "passed": false, - "task_score": 2.0, + "passed": true, + "task_score": 7.0, "task_max_score": 7.0, - "efficiency_score": 0.3255, - "usage_score": 0.7389, - "composite_score": 0.2129, - "total_score": 3.06, - "duration": 404.7, - "cost": 0.3916 + "efficiency_score": 0.674, + "usage_score": 0.8538, + "composite_score": 0.9056, + "total_score": 8.53, + "duration": 195.59, + "cost": 0.219313 } } } diff --git a/extension/src/__tests__/element-actions-regression.test.ts b/extension/src/__tests__/element-actions-regression.test.ts index c13b04f..e1ec551 100644 --- a/extension/src/__tests__/element-actions-regression.test.ts +++ b/extension/src/__tests__/element-actions-regression.test.ts @@ -26,6 +26,9 @@ describe('Element action regressions', () => { expect(elementActionsSource).toContain( 'function getInteractiveActivationTarget(target)', ); + expect(elementActionsSource).toContain( + 'function resolveActivationDispatchTarget(target, activationTarget)', + ); expect(elementActionsSource).toContain( 'dispatchActivationPress(activationTarget, activation.point);', ); @@ -34,6 +37,18 @@ describe('Element action regressions', () => { ); }); + test('performElementClick preserves structured interactive targets like anchors', () => { + expect(elementActionsSource).toContain( + 'if (isPlaceholderCoverForInput(activationTarget, target))', + ); + expect(elementActionsSource).toContain( + 'if (isStructuredInteractiveElement(target))', + ); + expect(elementActionsSource).toContain( + 'keep dispatch on that exact element instead of drifting to a non-interactive ancestor', + ); + }); + test('performKeyboardInput primes editable activation and beforeinput events', () => { expect(elementActionsSource).toContain('const alreadyFocused ='); expect(elementActionsSource).toContain("new InputEvent('beforeinput'"); diff --git a/extension/src/commands/element-actions.ts b/extension/src/commands/element-actions.ts index 05082e3..2871107 100644 --- a/extension/src/commands/element-actions.ts +++ b/extension/src/commands/element-actions.ts @@ -207,6 +207,33 @@ function buildEditableActivationHelpersScript(): string { return { target, point: fallbackPoint }; } + function resolveActivationDispatchTarget(target, activationTarget) { + if (!(target instanceof Element)) { + return activationTarget instanceof Element ? activationTarget : null; + } + + if (!(activationTarget instanceof Element)) { + return target; + } + + // Placeholder covers for text inputs need the visible overlay surface. + if (isPlaceholderCoverForInput(activationTarget, target)) { + return activationTarget; + } + + // If highlight selected a structured interactive element such as or ") + + lru = executor._get_confirmed_action_html_lru("click") + + assert len(lru) == 10 + assert "" not in lru + assert "" in lru + + +def test_repeat_keyboard_input_with_confirmed_html_executes_without_pending_confirmation( + monkeypatch, +) -> None: + executor = BrowserExecutor() + executor.conversation_id = "conv-repeat-input" + executor._remember_confirmed_action_html( + "keyboard_input", '' + ) + + monkeypatch.setattr( + executor, + "_get_element_full_html", + lambda element_id, highlight_snapshot_id: ( + '', + "data:image/png;base64,input-repeat", + ), + ) + + captured = {} + + def fake_execute(command): + captured["command"] = command + return { + "success": True, + "data": {"screenshot": "data:image/png;base64,input-direct"}, + } + + monkeypatch.setattr(executor, "_execute_command_sync", fake_execute) + + observation = executor._execute_action_sync( + ElementInteractionAction( + action="keyboard_input", + element_id="inp123", + highlight_snapshot_id=17, + text="hello", + tab_id=321, + conversation_id="conv-repeat-input", + ) + ) + + assert observation.success is True + assert observation.message == "Input text to element: inp123" + assert observation.pending_confirmation is None + assert captured["command"].element_id == "inp123" + assert captured["command"].highlight_snapshot_id == 17 + assert captured["command"].text == "hello" + assert captured["command"].tab_id == 321 + assert executor._get_pending_confirmation() is None + + +def test_confirmed_click_html_does_not_skip_keyboard_input_confirmation( + monkeypatch, +) -> None: + executor = BrowserExecutor() + executor.conversation_id = "conv-input-action-isolation" + executor._remember_confirmed_action_html( + "click", '' + ) + + monkeypatch.setattr( + executor, + "_get_element_full_html", + lambda element_id, highlight_snapshot_id: ( + '', + "data:image/png;base64,input-pending", + ), + ) + monkeypatch.setattr( + executor, + "_execute_command_sync", + lambda command: (_ for _ in ()).throw(AssertionError("should stay in 2PC")), + ) + + observation = executor._execute_action_sync( + ElementInteractionAction( + action="keyboard_input", + element_id="inp123", + highlight_snapshot_id=17, + text="hello", + conversation_id="conv-input-action-isolation", + ) + ) + + assert observation.success is True + assert ( + observation.message + == "Keyboard input action pending confirmation for element: inp123" + ) + assert observation.pending_confirmation is not None + assert observation.pending_confirmation["action_type"] == "keyboard_input" + + def test_confirm_keyboard_input_reports_nested_extension_error(monkeypatch) -> None: executor = BrowserExecutor() executor.conversation_id = "conv-input-error" diff --git a/server/tests/unit/test_base_classes.py b/server/tests/unit/test_base_classes.py index 0793763..32598e5 100644 --- a/server/tests/unit/test_base_classes.py +++ b/server/tests/unit/test_base_classes.py @@ -175,11 +175,11 @@ def test_pending_confirmation_includes_follow_up_command(self) -> None: text = _text_content(observation) assert "## Pending Confirmation" in text - assert "Inspect the screenshot first." in text assert '{"action": "confirm_click"}' in text assert '"highlight_snapshot_id"' not in text assert '"element_id"' not in text assert "**Element ID**: a1b2c3" in text + assert "**Action Type**: click" in text def test_pending_keyboard_confirmation_uses_matching_follow_up_command(self) -> None: observation = OpenBrowserObservation( @@ -197,6 +197,7 @@ def test_pending_keyboard_confirmation_uses_matching_follow_up_command(self) -> assert '"highlight_snapshot_id"' not in text assert '"element_id"' not in text assert "**Element ID**: inp789" in text + assert "**Action Type**: keyboard_input" in text def test_pending_confirmation_with_screenshot_is_image_first_and_text_minimal(self) -> None: observation = OpenBrowserObservation( From 6e80a552f59347d8a3623f9871330ad74e5f31ad Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Fri, 27 Mar 2026 12:36:35 +0800 Subject: [PATCH 7/7] Stabilize confirmation flow and test coverage --- eval/evaluate_browser_agent.py | 14 ++- .../background-cleanup-regression.test.ts | 8 +- .../element-actions-regression.test.ts | 8 +- .../src/__tests__/highlight-detection.test.ts | 20 +++-- .../__tests__/highlight-integration.test.ts | 17 ++-- .../__tests__/highlight-screenshot.test.ts | 8 +- extension/src/background/index.ts | 25 +++--- .../src/commands/__tests__/element-id.test.ts | 22 +++-- .../__tests__/single-highlight.test.ts | 4 +- .../__tests__/tab-manager-cleanup.test.ts | 10 +-- extension/src/commands/element-actions.ts | 27 ++++-- extension/src/commands/element-cache.ts | 87 +++++++++++++++---- extension/src/commands/screenshot.ts | 8 +- extension/src/utils/collision-detection.ts | 25 +++--- server/agent/api.py | 1 + server/agent/tools/base.py | 4 +- server/agent/tools/browser_executor.py | 35 ++++---- server/agent/tools/help_tool.py | 8 +- server/core/session_manager.py | 42 ++++++--- .../integration/test_element_operations.py | 4 +- .../tests/unit/test_agent_browser_executor.py | 35 +++++--- server/tests/unit/test_base_classes.py | 8 +- server/tests/unit/test_command_models.py | 21 ++++- server/tests/unit/test_eval_client.py | 4 +- server/tests/unit/test_prompt_contracts.py | 70 +++++++++++---- server/tests/unit/test_screenshot_behavior.py | 4 +- server/tests/unit/test_state.py | 31 ++++++- .../tests/unit/test_tool_prompt_profiles.py | 84 +++++++++++++----- 28 files changed, 437 insertions(+), 197 deletions(-) diff --git a/eval/evaluate_browser_agent.py b/eval/evaluate_browser_agent.py index 7666549..d8686c8 100644 --- a/eval/evaluate_browser_agent.py +++ b/eval/evaluate_browser_agent.py @@ -546,6 +546,7 @@ def cleanup_managed_tabs(self, conversation_id: str) -> bool: return all_closed + class EvalServerClient: """Client for evaluation server tracking API""" @@ -609,7 +610,8 @@ def start_openbrowser(self) -> bool: return True root_dir = EVAL_DIR.parent - logger.error(f""" + logger.error( + f""" ❌ OpenBrowser server is not running! Please start the OpenBrowser server manually with: @@ -617,7 +619,8 @@ def start_openbrowser(self) -> bool: uv run local-chrome-server serve The server should start on port 8765 (REST API) and 8766 (WebSocket). -""") +""" + ) return False except Exception as e: @@ -634,7 +637,8 @@ def start_eval_server(self) -> bool: eval_dir = EVAL_DIR root_dir = EVAL_DIR.parent - logger.error(f""" + logger.error( + f""" ❌ Eval server is not running! Please start the eval server manually with: @@ -646,7 +650,8 @@ def start_eval_server(self) -> bool: uv run python eval/server.py The server should start on port 16605. -""") +""" + ) return False except Exception as e: @@ -1397,6 +1402,7 @@ def _check_count_min_condition( def _event_matches_expected(self, event: Dict, expected: Dict) -> bool: """Check if a track event matches expected criteria""" + def normalize_text(value: Any) -> str: return str(value or "").casefold() diff --git a/extension/src/__tests__/background-cleanup-regression.test.ts b/extension/src/__tests__/background-cleanup-regression.test.ts index 22fe2cb..7938fe4 100644 --- a/extension/src/__tests__/background-cleanup-regression.test.ts +++ b/extension/src/__tests__/background-cleanup-regression.test.ts @@ -19,12 +19,16 @@ describe('Background cleanup regressions', () => { test('tab close events are wired into browser-state cleanup', () => { expect(backgroundSource).toContain('tabManager.addTabClosedListener'); - expect(backgroundSource).toContain('cleanupTabState(conversationId, tabId);'); + expect(backgroundSource).toContain( + 'cleanupTabState(conversationId, tabId);', + ); }); test('swipe screenshots reuse tab-view warmup capture options', () => { expect(backgroundSource).toContain("case 'swipe_element': {"); expect(backgroundSource).toContain(' 900,'); - expect(backgroundSource).toContain(' TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS,'); + expect(backgroundSource).toContain( + ' TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS,', + ); }); }); diff --git a/extension/src/__tests__/element-actions-regression.test.ts b/extension/src/__tests__/element-actions-regression.test.ts index e1ec551..845b8ea 100644 --- a/extension/src/__tests__/element-actions-regression.test.ts +++ b/extension/src/__tests__/element-actions-regression.test.ts @@ -55,7 +55,9 @@ describe('Element action regressions', () => { }); test('performElementSwipe includes gesture fallback and settle wait', () => { - expect(elementActionsSource).toContain('async function performGestureSwipe'); + expect(elementActionsSource).toContain( + 'async function performGestureSwipe', + ); expect(elementActionsSource).toContain('await waitForSwipeToSettle('); expect(elementActionsSource).toContain("stepMethod = 'gesture';"); }); @@ -71,6 +73,8 @@ describe('Element action regressions', () => { expect(elementActionsSource).toContain('currentApi.slideNext(0)'); expect(elementActionsSource).toContain('currentApi.slidePrev(0)'); expect(elementActionsSource).toContain('currentApi.scrollNext(true)'); - expect(elementActionsSource).toContain('currentApi.slideTo(targetIndex, 0)'); + expect(elementActionsSource).toContain( + 'currentApi.slideTo(targetIndex, 0)', + ); }); }); diff --git a/extension/src/__tests__/highlight-detection.test.ts b/extension/src/__tests__/highlight-detection.test.ts index eb0c6c6..cd080b8 100644 --- a/extension/src/__tests__/highlight-detection.test.ts +++ b/extension/src/__tests__/highlight-detection.test.ts @@ -29,7 +29,12 @@ function createElement( describe('highlight-detection helpers', () => { test('normalizeHighlightKeywords trims, lowercases, removes whitespace, and deduplicates', () => { expect( - normalizeHighlightKeywords([' Like ', 'like', ' REPLY ', " John's reply "]), + normalizeHighlightKeywords([ + ' Like ', + 'like', + ' REPLY ', + " John's reply ", + ]), ).toEqual(['like', 'reply', "john'sreply"]); }); @@ -75,7 +80,7 @@ describe('highlight-detection helpers', () => { selector: 'button.reply-target', text: "John's reply", searchText: "john ' s reply", - html: '', + html: "", }), createElement({ id: 'reply456', @@ -133,7 +138,9 @@ describe('highlight-detection helpers', () => { const script = buildHighlightDetectionScript({ elementType: 'inputable' }); expect(script).toContain('return el.isContentEditable;'); - expect(script).not.toContain("return el.getAttribute('contenteditable') === 'true';"); + expect(script).not.toContain( + "return el.getAttribute('contenteditable') === 'true';", + ); }); test("buildHighlightDetectionScript keeps 'any' candidate selection across all element types", () => { @@ -216,7 +223,10 @@ describe('highlight-detection helpers', () => { test('buildHighlightDetectionScript keeps semantic controls clickable in swipe regions', () => { const script = buildHighlightDetectionScript({ elementType: 'any' }); const start = script.indexOf('function toInteractiveElement'); - const end = script.indexOf('function countVisibleClickableCandidates', start); + const end = script.indexOf( + 'function countVisibleClickableCandidates', + start, + ); const toInteractiveElementSource = script.slice(start, end); expect(script).toContain('function isSemanticControlElement'); @@ -230,7 +240,7 @@ describe('highlight-detection helpers', () => { "interactionHints.includes('swipable')", ); expect(toInteractiveElementSource).toContain( - "isScrollableCandidate(candidate.element)", + 'isScrollableCandidate(candidate.element)', ); expect(toInteractiveElementSource).toContain("? 'scrollable'"); }); diff --git a/extension/src/__tests__/highlight-integration.test.ts b/extension/src/__tests__/highlight-integration.test.ts index 2122ff2..2dfdb57 100644 --- a/extension/src/__tests__/highlight-integration.test.ts +++ b/extension/src/__tests__/highlight-integration.test.ts @@ -193,7 +193,6 @@ describe('Highlight Integration', () => { expect(elem.labelPosition).toBeDefined(); } }); - }); describe('Collision detection', () => { @@ -370,12 +369,7 @@ describe('Highlight Integration', () => { const result = sortElementsByVisualOrder(elements); - expect(result.map((element) => element.id)).toEqual([ - 'a', - 'b', - 'd', - 'c', - ]); + expect(result.map((element) => element.id)).toEqual(['a', 'b', 'd', 'c']); }); test('should handle empty elements array', () => { @@ -461,12 +455,13 @@ describe('Highlight Integration', () => { elements.push(createElement(`elem${i}`, 'clickable', x, y, 80, 30)); } - const startTime = Date.now(); + const startTime = performance.now(); const result = selectCollisionFreePage(elements, 1); - const duration = Date.now() - startTime; + const duration = performance.now() - startTime; - // Should complete in reasonable time (< 100ms for 100 elements) - expect(duration).toBeLessThan(100); + // Keep this as a coarse smoke test rather than a microbenchmark. + // Bun's wall-clock timing varies noticeably under full-suite load. + expect(duration).toBeLessThan(1000); // Should return some elements expect(result.length).toBeGreaterThan(0); diff --git a/extension/src/__tests__/highlight-screenshot.test.ts b/extension/src/__tests__/highlight-screenshot.test.ts index f4c500d..0f95377 100644 --- a/extension/src/__tests__/highlight-screenshot.test.ts +++ b/extension/src/__tests__/highlight-screenshot.test.ts @@ -55,9 +55,13 @@ describe('Highlight Screenshot', () => { expect(HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS.maxOutputHeight).toBe(1080); expect(HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS.warmupBeforeCapture).toBe(true); expect(HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS.warmupMaxAttempts).toBe(2); - expect(HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS.settleBeforeCapture).toBeUndefined(); + expect( + HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS.settleBeforeCapture, + ).toBeUndefined(); expect(TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS.warmupBeforeCapture).toBe(true); expect(TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS.warmupMaxAttempts).toBe(3); - expect(TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS.settleBeforeCapture).toBeUndefined(); + expect( + TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS.settleBeforeCapture, + ).toBeUndefined(); }); }); diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts index 965ad65..ce63ef3 100644 --- a/extension/src/background/index.ts +++ b/extension/src/background/index.ts @@ -117,7 +117,9 @@ function buildStoredHighlightPages(options: { viewportWidth, viewportHeight, ); - pages.push(assignSequentialElementIds(sortElementsByVisualOrder(pageElements))); + pages.push( + assignSequentialElementIds(sortElementsByVisualOrder(pageElements)), + ); } return pages; @@ -138,7 +140,7 @@ function buildSnapshotPageRefreshScript(options: { return ` (() => { const expectedDocumentId = ${JSON.stringify(expectedDocumentId || '')}; - const highlightSnapshotId = ${highlightSnapshotId ?? "null"}; + const highlightSnapshotId = ${highlightSnapshotId ?? 'null'}; const refreshTargets = ${JSON.stringify(refreshTargets)}; function normalizeIdentityWhitespace(value, maxLength = 240) { @@ -1883,10 +1885,7 @@ async function handleCommand(command: Command): Promise { const highlightSnapshotId = command.highlight_snapshot_id; const requestedKeywords = normalizeHighlightKeywords(keywords); - if ( - highlightSnapshotId !== undefined && - highlightSnapshotId !== null - ) { + if (highlightSnapshotId !== undefined && highlightSnapshotId !== null) { const baseSnapshot = elementCache.getSnapshotPage( conversationId, activeTabId, @@ -1900,7 +1899,9 @@ async function handleCommand(command: Command): Promise { }; } - const cachedKeywords = normalizeHighlightKeywords(baseSnapshot.keywords); + const cachedKeywords = normalizeHighlightKeywords( + baseSnapshot.keywords, + ); if (baseSnapshot.elementType !== elementType) { return { success: false, @@ -2122,9 +2123,7 @@ async function handleCommand(command: Command): Promise { 1; const viewportWidth = screenshotResult.metadata?.viewportWidth || 0; const viewportHeight = screenshotResult.metadata?.viewportHeight || 0; - console.log( - `📐 [HighlightElements] Image scale: ${imageScale}`, - ); + console.log(`📐 [HighlightElements] Image scale: ${imageScale}`); console.log( `📐 [HighlightElements] Viewport: ${viewportWidth}x${viewportHeight} CSS pixels`, ); @@ -2889,9 +2888,9 @@ async function handleCommand(command: Command): Promise { JSON.stringify(freshBbox), ); } else if ( - bboxResult.success - && bboxResult.result?.value - && bboxResult.result.value.ok === false + bboxResult.success && + bboxResult.result?.value && + bboxResult.result.value.ok === false ) { return { success: false, diff --git a/extension/src/commands/__tests__/element-id.test.ts b/extension/src/commands/__tests__/element-id.test.ts index 6c95cd8..632fca1 100644 --- a/extension/src/commands/__tests__/element-id.test.ts +++ b/extension/src/commands/__tests__/element-id.test.ts @@ -25,7 +25,11 @@ describe('element-id', () => { ]); expect(result.map((element) => element.id)).toEqual(['1', '2', '3']); - expect(result.map((element) => element.selector)).toEqual(['#a', '#b', '#c']); + expect(result.map((element) => element.selector)).toEqual([ + '#a', + '#b', + '#c', + ]); }); test('does not mutate the caller-owned element objects', () => { @@ -49,10 +53,7 @@ describe('element-cache highlight snapshots', () => { documentId: 'doc-1', elementType: 'any', totalElements: 2, - pages: [ - [createElement('1', '#page-1')], - [createElement('1', '#page-2')], - ], + pages: [[createElement('1', '#page-1')], [createElement('1', '#page-2')]], page: 1, }); @@ -62,7 +63,12 @@ describe('element-cache highlight snapshots', () => { '#page-1', ]); - const lookup = elementCache.getElementById('conv-1', 101, snapshot.snapshotId, '1'); + const lookup = elementCache.getElementById( + 'conv-1', + 101, + snapshot.snapshotId, + '1', + ); expect(lookup?.element.selector).toBe('#page-1'); expect(lookup?.documentId).toBe('doc-1'); }); @@ -102,8 +108,8 @@ describe('element-cache highlight snapshots', () => { .selector, ).toBe('#first-page'); expect( - elementCache.getElementById('conv-2', 101, page2!.snapshotId, '1')?.element - .selector, + elementCache.getElementById('conv-2', 101, page2!.snapshotId, '1') + ?.element.selector, ).toBe('#second-page'); }); }); diff --git a/extension/src/commands/__tests__/single-highlight.test.ts b/extension/src/commands/__tests__/single-highlight.test.ts index 1268896..5e9aa11 100644 --- a/extension/src/commands/__tests__/single-highlight.test.ts +++ b/extension/src/commands/__tests__/single-highlight.test.ts @@ -3,9 +3,7 @@ import { describe, expect, test } from 'bun:test'; import type { InteractiveElement } from '../../types'; import { calculateConfirmationPreviewLayout } from '../single-highlight'; -function createElement( - bbox: InteractiveElement['bbox'], -): InteractiveElement { +function createElement(bbox: InteractiveElement['bbox']): InteractiveElement { return { id: 'abc123', type: 'clickable', diff --git a/extension/src/commands/__tests__/tab-manager-cleanup.test.ts b/extension/src/commands/__tests__/tab-manager-cleanup.test.ts index a7d706c..69ce1b9 100644 --- a/extension/src/commands/__tests__/tab-manager-cleanup.test.ts +++ b/extension/src/commands/__tests__/tab-manager-cleanup.test.ts @@ -123,9 +123,8 @@ describe('TabManager cleanup behavior', () => { const onTabClosed = mock(() => {}); manager.addTabClosedListener(onTabClosed); - const [tabRemovedListener] = (globalThis as any).__tabRemovedListeners as Array< - (tabId: number) => void - >; + const [tabRemovedListener] = (globalThis as any) + .__tabRemovedListeners as Array<(tabId: number) => void>; tabRemovedListener(11); expect(onTabClosed).toHaveBeenCalledWith('conv-last-tab', 11); @@ -133,8 +132,9 @@ describe('TabManager cleanup behavior', () => { }); test('cleanup sweep removes sessions whose tracked tabs are already gone', async () => { - const chromeTabsQuery = (globalThis as any).chrome.tabs - .query as ReturnType; + const chromeTabsQuery = (globalThis as any).chrome.tabs.query as ReturnType< + typeof mock + >; chromeTabsQuery.mockResolvedValue([{ id: 99 }]); const manager = new TabManager(); diff --git a/extension/src/commands/element-actions.ts b/extension/src/commands/element-actions.ts index 2871107..fae90a0 100644 --- a/extension/src/commands/element-actions.ts +++ b/extension/src/commands/element-actions.ts @@ -435,7 +435,9 @@ export async function performElementClick( // STEP 2: Build JavaScript to click with full event sequence // ============================================================ // Escape quotes in selector for safe injection - const escapedSelector = escapeForDoubleQuotedJavaScriptString(element.selector); + const escapedSelector = escapeForDoubleQuotedJavaScriptString( + element.selector, + ); const escapedDocumentId = escapeForDoubleQuotedJavaScriptString( cachedElement.documentId, ); @@ -726,7 +728,9 @@ export async function performElementHover( // ============================================================ // STEP 2: Build JavaScript to dispatch hover events // ============================================================ - const escapedSelector = escapeForDoubleQuotedJavaScriptString(element.selector); + const escapedSelector = escapeForDoubleQuotedJavaScriptString( + element.selector, + ); const escapedDocumentId = escapeForDoubleQuotedJavaScriptString( cachedElement.documentId, ); @@ -1001,7 +1005,8 @@ export async function performElementScroll( success: false, elementId, scrolled: false, - error: 'highlight_snapshot_id is required when scrolling a highlighted element.', + error: + 'highlight_snapshot_id is required when scrolling a highlighted element.', }; } @@ -1025,7 +1030,9 @@ export async function performElementScroll( console.log( `✅ [ElementScroll] Found element: selector="${element.selector}"`, ); - const escapedSelector = escapeForDoubleQuotedJavaScriptString(element.selector); + const escapedSelector = escapeForDoubleQuotedJavaScriptString( + element.selector, + ); const escapedDocumentId = escapeForDoubleQuotedJavaScriptString( cachedElement.documentId, ); @@ -1361,7 +1368,9 @@ export async function performElementSwipe( `✅ [ElementSwipe] Found element: selector="${element.selector}"`, ); - const escapedSelector = escapeForDoubleQuotedJavaScriptString(element.selector); + const escapedSelector = escapeForDoubleQuotedJavaScriptString( + element.selector, + ); const escapedDocumentId = escapeForDoubleQuotedJavaScriptString( cachedElement.documentId, ); @@ -2538,7 +2547,9 @@ export async function performKeyboardInput( // STEP 2: Build JavaScript to input text // ============================================================ // Escape quotes and backslashes in selector and text for safe injection - const escapedSelector = escapeForDoubleQuotedJavaScriptString(element.selector); + const escapedSelector = escapeForDoubleQuotedJavaScriptString( + element.selector, + ); const escapedDocumentId = escapeForDoubleQuotedJavaScriptString( cachedElement.documentId, ); @@ -2842,7 +2853,9 @@ export async function performElementSelect( // STEP 2: Build JavaScript to select option(s) // ============================================================ // Escape quotes and backslashes in selector for safe injection - const escapedSelector = escapeForDoubleQuotedJavaScriptString(element.selector); + const escapedSelector = escapeForDoubleQuotedJavaScriptString( + element.selector, + ); const escapedDocumentId = escapeForDoubleQuotedJavaScriptString( cachedElement.documentId, ); diff --git a/extension/src/commands/element-cache.ts b/extension/src/commands/element-cache.ts index f16787f..a257817 100644 --- a/extension/src/commands/element-cache.ts +++ b/extension/src/commands/element-cache.ts @@ -122,7 +122,9 @@ class ElementCacheImpl { for (const [snapshotKey, snapshot] of this.snapshotViews.entries()) { if (this.isExpired(snapshot.createdAt)) { this.snapshotViews.delete(snapshotKey); - console.log(`⏰ [ElementCache] Snapshot expired for key ${snapshotKey}`); + console.log( + `⏰ [ElementCache] Snapshot expired for key ${snapshotKey}`, + ); continue; } @@ -140,7 +142,10 @@ class ElementCacheImpl { continue; } - if (!activeInventoryKeys.has(inventoryKey) && this.isExpired(inventory.createdAt)) { + if ( + !activeInventoryKeys.has(inventoryKey) && + this.isExpired(inventory.createdAt) + ) { inventoryKeysToDelete.push(inventoryKey); } } @@ -195,8 +200,16 @@ class ElementCacheImpl { const inventoryId = this.nextInventoryId++; const snapshotId = this.nextSnapshotId++; const now = Date.now(); - const inventoryKey = this.buildInventoryKey(conversationId, tabId, inventoryId); - const snapshotKey = this.buildSnapshotKey(conversationId, tabId, snapshotId); + const inventoryKey = this.buildInventoryKey( + conversationId, + tabId, + inventoryId, + ); + const snapshotKey = this.buildSnapshotKey( + conversationId, + tabId, + snapshotId, + ); this.inventories.set(inventoryKey, { tabId, @@ -223,7 +236,11 @@ class ElementCacheImpl { this.pruneInventoriesForTab(conversationId, tabId); - const snapshotPage = this.getSnapshotPage(conversationId, tabId, snapshotId); + const snapshotPage = this.getSnapshotPage( + conversationId, + tabId, + snapshotId, + ); if (!snapshotPage) { throw new Error( `Failed to retrieve newly stored highlight snapshot ${snapshotId}`, @@ -244,13 +261,21 @@ class ElementCacheImpl { ): HighlightSnapshotPage | undefined { this.cleanupExpired(); - const baseSnapshot = this.getSnapshotView(conversationId, tabId, baseSnapshotId); + const baseSnapshot = this.getSnapshotView( + conversationId, + tabId, + baseSnapshotId, + ); if (!baseSnapshot) { return undefined; } const snapshotId = this.nextSnapshotId++; - const snapshotKey = this.buildSnapshotKey(conversationId, tabId, snapshotId); + const snapshotKey = this.buildSnapshotKey( + conversationId, + tabId, + snapshotId, + ); const now = Date.now(); this.snapshotViews.set(snapshotKey, { @@ -260,12 +285,20 @@ class ElementCacheImpl { page, }); - const inventory = this.getInventory(conversationId, tabId, baseSnapshot.inventoryId); + const inventory = this.getInventory( + conversationId, + tabId, + baseSnapshot.inventoryId, + ); if (inventory) { this.touchInventory(inventory); } - const snapshotPage = this.getSnapshotPage(conversationId, tabId, snapshotId); + const snapshotPage = this.getSnapshotPage( + conversationId, + tabId, + snapshotId, + ); if (snapshotPage) { console.log( `📄 [ElementCache] Forked snapshot ${snapshotId} from base ${baseSnapshotId} for conversation ${conversationId}, tab ${tabId}, page ${page}`, @@ -286,7 +319,11 @@ class ElementCacheImpl { return undefined; } - const inventory = this.getInventory(conversationId, tabId, snapshot.inventoryId); + const inventory = this.getInventory( + conversationId, + tabId, + snapshot.inventoryId, + ); if (!inventory) { return undefined; } @@ -318,7 +355,11 @@ class ElementCacheImpl { snapshotId: number, elementId: string, ): CachedElementLookup | undefined { - const snapshotPage = this.getSnapshotPage(conversationId, tabId, snapshotId); + const snapshotPage = this.getSnapshotPage( + conversationId, + tabId, + snapshotId, + ); if (!snapshotPage) { return undefined; } @@ -352,7 +393,11 @@ class ElementCacheImpl { return undefined; } - const snapshotKey = this.buildSnapshotKey(conversationId, tabId, snapshotId); + const snapshotKey = this.buildSnapshotKey( + conversationId, + tabId, + snapshotId, + ); const snapshot = this.snapshotViews.get(snapshotKey); if (!snapshot) { return undefined; @@ -360,7 +405,9 @@ class ElementCacheImpl { if (snapshot.tabId !== tabId || this.isExpired(snapshot.createdAt)) { this.snapshotViews.delete(snapshotKey); - console.log(`⏰ [ElementCache] Snapshot expired or mismatched for key ${snapshotKey}`); + console.log( + `⏰ [ElementCache] Snapshot expired or mismatched for key ${snapshotKey}`, + ); return undefined; } @@ -376,7 +423,11 @@ class ElementCacheImpl { return undefined; } - const inventoryKey = this.buildInventoryKey(conversationId, tabId, inventoryId); + const inventoryKey = this.buildInventoryKey( + conversationId, + tabId, + inventoryId, + ); const inventory = this.inventories.get(inventoryKey); if (!inventory) { return undefined; @@ -400,11 +451,11 @@ class ElementCacheImpl { ? `${conversationId}:${tabId}:snapshot:` : `${conversationId}:`; - const inventoryKeysToDelete = Array.from(this.inventories.keys()).filter((key) => - key.startsWith(inventoryPrefix), + const inventoryKeysToDelete = Array.from(this.inventories.keys()).filter( + (key) => key.startsWith(inventoryPrefix), ); - const snapshotKeysToDelete = Array.from(this.snapshotViews.keys()).filter((key) => - key.startsWith(snapshotPrefix), + const snapshotKeysToDelete = Array.from(this.snapshotViews.keys()).filter( + (key) => key.startsWith(snapshotPrefix), ); for (const key of inventoryKeysToDelete) { diff --git a/extension/src/commands/screenshot.ts b/extension/src/commands/screenshot.ts index e66d066..fa3005e 100644 --- a/extension/src/commands/screenshot.ts +++ b/extension/src/commands/screenshot.ts @@ -702,12 +702,16 @@ async function inspectImageDimensions(dataUrl: string): Promise<{ height: number; }> { if (!dataUrl.startsWith('data:')) { - throw new Error('[Screenshot] Invalid data URL while inspecting dimensions'); + throw new Error( + '[Screenshot] Invalid data URL while inspecting dimensions', + ); } const commaIndex = dataUrl.indexOf(','); if (commaIndex === -1) { - throw new Error('[Screenshot] Screenshot data URL missing payload separator'); + throw new Error( + '[Screenshot] Screenshot data URL missing payload separator', + ); } const header = dataUrl.slice(0, commaIndex); diff --git a/extension/src/utils/collision-detection.ts b/extension/src/utils/collision-detection.ts index 609fd41..2af05ae 100644 --- a/extension/src/utils/collision-detection.ts +++ b/extension/src/utils/collision-detection.ts @@ -287,12 +287,10 @@ function chooseNextCandidate( ): (PlacementEvaluation & { candidate: RemainingCandidate }) | null { const currentLabelText = String(selected.length + 1); let minFeasiblePositions = Number.POSITIVE_INFINITY; - let constrainedCandidate: - | { - candidate: RemainingCandidate; - feasiblePositions: LabelPosition[]; - } - | null = null; + let constrainedCandidate: { + candidate: RemainingCandidate; + feasiblePositions: LabelPosition[]; + } | null = null; for (const candidate of remaining) { const feasiblePositions = getFeasiblePositions( @@ -342,7 +340,8 @@ function chooseLeastBlockingPlacement( ): PlacementEvaluation { const currentLabelText = String(selected.length + 1); const futureCandidates = remaining.filter( - (remainingCandidate) => remainingCandidate.sourceIndex !== candidate.sourceIndex, + (remainingCandidate) => + remainingCandidate.sourceIndex !== candidate.sourceIndex, ); let bestPlacement: PlacementEvaluation | null = null; @@ -392,11 +391,13 @@ function chooseLeastBlockingPlacement( } } - return bestPlacement ?? { - position: POSITION_PRIORITY[0], - blockedCandidateCount: Number.POSITIVE_INFINITY, - totalFutureOptions: Number.NEGATIVE_INFINITY, - }; + return ( + bestPlacement ?? { + position: POSITION_PRIORITY[0], + blockedCandidateCount: Number.POSITIVE_INFINITY, + totalFutureOptions: Number.NEGATIVE_INFINITY, + } + ); } function getFeasiblePositions( diff --git a/server/agent/api.py b/server/agent/api.py index 33c7d92..681fb1b 100644 --- a/server/agent/api.py +++ b/server/agent/api.py @@ -437,6 +437,7 @@ def initialize_agent(): from .tools.highlight_tool import HighlightTool from .tools.element_interaction_tool import ElementInteractionTool from .tools.dialog_tool import DialogTool + logger.info( "4 focused OpenBrowser tools registered: tab, highlight, element_interaction, dialog" ) diff --git a/server/agent/tools/base.py b/server/agent/tools/base.py index 62615ba..2d945af 100644 --- a/server/agent/tools/base.py +++ b/server/agent/tools/base.py @@ -113,7 +113,9 @@ def _pending_confirmation_llm_content( action_type = str(pending.get("action_type", "unknown")) element_id = str(pending.get("element_id", "unknown")) highlight_snapshot_id = str( - pending.get("highlight_snapshot_id", self.highlight_snapshot_id or "unknown") + pending.get( + "highlight_snapshot_id", self.highlight_snapshot_id or "unknown" + ) ) confirm_cmd = f'{{"action": "confirm_{action_type}"}}' diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py index 65f1b8e..4c9fa87 100644 --- a/server/agent/tools/browser_executor.py +++ b/server/agent/tools/browser_executor.py @@ -101,9 +101,9 @@ def __init__(self): # Pending confirmations per conversation for 2PC actions. self.pending_confirmations: Dict[str, Dict[str, Any]] = {} # Recently confirmed element targets keyed by action_type then exact element HTML. - self.confirmed_action_html_lru: Dict[ - str, Dict[str, OrderedDict[str, None]] - ] = {} + self.confirmed_action_html_lru: Dict[str, Dict[str, OrderedDict[str, None]]] = ( + {} + ) def __call__( self, action: OpenBrowserAction, conversation @@ -394,9 +394,7 @@ def _execute_element_interaction_action( conversation_id=self.conversation_id, tab_id=action.tab_id, ) - result_dict = self._execute_element_command( - command, "hover element" - ) + result_dict = self._execute_element_command(command, "hover element") return self._build_observation_from_result( result_dict, f"Hovered element: {action.element_id}", @@ -418,9 +416,7 @@ def _execute_element_interaction_action( conversation_id=self.conversation_id, tab_id=action.tab_id, ) - result_dict = self._execute_element_command( - command, "scroll element" - ) + result_dict = self._execute_element_command(command, "scroll element") return self._build_observation_from_result( result_dict, f"Scrolled element: {action.element_id}", @@ -473,7 +469,9 @@ def _execute_element_interaction_action( if not action.text: raise ValueError("keyboard_input requires text parameter") if action.highlight_snapshot_id is None: - raise ValueError("keyboard_input requires highlight_snapshot_id parameter") + raise ValueError( + "keyboard_input requires highlight_snapshot_id parameter" + ) full_html, screenshot = self._get_element_full_html( action.element_id, action.highlight_snapshot_id ) @@ -529,9 +527,7 @@ def _execute_element_interaction_action( conversation_id=self.conversation_id, tab_id=action.tab_id, ) - result_dict = self._execute_element_command( - command, "select option" - ) + result_dict = self._execute_element_command(command, "select option") return self._build_observation_from_result( result_dict, f"Selected option in element: {action.element_id}", @@ -685,21 +681,22 @@ def _get_pending_confirmation(self) -> Optional[Dict[str, Any]]: """Get pending confirmation for current conversation""" return self.pending_confirmations.get(self.conversation_id) - def _normalize_confirmed_action_html(self, full_html: Optional[str]) -> Optional[str]: + def _normalize_confirmed_action_html( + self, full_html: Optional[str] + ) -> Optional[str]: """Normalize cached element HTML used for repeat-click shortcut matching.""" if not isinstance(full_html, str): return None normalized = full_html.strip() - if ( - not normalized - or normalized == ELEMENT_HTML_CACHE_MISS_PLACEHOLDER - ): + if not normalized or normalized == ELEMENT_HTML_CACHE_MISS_PLACEHOLDER: return None return normalized - def _get_confirmed_action_html_lru(self, action_type: str) -> OrderedDict[str, None]: + def _get_confirmed_action_html_lru( + self, action_type: str + ) -> OrderedDict[str, None]: """Get or create the confirmed-action LRU cache for current conversation.""" conversation_lru = self.confirmed_action_html_lru.setdefault( self.conversation_id, {} diff --git a/server/agent/tools/help_tool.py b/server/agent/tools/help_tool.py index 1a87881..a9b451d 100644 --- a/server/agent/tools/help_tool.py +++ b/server/agent/tools/help_tool.py @@ -57,9 +57,7 @@ def to_llm_content(self) -> Sequence[TextContent]: return [TextContent(text="Pending user's decision")] -class PleaseHelpMeExecutor( - ToolExecutor[PleaseHelpMeAction, PleaseHelpMeObservation] -): +class PleaseHelpMeExecutor(ToolExecutor[PleaseHelpMeAction, PleaseHelpMeObservation]): """Executor that pauses the current turn and surfaces a user-help request.""" def __call__( @@ -76,9 +74,7 @@ def __call__( ) -class PleaseHelpMeTool( - ToolDefinition[PleaseHelpMeAction, PleaseHelpMeObservation] -): +class PleaseHelpMeTool(ToolDefinition[PleaseHelpMeAction, PleaseHelpMeObservation]): """Tool for requesting human intervention.""" name = PLEASE_HELP_ME_TOOL_NAME diff --git a/server/core/session_manager.py b/server/core/session_manager.py index c02d146..1e0f3bd 100644 --- a/server/core/session_manager.py +++ b/server/core/session_manager.py @@ -122,7 +122,8 @@ def _init_database(self): cursor = conn.cursor() # Create sessions table - cursor.execute(""" + cursor.execute( + """ CREATE TABLE IF NOT EXISTS sessions ( conversation_id TEXT PRIMARY KEY, status TEXT NOT NULL DEFAULT 'idle', @@ -135,13 +136,15 @@ def _init_database(self): tags TEXT DEFAULT '[]', metadata TEXT DEFAULT '{}' ) - """) + """ + ) # Run migrations to add missing columns self._migrate_database(cursor) # Create user_messages table - cursor.execute(""" + cursor.execute( + """ CREATE TABLE IF NOT EXISTS user_messages ( id INTEGER PRIMARY KEY AUTOINCREMENT, conversation_id TEXT NOT NULL, @@ -151,10 +154,12 @@ def _init_database(self): FOREIGN KEY (conversation_id) REFERENCES sessions(conversation_id), UNIQUE(conversation_id, message_index) ) - """) + """ + ) # Create session_events table for SSE event history - cursor.execute(""" + cursor.execute( + """ CREATE TABLE IF NOT EXISTS session_events ( id INTEGER PRIMARY KEY AUTOINCREMENT, conversation_id TEXT NOT NULL, @@ -164,23 +169,30 @@ def _init_database(self): created_at TEXT NOT NULL, FOREIGN KEY (conversation_id) REFERENCES sessions(conversation_id) ) - """) + """ + ) # Create indexes for faster queries - cursor.execute(""" + cursor.execute( + """ CREATE INDEX IF NOT EXISTS idx_user_messages_conversation ON user_messages(conversation_id) - """) + """ + ) - cursor.execute(""" + cursor.execute( + """ CREATE INDEX IF NOT EXISTS idx_session_events_conversation ON session_events(conversation_id) - """) + """ + ) - cursor.execute(""" + cursor.execute( + """ CREATE INDEX IF NOT EXISTS idx_session_events_index ON session_events(conversation_id, event_index) - """) + """ + ) conn.commit() conn.close() @@ -198,9 +210,11 @@ def _migrate_database(self, cursor): if "first_user_message" not in columns: logger.info("Adding missing column: first_user_message") - cursor.execute(""" + cursor.execute( + """ ALTER TABLE sessions ADD COLUMN first_user_message TEXT - """) + """ + ) logger.info("Migration completed: first_user_message column added") except Exception as e: diff --git a/server/tests/integration/test_element_operations.py b/server/tests/integration/test_element_operations.py index 6b62aff..9b20d89 100644 --- a/server/tests/integration/test_element_operations.py +++ b/server/tests/integration/test_element_operations.py @@ -108,8 +108,8 @@ def test_returns_page_local_numeric_ids( assert re.match( r"^\d+$", element_id ), f"Element ID should be numeric: {element_id}" - assert ( - element_id == str(index) + assert element_id == str( + index ), f"Element ID should follow response order: expected {index}, got {element_id}" diff --git a/server/tests/unit/test_agent_browser_executor.py b/server/tests/unit/test_agent_browser_executor.py index 2ea9c7d..eaa07e4 100644 --- a/server/tests/unit/test_agent_browser_executor.py +++ b/server/tests/unit/test_agent_browser_executor.py @@ -4,7 +4,11 @@ from server.agent.tools.browser_executor import BrowserExecutor from server.agent.tools.element_interaction_tool import ElementInteractionAction -from server.models.commands import ClickElementCommand, HoverElementCommand, SwipeElementCommand +from server.models.commands import ( + ClickElementCommand, + HoverElementCommand, + SwipeElementCommand, +) class _FakeResponse: @@ -103,7 +107,10 @@ def test_keyboard_input_sets_pending_confirmation(monkeypatch) -> None: ) assert observation.success is True - assert observation.message == "Keyboard input action pending confirmation for element: inp123" + assert ( + observation.message + == "Keyboard input action pending confirmation for element: inp123" + ) assert observation.pending_confirmation is not None assert observation.pending_confirmation["action_type"] == "keyboard_input" assert observation.pending_confirmation["element_id"] == "inp123" @@ -125,7 +132,10 @@ def test_confirm_click_uses_pending_confirmation_state(monkeypatch) -> None: def fake_execute(command): captured["command"] = command - return {"success": True, "data": {"screenshot": "data:image/png;base64,clicked"}} + return { + "success": True, + "data": {"screenshot": "data:image/png;base64,clicked"}, + } monkeypatch.setattr(executor, "_execute_command_sync", fake_execute) @@ -151,7 +161,7 @@ def test_repeat_click_with_confirmed_html_executes_without_pending_confirmation( executor.conversation_id = "conv-repeat-click" executor._remember_confirmed_action_html( "click", - 'Why the Fed Must Act Decisively' + 'Why the Fed Must Act Decisively', ) monkeypatch.setattr( @@ -197,9 +207,7 @@ def fake_execute(command): def test_repeat_click_does_not_shortcut_cache_miss_placeholder(monkeypatch) -> None: executor = BrowserExecutor() executor.conversation_id = "conv-repeat-placeholder" - executor._remember_confirmed_action_html( - "click", "" - ) + executor._remember_confirmed_action_html("click", "") monkeypatch.setattr( executor, @@ -227,7 +235,9 @@ def test_repeat_click_does_not_shortcut_cache_miss_placeholder(monkeypatch) -> N assert observation.success is True assert observation.message == "Click action pending confirmation for element: 15" assert observation.pending_confirmation is not None - assert observation.pending_confirmation["full_html"] == "" + assert ( + observation.pending_confirmation["full_html"] == "" + ) def test_confirmed_action_html_lru_keeps_only_most_recent_ten_entries() -> None: @@ -235,7 +245,9 @@ def test_confirmed_action_html_lru_keeps_only_most_recent_ten_entries() -> None: executor.conversation_id = "conv-repeat-lru" for i in range(11): - executor._remember_confirmed_action_html("click", f"") + executor._remember_confirmed_action_html( + "click", f"" + ) lru = executor._get_confirmed_action_html_lru("click") @@ -371,10 +383,7 @@ def fake_execute(command): ) assert observation.success is False - assert ( - observation.error - == "Failed to input text: Input element is detached" - ) + assert observation.error == "Failed to input text: Input element is detached" assert captured["command"].element_id == "inp123" assert captured["command"].highlight_snapshot_id == 17 assert captured["command"].tab_id == 321 diff --git a/server/tests/unit/test_base_classes.py b/server/tests/unit/test_base_classes.py index 32598e5..cd67bd2 100644 --- a/server/tests/unit/test_base_classes.py +++ b/server/tests/unit/test_base_classes.py @@ -181,7 +181,9 @@ def test_pending_confirmation_includes_follow_up_command(self) -> None: assert "**Element ID**: a1b2c3" in text assert "**Action Type**: click" in text - def test_pending_keyboard_confirmation_uses_matching_follow_up_command(self) -> None: + def test_pending_keyboard_confirmation_uses_matching_follow_up_command( + self, + ) -> None: observation = OpenBrowserObservation( success=True, pending_confirmation={ @@ -199,7 +201,9 @@ def test_pending_keyboard_confirmation_uses_matching_follow_up_command(self) -> assert "**Element ID**: inp789" in text assert "**Action Type**: keyboard_input" in text - def test_pending_confirmation_with_screenshot_is_image_first_and_text_minimal(self) -> None: + def test_pending_confirmation_with_screenshot_is_image_first_and_text_minimal( + self, + ) -> None: observation = OpenBrowserObservation( success=True, screenshot_data_url="data:image/png;base64,confirm123", diff --git a/server/tests/unit/test_command_models.py b/server/tests/unit/test_command_models.py index 03a321e..140e3a3 100644 --- a/server/tests/unit/test_command_models.py +++ b/server/tests/unit/test_command_models.py @@ -64,15 +64,21 @@ def test_scroll_rejects_out_of_range_amounts(self, amount: float) -> None: def test_keyboard_input_allows_empty_text_for_clear_style_interactions( self, ) -> None: - command = KeyboardInputCommand(element_id="field123", text="") + command = KeyboardInputCommand( + element_id="field123", + highlight_snapshot_id=101, + text="", + ) assert command.text == "" + assert command.highlight_snapshot_id == 101 def test_swipe_defaults_match_carousel_workflow(self) -> None: - command = SwipeElementCommand(element_id="carousel1") + command = SwipeElementCommand(element_id="carousel1", highlight_snapshot_id=202) assert command.direction == "next" assert command.swipe_count == 1 + assert command.highlight_snapshot_id == 202 def test_swipe_direction_description_uses_content_semantics(self) -> None: description = SwipeElementCommand.model_fields["direction"].description @@ -84,7 +90,11 @@ def test_swipe_direction_description_uses_content_semantics(self) -> None: @pytest.mark.parametrize("count", [1, 5]) def test_swipe_accepts_documented_count_boundaries(self, count: int) -> None: - command = SwipeElementCommand(element_id="carousel1", swipe_count=count) + command = SwipeElementCommand( + element_id="carousel1", + highlight_snapshot_id=202, + swipe_count=count, + ) assert command.swipe_count == count @@ -107,6 +117,7 @@ class TestParseCommandContracts: { "type": "click_element", "element_id": "abc123", + "highlight_snapshot_id": 11, "conversation_id": "conv-1", "browser_id": "browser-1", }, @@ -116,6 +127,7 @@ class TestParseCommandContracts: { "type": "keyboard_input", "element_id": "field123", + "highlight_snapshot_id": 12, "text": "hello", "tab_id": 7, "conversation_id": "conv-2", @@ -136,6 +148,7 @@ class TestParseCommandContracts: { "type": "swipe_element", "element_id": "carousel1", + "highlight_snapshot_id": 13, "direction": "prev", "swipe_count": 2, "conversation_id": "conv-4", @@ -153,6 +166,8 @@ def test_parse_command_preserves_routing_metadata( assert isinstance(command, expected_type) assert command.conversation_id == payload["conversation_id"] assert command.browser_id == payload["browser_id"] + if "highlight_snapshot_id" in payload: + assert command.highlight_snapshot_id == payload["highlight_snapshot_id"] def test_parse_command_rejects_missing_type(self) -> None: with pytest.raises(ValueError, match="must have 'type' field"): diff --git a/server/tests/unit/test_eval_client.py b/server/tests/unit/test_eval_client.py index c597d90..dcca19a 100644 --- a/server/tests/unit/test_eval_client.py +++ b/server/tests/unit/test_eval_client.py @@ -252,9 +252,7 @@ def test_run_test_cleans_managed_tabs_before_delete(tmp_path) -> None: evaluator.openbrowser.create_conversation.return_value = "conv-123" evaluator.openbrowser.send_message.return_value = MessageRunResult(events=[]) evaluator.openbrowser.cleanup_managed_tabs.side_effect = ( - lambda conversation_id: teardown_calls.append( - f"cleanup:{conversation_id}" - ) + lambda conversation_id: teardown_calls.append(f"cleanup:{conversation_id}") or False ) evaluator.openbrowser.delete_conversation.side_effect = ( diff --git a/server/tests/unit/test_prompt_contracts.py b/server/tests/unit/test_prompt_contracts.py index e26a17d..1a8a825 100644 --- a/server/tests/unit/test_prompt_contracts.py +++ b/server/tests/unit/test_prompt_contracts.py @@ -72,37 +72,61 @@ def test_highlight_prompt_keeps_icon_targets_on_any_pagination(self) -> None: assert "icon-only controls" in description assert "Stay on the same `element_type` across pages" in description assert "actual button may simply be on the next page" in description - assert "Keep generic controls, buttons, links, dense toolbars, and icon-only targets inside `any`" in description - assert '`clickable`' not in description + assert ( + "Keep generic controls, buttons, links, dense toolbars, and icon-only targets inside `any`" + in description + ) + assert "`clickable`" not in description - def test_highlight_prompt_requires_exact_text_keywords_and_pagination_before_guessing(self) -> None: + def test_highlight_prompt_requires_exact_text_keywords_and_pagination_before_guessing( + self, + ) -> None: description = get_highlight_tool_description() - assert "Treat pages as reliable collision-free slices of the same candidate set" in description + assert ( + "Treat pages as reliable collision-free slices of the same candidate set" + in description + ) assert "Do not jump from a first-page miss to `keywords`" in description - assert "Use keywords only for exact literal text characters you can already see on the target itself in the current screenshot" in description + assert ( + "Use keywords only for exact literal text characters you can already see on the target itself in the current screenshot" + in description + ) assert '`{"keywords": ["52"]}`' in description assert '`["star"]`, `["favorite"]`, or `["bookmark"]`' in description assert "DO NOT use synonym bundles like" in description assert "Examples of broad search" not in description assert "Phase 2: Broad Search" not in description - def test_highlight_prompt_requires_rehighlight_after_significant_page_change(self) -> None: + def test_highlight_prompt_requires_rehighlight_after_significant_page_change( + self, + ) -> None: description = get_highlight_tool_description() assert "After any significant page-state change" in description - assert 'call `highlight` with `element_type: "any"` again before choosing the next element' in description - assert "Do not jump straight to `keywords` or another narrower type on that changed page" in description + assert ( + 'call `highlight` with `element_type: "any"` again before choosing the next element' + in description + ) + assert ( + "Do not jump straight to `keywords` or another narrower type on that changed page" + in description + ) def test_highlight_prompt_omits_clickable_mode_from_agent_guidance(self) -> None: description = get_highlight_tool_description() - assert '`clickable`' not in description + assert "`clickable`" not in description - def test_highlight_prompt_requires_click_before_keyboard_input_for_inputable_targets(self) -> None: + def test_highlight_prompt_requires_click_before_keyboard_input_for_inputable_targets( + self, + ) -> None: description = get_highlight_tool_description() - assert "always `click` it first and complete that confirmation before `keyboard_input`" in description + assert ( + "always `click` it first and complete that confirmation before `keyboard_input`" + in description + ) def test_small_model_highlight_prompt_omits_keywords_guidance(self) -> None: with patch.object( @@ -139,11 +163,19 @@ def test_tab_prompt_points_agents_to_tab_view_for_clean_screenshots(self) -> Non assert "tab view" in description assert "clean screenshot" in description.lower() - def test_element_interaction_prompt_requires_click_before_keyboard_input(self) -> None: + def test_element_interaction_prompt_requires_click_before_keyboard_input( + self, + ) -> None: description = get_element_interaction_tool_description() - assert "Always `click` the target first and complete that confirmation before `keyboard_input`." in description - assert "only after you already clicked the same input target and completed that click confirmation" in description + assert ( + "Always `click` the target first and complete that confirmation before `keyboard_input`." + in description + ) + assert ( + "only after you already clicked the same input target and completed that click confirmation" + in description + ) def test_element_interaction_prompt_explains_swipe_semantics(self) -> None: description = get_element_interaction_tool_description() @@ -157,8 +189,14 @@ def test_element_interaction_confirm_examples_use_pending_state_only(self) -> No assert '{ "action": "confirm_click" }' in description assert '{ "action": "confirm_keyboard_input" }' in description - assert '{ "action": "confirm_click", "highlight_snapshot_id": 17, "element_id": "3", "tab_id": 123 }' not in description - assert '{ "action": "confirm_keyboard_input", "highlight_snapshot_id": 17, "element_id": "1", "tab_id": 123 }' not in description + assert ( + '{ "action": "confirm_click", "highlight_snapshot_id": 17, "element_id": "3", "tab_id": 123 }' + not in description + ) + assert ( + '{ "action": "confirm_keyboard_input", "highlight_snapshot_id": 17, "element_id": "1", "tab_id": 123 }' + not in description + ) def test_element_interaction_action_schema_explains_swipe_semantics(self) -> None: description = ElementInteractionAction.model_fields["direction"].description diff --git a/server/tests/unit/test_screenshot_behavior.py b/server/tests/unit/test_screenshot_behavior.py index 001011a..a575bd2 100644 --- a/server/tests/unit/test_screenshot_behavior.py +++ b/server/tests/unit/test_screenshot_behavior.py @@ -149,7 +149,9 @@ async def mock_send_cmd(cmd): async def test_click_element_returns_screenshot(self, processor): """click_element command should return screenshot (from extension).""" command = ClickElementCommand( - element_id="1", conversation_id="test-conv-5" + element_id="1", + highlight_snapshot_id=55, + conversation_id="test-conv-5", ) mock_response = CommandResponse( diff --git a/server/tests/unit/test_state.py b/server/tests/unit/test_state.py index a49882f..9ee0400 100644 --- a/server/tests/unit/test_state.py +++ b/server/tests/unit/test_state.py @@ -25,11 +25,13 @@ def test_creation_with_required_fields(self) -> None: """Test creating PendingConfirmation with only required fields.""" confirmation = PendingConfirmation( element_id="abc123", + highlight_snapshot_id=101, action_type="click", full_html="", ) assert confirmation.element_id == "abc123" + assert confirmation.highlight_snapshot_id == 101 assert confirmation.action_type == "click" assert confirmation.full_html == "" assert confirmation.extra_data == {} @@ -39,6 +41,7 @@ def test_creation_with_all_fields(self) -> None: """Test creating PendingConfirmation with all fields.""" confirmation = PendingConfirmation( element_id="xyz789", + highlight_snapshot_id=202, action_type="keyboard_input", full_html='', extra_data={"text": "hello world"}, @@ -46,6 +49,7 @@ def test_creation_with_all_fields(self) -> None: ) assert confirmation.element_id == "xyz789" + assert confirmation.highlight_snapshot_id == 202 assert confirmation.action_type == "keyboard_input" assert confirmation.full_html == '' assert confirmation.extra_data == {"text": "hello world"} @@ -53,9 +57,17 @@ def test_creation_with_all_fields(self) -> None: def test_extra_data_default_is_mutable(self) -> None: """Test that extra_data defaults to a new empty dict each time.""" - c1 = PendingConfirmation(element_id="a", action_type="click", full_html="") + c1 = PendingConfirmation( + element_id="a", + highlight_snapshot_id=1, + action_type="click", + full_html="", + ) c2 = PendingConfirmation( - element_id="b", action_type="keyboard_input", full_html="" + element_id="b", + highlight_snapshot_id=2, + action_type="keyboard_input", + full_html="", ) c1.extra_data["key"] = "value" @@ -76,6 +88,7 @@ def test_set_pending(self) -> None: state.set_pending( conversation_id="conv-123", element_id="elem-456", + highlight_snapshot_id=101, action_type="click", full_html="", ) @@ -83,6 +96,7 @@ def test_set_pending(self) -> None: assert "conv-123" in state.pending_confirmations pending = state.pending_confirmations["conv-123"] assert pending.element_id == "elem-456" + assert pending.highlight_snapshot_id == 101 assert pending.action_type == "click" assert pending.full_html == "" @@ -92,6 +106,7 @@ def test_set_pending_with_optional_fields(self) -> None: state.set_pending( conversation_id="conv-789", element_id="elem-abc", + highlight_snapshot_id=202, action_type="keyboard_input", full_html="", extra_data={"text": "hello"}, @@ -108,6 +123,7 @@ def test_get_pending_existing(self) -> None: state.set_pending( conversation_id="conv-1", element_id="elem-1", + highlight_snapshot_id=303, action_type="keyboard_input", full_html="", ) @@ -115,6 +131,7 @@ def test_get_pending_existing(self) -> None: pending = state.get_pending("conv-1") assert pending is not None assert pending.element_id == "elem-1" + assert pending.highlight_snapshot_id == 303 assert pending.action_type == "keyboard_input" def test_get_pending_nonexistent(self) -> None: @@ -129,6 +146,7 @@ def test_clear_pending_existing(self) -> None: state.set_pending( conversation_id="conv-to-clear", element_id="elem-1", + highlight_snapshot_id=404, action_type="click", full_html="", ) @@ -160,6 +179,7 @@ def test_multiple_conversations_isolation(self) -> None: state.set_pending( conversation_id="conv-2", element_id="elem-2", + highlight_snapshot_id=502, action_type="keyboard_input", full_html="B", ) @@ -167,7 +187,9 @@ def test_multiple_conversations_isolation(self) -> None: # Verify isolation assert len(state.pending_confirmations) == 2 assert state.get_pending("conv-1").element_id == "elem-1" + assert state.get_pending("conv-1").highlight_snapshot_id == 501 assert state.get_pending("conv-2").element_id == "elem-2" + assert state.get_pending("conv-2").highlight_snapshot_id == 502 # Clear one doesn't affect the other state.clear_pending("conv-1") @@ -181,6 +203,7 @@ def test_overwrite_pending(self) -> None: state.set_pending( conversation_id="conv-1", element_id="elem-1", + highlight_snapshot_id=601, action_type="click", full_html="", ) @@ -188,12 +211,14 @@ def test_overwrite_pending(self) -> None: state.set_pending( conversation_id="conv-1", element_id="elem-2", + highlight_snapshot_id=602, action_type="keyboard_input", full_html="Second", ) pending = state.get_pending("conv-1") assert pending.element_id == "elem-2" + assert pending.highlight_snapshot_id == 602 assert pending.action_type == "keyboard_input" assert pending.full_html == "Second" @@ -206,10 +231,12 @@ def test_all_action_types(self) -> None: state.set_pending( conversation_id=f"conv-{i}", element_id=f"elem-{i}", + highlight_snapshot_id=700 + i, action_type=action_type, full_html=f"<{action_type}>", ) for i, action_type in enumerate(action_types): pending = state.get_pending(f"conv-{i}") + assert pending.highlight_snapshot_id == 700 + i assert pending.action_type == action_type diff --git a/server/tests/unit/test_tool_prompt_profiles.py b/server/tests/unit/test_tool_prompt_profiles.py index ba90e0e..8d562ab 100644 --- a/server/tests/unit/test_tool_prompt_profiles.py +++ b/server/tests/unit/test_tool_prompt_profiles.py @@ -55,22 +55,37 @@ def test_small_model_highlight_prompt_stays_compact_and_actionable() -> None: description = get_highlight_tool_description() assert "## Core Rules" in description - assert 'default first pass for each new page state' in description - assert 'extension-derived page insight across element types' in description + assert "default first pass for each new page state" in description + assert "extension-derived page insight across element types" in description assert "Treat highlight pagination as reliable" in description assert "After any significant page-state change" in description - assert "Do not jump away from `element_type: \"any\"` on a newly changed page" in description + assert ( + 'Do not jump away from `element_type: "any"` on a newly changed page' + in description + ) assert "icon-only toolbar or header control" in description - assert "continue `any` pagination and inspect the next pages instead of switching modes" in description + assert ( + "continue `any` pagination and inspect the next pages instead of switching modes" + in description + ) assert "you may narrow to `inputable`" in description - assert "always `click` it first and complete that confirmation before `keyboard_input`" in description + assert ( + "always `click` it first and complete that confirmation before `keyboard_input`" + in description + ) assert "After typing, continue discovery with `any`" in description - assert "When a search results page loads, call `highlight` with `element_type: \"any\"`" in description + assert ( + 'When a search results page loads, call `highlight` with `element_type: "any"`' + in description + ) assert "use `tab back`" in description - assert "Do not use guessed labels such as \"settings\", \"gear\", \"bell\", \"next\", \"prev\", or \"close\"" in description + assert ( + 'Do not use guessed labels such as "settings", "gear", "bell", "next", "prev", or "close"' + in description + ) assert "icon button next to a count or badge" in description assert "`keywords`" not in description - assert '`clickable`' not in description + assert "`clickable`" not in description assert "Phase 1: Precise Search" not in description assert "Collision-Aware Pagination" not in description @@ -105,27 +120,45 @@ def test_large_model_highlight_prompt_keeps_detailed_pagination_guidance_without description = get_highlight_tool_description() assert "Collision-Aware Pagination" in description - assert '## Any-First Discovery Rule' in description - assert 'default first pass for each new page state' in description - assert 'extension-derived page insight across element types' in description - assert "After any significant page-state change, restart with `highlight` on `element_type: \"any\"`" in description - assert "Do not jump away from `element_type: \"any\"` on that changed page" in description + assert "## Any-First Discovery Rule" in description + assert "default first pass for each new page state" in description + assert "extension-derived page insight across element types" in description + assert ( + 'After any significant page-state change, restart with `highlight` on `element_type: "any"`' + in description + ) + assert ( + 'Do not jump away from `element_type: "any"` on that changed page' + in description + ) assert "Exact-Text Search Only" in description assert "icon-only controls" in description assert 'Prefer `element_type: "any"` as the default first pass' in description - assert "always `click` it first and complete that confirmation before `keyboard_input`" in description - assert "Treat pages as reliable collision-free slices of the same candidate set" in description + assert ( + "always `click` it first and complete that confirmation before `keyboard_input`" + in description + ) + assert ( + "Treat pages as reliable collision-free slices of the same candidate set" + in description + ) assert "Do not jump from a first-page miss to `keywords`" in description - assert "prefer more `any` pages over broad keyword search or a narrower generic-control mode" in description + assert ( + "prefer more `any` pages over broad keyword search or a narrower generic-control mode" + in description + ) assert ( 'DO NOT search for unlabeled toolbar icons or ambiguous controls with guessed words like "settings", "gear", "bell", "chat", "next", "prev", or "close"' in description ) - assert "Use keywords only for exact literal text characters you can already see on the target itself in the current screenshot" in description + assert ( + "Use keywords only for exact literal text characters you can already see on the target itself in the current screenshot" + in description + ) assert '`{"keywords": ["52"]}`' in description assert '`["star"]`, `["favorite"]`, or `["bookmark"]`' in description assert "the actual button may simply be on the next page" in description - assert '`clickable`' not in description + assert "`clickable`" not in description assert "Phase 2: Broad Search" not in description assert "Examples of broad search" not in description @@ -142,7 +175,10 @@ def test_small_model_element_interaction_requires_click_before_keyboard_input() ): description = get_element_interaction_tool_description() - assert "If the target is `inputable`, always `click` it first and complete that confirmation before `keyboard_input`." in description + assert ( + "If the target is `inputable`, always `click` it first and complete that confirmation before `keyboard_input`." + in description + ) assert "always `click` first, then use `keyboard_input`" in description assert '`direction: "next"` means show the next picture' in description assert '`direction: "prev"` means show the previous picture' in description @@ -161,8 +197,14 @@ def test_large_model_element_interaction_requires_click_before_keyboard_input() ): description = get_element_interaction_tool_description() - assert "Always `click` the target first and complete that confirmation before `keyboard_input`." in description - assert "only after you already clicked the same input target and completed that click confirmation" in description + assert ( + "Always `click` the target first and complete that confirmation before `keyboard_input`." + in description + ) + assert ( + "only after you already clicked the same input target and completed that click confirmation" + in description + ) assert '`direction: "next"` means show the next picture' in description assert '`direction: "prev"` means show the previous picture' in description assert "not finger or gesture directions" in description