From 20b1002925feac12891988333e84b4c88684a7b5 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Thu, 26 Mar 2026 12:06:03 +0800
Subject: [PATCH 1/7] Simplify confirm actions and improve stale snapshot
 errors

---
 extension/src/background/index.ts             |  9 ++-
 extension/src/commands/element-actions.ts     | 16 ++++-
 .../big_model/element_interaction_tool.j2     |  8 +--
 .../small_model/element_interaction_tool.j2   |  5 +-
 server/agent/tools/base.py                    | 10 +--
 server/agent/tools/browser_executor.py        | 60 ++++++++--------
 .../agent/tools/element_interaction_tool.py   |  8 +--
 .../tests/unit/test_agent_browser_executor.py | 68 ++++++++++++++++---
 server/tests/unit/test_base_classes.py        | 12 ++--
 server/tests/unit/test_prompt_contracts.py    |  8 +++
 10 files changed, 139 insertions(+), 65 deletions(-)
diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts
index 921fd5c..965ad65 100644
--- a/extension/src/background/index.ts
+++ b/extension/src/background/index.ts
@@ -126,8 +126,9 @@ function buildStoredHighlightPages(options: {
 function buildSnapshotPageRefreshScript(options: {
   elements: InteractiveElement[];
   expectedDocumentId?: string;
+  highlightSnapshotId?: number;
 }): string {
-  const { elements, expectedDocumentId } = options;
+  const { elements, expectedDocumentId, highlightSnapshotId } = options;
   const refreshTargets = elements.map((element) => ({
     id: element.id,
     selector: element.selector,
@@ -137,6 +138,7 @@ function buildSnapshotPageRefreshScript(options: {
   return `
     (() => {
       const expectedDocumentId = ${JSON.stringify(expectedDocumentId || '')};
+      const highlightSnapshotId = ${highlightSnapshotId ?? "null"};
       const refreshTargets = ${JSON.stringify(refreshTargets)};
 
       function normalizeIdentityWhitespace(value, maxLength = 240) {
@@ -259,7 +261,7 @@ function buildSnapshotPageRefreshScript(options: {
           ok: false,
           stale: true,
           error:
-            'Highlight snapshot is stale because the document changed. Call highlight_elements() again.',
+            \`Highlight snapshot \${highlightSnapshotId} is stale because the document changed. Call highlight_elements() again.\`,
         };
       }
 
@@ -350,6 +352,7 @@ async function renderHighlightSnapshotPage(options: {
     buildSnapshotPageRefreshScript({
       elements,
       expectedDocumentId,
+      highlightSnapshotId,
     }),
     true,
     false,
@@ -2830,7 +2833,7 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
                 ok: false,
                 stale: true,
                 error:
-                  "Highlight snapshot is stale because the document changed. Call highlight_elements() again."
+                  "Highlight snapshot ${highlightSnapshotId} is stale because the document changed. Call highlight_elements() again."
               };
             }
             if (!el) {
diff --git a/extension/src/commands/element-actions.ts b/extension/src/commands/element-actions.ts
index 869acb6..05082e3 100644
--- a/extension/src/commands/element-actions.ts
+++ b/extension/src/commands/element-actions.ts
@@ -145,14 +145,14 @@ function buildSnapshotIdentityHelpersScript(): string {
       return overlap >= Math.max(2, Math.min(4, Math.ceil(expectedTokens.length * 0.5)));
     }
 
-    function validateSnapshotElement(expectedDocumentId, expectedFingerprint, el) {
+    function validateSnapshotElement(expectedHighlightSnapshotId, expectedDocumentId, expectedFingerprint, el) {
       const currentDocumentId = getCurrentDocumentId();
       if (expectedDocumentId && currentDocumentId !== expectedDocumentId) {
         return {
           ok: false,
           stale: true,
           error:
-            'Highlight snapshot is stale because the document changed. Call highlight_elements() again.',
+            \`Highlight snapshot \${expectedHighlightSnapshotId} is stale because the document changed. Call highlight_elements() again.\`,
         };
       }
 
@@ -419,6 +419,7 @@ export async function performElementClick(
   const script = `
     (async function() {
       const selector = "${escapedSelector}";
+      const expectedHighlightSnapshotId = ${highlightSnapshotId};
       const expectedDocumentId = "${escapedDocumentId}";
       const expectedFingerprint = "${escapedFingerprint}";
       ${buildEditableActivationHelpersScript()}
@@ -429,6 +430,7 @@ export async function performElementClick(
       }
 
       const snapshotValidation = validateSnapshotElement(
+        expectedHighlightSnapshotId,
         expectedDocumentId,
         expectedFingerprint,
         el,
@@ -705,6 +707,7 @@ export async function performElementHover(
   const script = `
     (function() {
       const selector = "${escapedSelector}";
+      const expectedHighlightSnapshotId = ${highlightSnapshotId};
       const expectedDocumentId = "${escapedDocumentId}";
       const expectedFingerprint = "${escapedFingerprint}";
       ${buildSnapshotIdentityHelpersScript()}
@@ -715,6 +718,7 @@ export async function performElementHover(
       }
 
       const snapshotValidation = validateSnapshotElement(
+        expectedHighlightSnapshotId,
         expectedDocumentId,
         expectedFingerprint,
         el,
@@ -1002,6 +1006,7 @@ export async function performElementScroll(
     script = `
       (function() {
         const selector = "${escapedSelector}";
+        const expectedHighlightSnapshotId = ${highlightSnapshotId};
         const expectedDocumentId = "${escapedDocumentId}";
         const expectedFingerprint = "${escapedFingerprint}";
         const el = document.querySelector(selector);
@@ -1014,6 +1019,7 @@ export async function performElementScroll(
         }
 
         const snapshotValidation = validateSnapshotElement(
+          expectedHighlightSnapshotId,
           expectedDocumentId,
           expectedFingerprint,
           el,
@@ -1336,6 +1342,7 @@ export async function performElementSwipe(
   const script = `
     (async function() {
       const selector = "${escapedSelector}";
+      const expectedHighlightSnapshotId = ${highlightSnapshotId};
       const expectedDocumentId = "${escapedDocumentId}";
       const expectedFingerprint = "${escapedFingerprint}";
       const direction = "${direction}";
@@ -1348,6 +1355,7 @@ export async function performElementSwipe(
       }
 
       const snapshotValidation = validateSnapshotElement(
+        expectedHighlightSnapshotId,
         expectedDocumentId,
         expectedFingerprint,
         el,
@@ -2512,6 +2520,7 @@ export async function performKeyboardInput(
   const script = `
     (function() {
       const selector = "${escapedSelector}";
+      const expectedHighlightSnapshotId = ${highlightSnapshotId};
       const expectedDocumentId = "${escapedDocumentId}";
       const expectedFingerprint = "${escapedFingerprint}";
       const text = "${escapedText}";
@@ -2523,6 +2532,7 @@ export async function performKeyboardInput(
       }
 
       const snapshotValidation = validateSnapshotElement(
+        expectedHighlightSnapshotId,
         expectedDocumentId,
         expectedFingerprint,
         el,
@@ -2813,6 +2823,7 @@ export async function performElementSelect(
   const script = `
     (function() {
       const selector = "${escapedSelector}";
+      const expectedHighlightSnapshotId = ${highlightSnapshotId};
       const expectedDocumentId = "${escapedDocumentId}";
       const expectedFingerprint = "${escapedFingerprint}";
       const value = ${valueJson};
@@ -2825,6 +2836,7 @@ export async function performElementSelect(
       }
 
       const snapshotValidation = validateSnapshotElement(
+        expectedHighlightSnapshotId,
         expectedDocumentId,
         expectedFingerprint,
         el,
diff --git a/server/agent/prompts/big_model/element_interaction_tool.j2 b/server/agent/prompts/big_model/element_interaction_tool.j2
index 18496b2..4a2e901 100644
--- a/server/agent/prompts/big_model/element_interaction_tool.j2
+++ b/server/agent/prompts/big_model/element_interaction_tool.j2
@@ -43,7 +43,7 @@ Direct-execution actions
 **Practical rule**: Expect a confirmation step only for `click` and `keyboard_input`.
 **Hard rule for inputable targets**: Always `click` the target first and complete that confirmation before `keyboard_input`.
 
-**Snapshot rule**: Every element-targeted action must include both `highlight_snapshot_id` and `element_id`. `element_id` is page-local within one highlight snapshot response and is not valid by itself.
+**Snapshot rule**: Every initiating or direct element-targeted action must include both `highlight_snapshot_id` and `element_id`. `element_id` is page-local within one highlight snapshot response and is not valid by itself. `confirm_click` and `confirm_keyboard_input` operate on the current pending confirmation and do not need those fields repeated.
 
 **Discovery rule**: If you do not yet have the correct `element_id`, continue discovery with highlight pagination instead of replacing the missing target with guessed keywords. On the same unchanged page state, your default next step is another `highlight` call with `element_type: "any"`, the next page, and the previous `highlight_snapshot_id`. After any significant page-state change caused by your last action, restart discovery with `highlight` on `element_type: "any"` page 1 without reusing the old snapshot, because it exposes extension-derived page insight you cannot infer reliably from intent alone. Do not jump straight to `keywords` or another narrower type on that changed page. Use `keywords` only when you already see the target's exact literal text on the target itself in the current screenshot and can copy it verbatim. If a control itself shows an icon plus `52`, the literal keyword is `52`, not guessed icon words like `star`, `favorite`, or `bookmark`.
 
@@ -57,7 +57,7 @@ Initiate a click on an element by its visual ID.
 // → Returns an ORANGE preview screenshot
 // → Review the orange highlight + HTML
 // → Then confirm:
-{ "action": "confirm_click", "highlight_snapshot_id": 17, "element_id": "3", "tab_id": 123 }
+{ "action": "confirm_click" }
 ```
 
 Use this for buttons, links, and other trigger controls you identified from the highlight tool.
@@ -148,7 +148,7 @@ Type text into an input element by its visual ID.
 // → Returns an ORANGE preview screenshot
 // → Review the orange highlight + HTML
 // → Then confirm:
-{ "action": "confirm_keyboard_input", "highlight_snapshot_id": 17, "element_id": "1", "tab_id": 123 }
+{ "action": "confirm_keyboard_input" }
 ```
 
 Use this for text inputs, textareas, and search boxes, but only after you already clicked the same input target and completed that click confirmation.
@@ -204,7 +204,7 @@ Each action has a corresponding confirm action (use the same `element_interactio
 - `confirm_click` (for click action)
 - `confirm_keyboard_input` (for keyboard_input action)
 
-**Parameters**: Use the same `highlight_snapshot_id` and `element_id`. `tab_id` is optional. For `confirm_keyboard_input`, the `text` comes from the pending confirmation created by the initiating action.
+**Parameters**: No `element_id`, `highlight_snapshot_id`, or `tab_id` is required. The system executes the current pending confirmation. For `confirm_keyboard_input`, the `text` also comes from that pending confirmation.
 
 **Behavior**: Executes the action after visual verification.
 
diff --git a/server/agent/prompts/small_model/element_interaction_tool.j2 b/server/agent/prompts/small_model/element_interaction_tool.j2
index 5dbf2c4..18f68cf 100644
--- a/server/agent/prompts/small_model/element_interaction_tool.j2
+++ b/server/agent/prompts/small_model/element_interaction_tool.j2
@@ -27,14 +27,14 @@ If the page did not significantly change, stay on the current `any` inventory an
 
 ```json
 { "action": "click", "highlight_snapshot_id": 17, "element_id": "3", "tab_id": 123 }
-{ "action": "confirm_click", "highlight_snapshot_id": 17, "element_id": "3", "tab_id": 123 }
+{ "action": "confirm_click" }
 ```
 
 ### keyboard_input
 
 ```json
 { "action": "keyboard_input", "highlight_snapshot_id": 17, "element_id": "1", "text": "hello@example.com", "tab_id": 123 }
-{ "action": "confirm_keyboard_input", "highlight_snapshot_id": 17, "element_id": "1", "tab_id": 123 }
+{ "action": "confirm_keyboard_input" }
 ```
 
 Use `keyboard_input` only after you already clicked the same input target and completed that click confirmation.
@@ -91,6 +91,7 @@ These are content directions, not hand or finger movement directions. Do not rei
 
 - `click` and `keyboard_input` require confirmation
 - For any `inputable` target, the required sequence is `click` first, then `keyboard_input`
+- `confirm_click` and `confirm_keyboard_input` operate on the current pending confirmation; do not repeat `element_id` or `highlight_snapshot_id`
 - If ORANGE preview appears, confirm only when the box position and HTML both match your target
 - If they do not match, do not confirm; highlight again
 
diff --git a/server/agent/tools/base.py b/server/agent/tools/base.py
index 8e01f03..7a04083 100644
--- a/server/agent/tools/base.py
+++ b/server/agent/tools/base.py
@@ -115,20 +115,14 @@ def _pending_confirmation_llm_content(
         highlight_snapshot_id = str(
             pending.get("highlight_snapshot_id", self.highlight_snapshot_id or "unknown")
         )
-        snapshot_json_value = (
-            highlight_snapshot_id
-            if highlight_snapshot_id.isdigit()
-            else f'"{highlight_snapshot_id}"'
-        )
-        confirm_cmd = (
-            f'{{"action": "confirm_{action_type}", "highlight_snapshot_id": {snapshot_json_value}, "element_id": "{element_id}"}}'
-        )
+        confirm_cmd = f'{{"action": "confirm_{action_type}"}}'
 
         text_parts = [
             "## Pending Confirmation",
             "",
             "Inspect the screenshot first.",
             "Confirm only if the single highlighted element is exactly the intended target.",
+            "The pending element and highlight snapshot are already stored by the system.",
             "",
             f"**Highlight Snapshot ID**: {highlight_snapshot_id}",
             f"**Element ID**: {element_id}",
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index 60c57f1..dac0f4b 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -515,35 +515,37 @@ def _execute_element_interaction_action(
                 raise ValueError(
                     "No pending click confirmation found. Please call click first."
                 )
-            if action.highlight_snapshot_id is None:
-                raise ValueError("confirm_click requires highlight_snapshot_id parameter")
-            if pending["element_id"] != action.element_id:
+            pending_element_id = pending.get("element_id")
+            pending_snapshot_id = pending.get("highlight_snapshot_id")
+            pending_extra_data = pending.get("extra_data", {})
+            if pending_snapshot_id is None:
+                pending_snapshot_id = pending_extra_data.get("highlight_snapshot_id")
+            if not pending_element_id:
                 raise ValueError(
-                    f"Element ID mismatch. Expected {pending['element_id']}, got {action.element_id}"
+                    "Pending click confirmation is missing element_id state."
                 )
-            if pending["highlight_snapshot_id"] != action.highlight_snapshot_id:
+            if pending_snapshot_id is None:
                 raise ValueError(
-                    f"Highlight snapshot mismatch. Expected {pending['highlight_snapshot_id']}, got {action.highlight_snapshot_id}"
+                    "Pending click confirmation is missing highlight_snapshot_id state."
                 )
             # Execute actual click
             command = ClickElementCommand(
-                element_id=action.element_id,
-                highlight_snapshot_id=action.highlight_snapshot_id
-                or pending["extra_data"].get("highlight_snapshot_id"),
+                element_id=pending_element_id,
+                highlight_snapshot_id=pending_snapshot_id,
                 conversation_id=self.conversation_id,
-                tab_id=action.tab_id or pending["extra_data"].get("tab_id"),
+                tab_id=pending_extra_data.get("tab_id"),
             )
             result_dict = self._execute_command_sync(command)
             if not result_dict or not result_dict.get("success"):
                 ext_error = self._extract_result_error(result_dict)
                 raise RuntimeError(f"Failed to click element: {ext_error}")
-            message = f"Confirmed and clicked element: {action.element_id}"
+            message = f"Confirmed and clicked element: {pending_element_id}"
             self._clear_pending_confirmation()
             return self._build_observation_from_result(
                 result_dict,
                 message,
-                element_id=action.element_id,
-                highlight_snapshot_id=action.highlight_snapshot_id,
+                element_id=pending_element_id,
+                highlight_snapshot_id=pending_snapshot_id,
             )
 
         elif action_type == "confirm_keyboard_input":
@@ -552,37 +554,37 @@ def _execute_element_interaction_action(
                 raise ValueError(
                     "No pending keyboard_input confirmation found. Please call keyboard_input first."
                 )
-            if action.highlight_snapshot_id is None:
-                raise ValueError(
-                    "confirm_keyboard_input requires highlight_snapshot_id parameter"
-                )
-            if pending["element_id"] != action.element_id:
+            pending_element_id = pending.get("element_id")
+            pending_snapshot_id = pending.get("highlight_snapshot_id")
+            pending_extra_data = pending.get("extra_data", {})
+            if pending_snapshot_id is None:
+                pending_snapshot_id = pending_extra_data.get("highlight_snapshot_id")
+            if not pending_element_id:
                 raise ValueError(
-                    f"Element ID mismatch. Expected {pending['element_id']}, got {action.element_id}"
+                    "Pending keyboard_input confirmation is missing element_id state."
                 )
-            if pending["highlight_snapshot_id"] != action.highlight_snapshot_id:
+            if pending_snapshot_id is None:
                 raise ValueError(
-                    f"Highlight snapshot mismatch. Expected {pending['highlight_snapshot_id']}, got {action.highlight_snapshot_id}"
+                    "Pending keyboard_input confirmation is missing highlight_snapshot_id state."
                 )
             command = KeyboardInputCommand(
-                element_id=action.element_id,
-                highlight_snapshot_id=action.highlight_snapshot_id
-                or pending["extra_data"].get("highlight_snapshot_id"),
-                text=pending["extra_data"].get("text", ""),
+                element_id=pending_element_id,
+                highlight_snapshot_id=pending_snapshot_id,
+                text=pending_extra_data.get("text", ""),
                 conversation_id=self.conversation_id,
-                tab_id=action.tab_id or pending["extra_data"].get("tab_id"),
+                tab_id=pending_extra_data.get("tab_id"),
             )
             result_dict = self._execute_command_sync(command)
             if not result_dict or not result_dict.get("success"):
                 ext_error = self._extract_result_error(result_dict)
                 raise RuntimeError(f"Failed to input text: {ext_error}")
-            message = f"Confirmed and input text to element: {action.element_id}"
+            message = f"Confirmed and input text to element: {pending_element_id}"
             self._clear_pending_confirmation()
             return self._build_observation_from_result(
                 result_dict,
                 message,
-                element_id=action.element_id,
-                highlight_snapshot_id=action.highlight_snapshot_id,
+                element_id=pending_element_id,
+                highlight_snapshot_id=pending_snapshot_id,
             )
 
         else:
diff --git a/server/agent/tools/element_interaction_tool.py b/server/agent/tools/element_interaction_tool.py
index 57fac02..f68b83c 100644
--- a/server/agent/tools/element_interaction_tool.py
+++ b/server/agent/tools/element_interaction_tool.py
@@ -42,15 +42,15 @@ class ElementInteractionAction(OpenBrowserAction):
         "confirm_click",
         "confirm_keyboard_input",
     ] = Field(
-        description="Element interaction action (click and keyboard_input require confirm_* follow-up; hover/scroll/swipe/select execute directly)"
+        description="Element interaction action (click and keyboard_input require confirm_* follow-up; confirm_* executes the current pending confirmation; hover/scroll/swipe/select execute directly)"
     )
     element_id: Optional[str] = Field(
         default=None,
-        description="Element ID (page-local numeric string) from a specific highlight_elements snapshot",
+        description="Element ID (page-local numeric string) from a specific highlight_elements snapshot. Required for click, hover, element scroll/swipe, keyboard_input, and select. Ignored for confirm_* actions.",
     )
     highlight_snapshot_id: Optional[int] = Field(
         default=None,
-        description="Highlight snapshot ID returned by highlight_elements. Required for all element-targeted actions.",
+        description="Highlight snapshot ID returned by highlight_elements. Required for click, hover, element scroll/swipe, keyboard_input, and select. Ignored for confirm_* actions.",
     )
     direction: Optional[Literal["up", "down", "left", "right", "next", "prev"]] = Field(
         default="down",
@@ -83,7 +83,7 @@ class ElementInteractionAction(OpenBrowserAction):
     )
     tab_id: Optional[int] = Field(
         default=None,
-        description="Tab ID (optional, uses active tab if not specified)",
+        description="Tab ID (optional, uses active tab if not specified). confirm_* actions use the tab stored in the pending confirmation.",
     )
 
 
diff --git a/server/tests/unit/test_agent_browser_executor.py b/server/tests/unit/test_agent_browser_executor.py
index 20edab1..ed9d3fb 100644
--- a/server/tests/unit/test_agent_browser_executor.py
+++ b/server/tests/unit/test_agent_browser_executor.py
@@ -37,6 +37,7 @@ def test_execute_command_sync_promotes_nested_error_to_top_level(monkeypatch) ->
     result = executor._execute_command_sync(
         SwipeElementCommand(
             element_id="swp123",
+            highlight_snapshot_id=17,
             direction="next",
             conversation_id="conv-swipe-error",
         )
@@ -61,6 +62,7 @@ def fake_execute(command):
         ElementInteractionAction(
             action="hover",
             element_id="hov123",
+            highlight_snapshot_id=17,
             conversation_id="conv-hover-direct",
         )
     )
@@ -79,7 +81,10 @@ def test_keyboard_input_sets_pending_confirmation(monkeypatch) -> None:
     monkeypatch.setattr(
         executor,
         "_get_element_full_html",
-        lambda element_id: ('<input type="text" />', "data:image/png;base64,pending"),
+        lambda element_id, highlight_snapshot_id: (
+            '<input type="text" />',
+            "data:image/png;base64,pending",
+        ),
     )
     monkeypatch.setattr(
         executor,
@@ -91,6 +96,7 @@ def test_keyboard_input_sets_pending_confirmation(monkeypatch) -> None:
         ElementInteractionAction(
             action="keyboard_input",
             element_id="inp123",
+            highlight_snapshot_id=17,
             text="hello",
             conversation_id="conv-input-pending",
         )
@@ -101,6 +107,41 @@ def test_keyboard_input_sets_pending_confirmation(monkeypatch) -> None:
     assert observation.pending_confirmation is not None
     assert observation.pending_confirmation["action_type"] == "keyboard_input"
     assert observation.pending_confirmation["element_id"] == "inp123"
+    assert observation.pending_confirmation["highlight_snapshot_id"] == 17
+
+
+def test_confirm_click_uses_pending_confirmation_state(monkeypatch) -> None:
+    executor = BrowserExecutor()
+    executor.conversation_id = "conv-click-confirm"
+    executor._set_pending_confirmation(
+        element_id="btn13",
+        highlight_snapshot_id=39,
+        action_type="click",
+        full_html="<button>Save</button>",
+        extra_data={"tab_id": 456, "highlight_snapshot_id": 39},
+    )
+
+    captured = {}
+
+    def fake_execute(command):
+        captured["command"] = command
+        return {"success": True, "data": {"screenshot": "data:image/png;base64,clicked"}}
+
+    monkeypatch.setattr(executor, "_execute_command_sync", fake_execute)
+
+    observation = executor._execute_action_sync(
+        ElementInteractionAction(
+            action="confirm_click",
+            conversation_id="conv-click-confirm",
+        )
+    )
+
+    assert observation.success is True
+    assert observation.message == "Confirmed and clicked element: btn13"
+    assert captured["command"].element_id == "btn13"
+    assert captured["command"].highlight_snapshot_id == 39
+    assert captured["command"].tab_id == 456
+    assert executor._get_pending_confirmation() is None
 
 
 def test_confirm_keyboard_input_reports_nested_extension_error(monkeypatch) -> None:
@@ -108,25 +149,31 @@ def test_confirm_keyboard_input_reports_nested_extension_error(monkeypatch) -> N
     executor.conversation_id = "conv-input-error"
     executor._set_pending_confirmation(
         element_id="inp123",
+        highlight_snapshot_id=17,
         action_type="keyboard_input",
         full_html='<input type="text" />',
-        extra_data={"text": "hello world"},
+        extra_data={
+            "text": "hello world",
+            "tab_id": 321,
+            "highlight_snapshot_id": 17,
+        },
     )
 
-    monkeypatch.setattr(
-        executor,
-        "_execute_command_sync",
-        lambda command: {
+    captured = {}
+
+    def fake_execute(command):
+        captured["command"] = command
+        return {
             "success": False,
             "error": None,
             "data": {"error": "Input element is detached"},
-        },
-    )
+        }
+
+    monkeypatch.setattr(executor, "_execute_command_sync", fake_execute)
 
     observation = executor._execute_action_sync(
         ElementInteractionAction(
             action="confirm_keyboard_input",
-            element_id="inp123",
             conversation_id="conv-input-error",
         )
     )
@@ -136,4 +183,7 @@ def test_confirm_keyboard_input_reports_nested_extension_error(monkeypatch) -> N
         observation.error
         == "Failed to input text: Input element is detached"
     )
+    assert captured["command"].element_id == "inp123"
+    assert captured["command"].highlight_snapshot_id == 17
+    assert captured["command"].tab_id == 321
     assert "None" not in observation.message
diff --git a/server/tests/unit/test_base_classes.py b/server/tests/unit/test_base_classes.py
index 559225d..0793763 100644
--- a/server/tests/unit/test_base_classes.py
+++ b/server/tests/unit/test_base_classes.py
@@ -176,8 +176,10 @@ def test_pending_confirmation_includes_follow_up_command(self) -> None:
 
         assert "## Pending Confirmation" in text
         assert "Inspect the screenshot first." in text
-        assert '"action": "confirm_click"' in text
-        assert '"element_id": "a1b2c3"' in text
+        assert '{"action": "confirm_click"}' in text
+        assert '"highlight_snapshot_id"' not in text
+        assert '"element_id"' not in text
+        assert "**Element ID**: a1b2c3" in text
 
     def test_pending_keyboard_confirmation_uses_matching_follow_up_command(self) -> None:
         observation = OpenBrowserObservation(
@@ -191,8 +193,10 @@ def test_pending_keyboard_confirmation_uses_matching_follow_up_command(self) ->
 
         text = _text_content(observation)
 
-        assert '"action": "confirm_keyboard_input"' in text
-        assert '"element_id": "inp789"' in text
+        assert '{"action": "confirm_keyboard_input"}' in text
+        assert '"highlight_snapshot_id"' not in text
+        assert '"element_id"' not in text
+        assert "**Element ID**: inp789" in text
 
     def test_pending_confirmation_with_screenshot_is_image_first_and_text_minimal(self) -> None:
         observation = OpenBrowserObservation(
diff --git a/server/tests/unit/test_prompt_contracts.py b/server/tests/unit/test_prompt_contracts.py
index 09cfe2d..e26a17d 100644
--- a/server/tests/unit/test_prompt_contracts.py
+++ b/server/tests/unit/test_prompt_contracts.py
@@ -152,6 +152,14 @@ def test_element_interaction_prompt_explains_swipe_semantics(self) -> None:
         assert '`direction: "prev"` means show the previous picture' in description
         assert "not finger or gesture directions" in description
 
+    def test_element_interaction_confirm_examples_use_pending_state_only(self) -> None:
+        description = get_element_interaction_tool_description()
+
+        assert '{ "action": "confirm_click" }' in description
+        assert '{ "action": "confirm_keyboard_input" }' in description
+        assert '{ "action": "confirm_click", "highlight_snapshot_id": 17, "element_id": "3", "tab_id": 123 }' not in description
+        assert '{ "action": "confirm_keyboard_input", "highlight_snapshot_id": 17, "element_id": "1", "tab_id": 123 }' not in description
+
     def test_element_interaction_action_schema_explains_swipe_semantics(self) -> None:
         description = ElementInteractionAction.model_fields["direction"].description
 

From 4e7f088f9ae5ca65581d71917b39bf57f23b0d87 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Thu, 26 Mar 2026 13:00:17 +0800
Subject: [PATCH 2/7] Add please_help_me user assistance flow

Co-authored-by: openhands <openhands@all-hands.dev>
---
 frontend/index.html                           | 503 ++++++++++++++++--
 pyproject.toml                                |   4 +-
 server/agent/api.py                           |   9 +-
 server/agent/manager.py                       |  21 +-
 server/agent/tools/help_tool.py               | 111 ++++
 server/agent/user_help.py                     |  54 ++
 server/agent/visualizer.py                    |  16 +
 server/core/browser_executor_bundle.py        |   9 +-
 .../tests/unit/test_agent_manager_process.py  |  12 +-
 server/tests/unit/test_user_help.py           |  80 +++
 uv.lock                                       |   8 +-
 11 files changed, 765 insertions(+), 62 deletions(-)
 create mode 100644 server/agent/tools/help_tool.py
 create mode 100644 server/agent/user_help.py
 create mode 100644 server/tests/unit/test_user_help.py

diff --git a/frontend/index.html b/frontend/index.html
index 9c31966..9af5c57 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -1350,6 +1350,129 @@
             visibility: visible;
         }
 
+        .help-request-modal {
+            position: fixed;
+            inset: 0;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            padding: 24px;
+            background: rgba(8, 6, 4, 0.72);
+            backdrop-filter: blur(10px);
+            z-index: 3200;
+            opacity: 0;
+            visibility: hidden;
+            transition: opacity 0.2s ease, visibility 0.2s ease;
+        }
+
+        .help-request-modal.show {
+            opacity: 1;
+            visibility: visible;
+        }
+
+        .help-request-card {
+            width: min(640px, 100%);
+            display: flex;
+            flex-direction: column;
+            gap: 16px;
+            padding: 24px;
+            border-radius: 24px;
+            border: 1px solid rgba(255, 210, 140, 0.32);
+            background: linear-gradient(180deg, rgba(42, 29, 17, 0.97), rgba(29, 21, 15, 0.98));
+            box-shadow: 0 32px 90px rgba(0, 0, 0, 0.42);
+        }
+
+        .help-request-kicker {
+            color: #ffd28c;
+            font-size: 11px;
+            font-weight: 700;
+            letter-spacing: 0.16em;
+            text-transform: uppercase;
+        }
+
+        .help-request-title {
+            font-size: 28px;
+            line-height: 1.05;
+            letter-spacing: -0.04em;
+        }
+
+        .help-request-copy,
+        .help-request-conversation {
+            color: var(--text-muted);
+            font-size: 14px;
+            line-height: 1.6;
+        }
+
+        .help-request-message {
+            padding: 16px 18px;
+            border-radius: 18px;
+            border: 1px solid rgba(255, 210, 140, 0.18);
+            background: rgba(255, 210, 140, 0.08);
+            color: var(--text-primary);
+            font-size: 15px;
+            line-height: 1.7;
+            white-space: pre-wrap;
+        }
+
+        .help-request-textarea {
+            min-height: 112px;
+            width: 100%;
+            padding: 14px 16px;
+            border-radius: 16px;
+            border: 1px solid var(--panel-border);
+            background: rgba(21, 17, 13, 0.92);
+            color: var(--text-primary);
+            font: inherit;
+            font-size: 14px;
+            line-height: 1.6;
+            resize: vertical;
+        }
+
+        .help-request-textarea:focus {
+            outline: none;
+            border-color: rgba(255, 210, 140, 0.42);
+            box-shadow: 0 0 0 4px rgba(255, 210, 140, 0.08);
+        }
+
+        .help-request-actions {
+            display: flex;
+            flex-wrap: wrap;
+            gap: 10px;
+        }
+
+        .help-request-primary,
+        .help-request-secondary {
+            border: none;
+            border-radius: 999px;
+            padding: 12px 18px;
+            cursor: pointer;
+            font: inherit;
+            font-size: 14px;
+            font-weight: 700;
+            transition: transform 0.18s ease, opacity 0.18s ease;
+        }
+
+        .help-request-primary {
+            background: #ffd28c;
+            color: #2b1b0c;
+        }
+
+        .help-request-secondary {
+            background: rgba(255, 255, 255, 0.08);
+            color: var(--text-secondary);
+            border: 1px solid var(--panel-border);
+        }
+
+        .help-request-primary:hover,
+        .help-request-secondary:hover {
+            transform: translateY(-1px);
+        }
+
+        body.help-needed .terminal-header {
+            border-color: rgba(255, 210, 140, 0.34);
+            box-shadow: 0 28px 90px rgba(0, 0, 0, 0.38), 0 0 0 1px rgba(255, 210, 140, 0.14);
+        }
+
         .empty-state {
             text-align: center;
             padding: 40px 20px;
@@ -2635,6 +2758,37 @@
 <body>
     <!-- Session Sidebar Overlay -->
     <div class="session-overlay" id="session-overlay" onclick="closeSessionSidebar()"></div>
+
+    <div class="help-request-modal" id="help-request-modal">
+        <div class="help-request-card">
+            <div class="help-request-kicker">Human Help Needed</div>
+            <div class="help-request-title">The agent is waiting for you</div>
+            <div class="help-request-copy">Complete the manual browser step below, then resume the same conversation.</div>
+            <div class="help-request-conversation">Conversation: <span id="help-request-conversation">Current conversation</span></div>
+            <div class="help-request-message" id="help-request-message">The agent needs your help before it can continue.</div>
+            <textarea
+                class="help-request-textarea"
+                id="help-request-input"
+                placeholder="Optional custom reply to the agent, for example: 我已经通过验证码了，请继续。"
+            ></textarea>
+            <div class="help-request-actions">
+                <button
+                    type="button"
+                    class="help-request-primary"
+                    onclick="resolveHelpWithDefaultReply()"
+                >
+                    I've resolved it
+                </button>
+                <button
+                    type="button"
+                    class="help-request-secondary"
+                    onclick="resolvePendingHelpRequest(document.getElementById('help-request-input').value)"
+                >
+                    Send custom reply
+                </button>
+            </div>
+        </div>
+    </div>
     
     <!-- Session Sidebar -->
     <div class="session-sidebar" id="session-sidebar">
@@ -3132,6 +3286,9 @@ <h1>Sisyphus</h1>
         let latestAgentStep = 'Waiting for a task.';
         let latestAgentThinking = 'Waiting for agent reasoning.';
         let latestAgentTimestamp = null;
+        let pendingHelpRequest = null;
+        let helpAttentionTimer = null;
+        let defaultDocumentTitle = document.title;
         
         // Sisyphus mode variables
         let sisyphusEnabled = false;
@@ -3149,6 +3306,8 @@ <h1>Sisyphus</h1>
         const MODEL_ALIAS_STORAGE_KEY = 'openbrowser_model_alias';
         const AGENT_STEP_PLACEHOLDER = 'Waiting for a task.';
         const AGENT_THINKING_PLACEHOLDER = 'Waiting for agent reasoning.';
+        const DEFAULT_HELP_RESUME_MESSAGE = 'I have completed the human step you needed. Please continue based on the current page state.';
+        const HELP_REQUEST_TITLE = 'Human Help Needed · OpenBrowser';
         
         // Helper function to build API URL
         function apiUrl(path) {
@@ -3173,6 +3332,214 @@ <h1>Sisyphus</h1>
             return `${value.substring(0, 8)}...${value.substring(value.length - 4)}`;
         }
 
+        function sleep(ms) {
+            return new Promise(resolve => setTimeout(resolve, ms));
+        }
+
+        async function waitForProcessingToFinish(timeoutMs = 5000) {
+            const start = Date.now();
+            while (isProcessing && Date.now() - start < timeoutMs) {
+                await sleep(50);
+            }
+        }
+
+        function getHelpRequestText(event) {
+            return (
+                event.help_request ||
+                event.message ||
+                event.text ||
+                'The agent needs your help before it can continue.'
+            );
+        }
+
+        function clearHelpAttention() {
+            if (helpAttentionTimer) {
+                clearInterval(helpAttentionTimer);
+                helpAttentionTimer = null;
+            }
+            document.title = defaultDocumentTitle;
+            document.body.classList.remove('help-needed');
+        }
+
+        function startHelpAttention() {
+            clearHelpAttention();
+            document.body.classList.add('help-needed');
+
+            let highlighted = false;
+            document.title = HELP_REQUEST_TITLE;
+            helpAttentionTimer = setInterval(() => {
+                highlighted = !highlighted;
+                document.title = highlighted ? HELP_REQUEST_TITLE : defaultDocumentTitle;
+            }, 1200);
+        }
+
+        function focusHelpRequestUi() {
+            try {
+                window.focus();
+            } catch (error) {
+                console.debug('Unable to focus window:', error);
+            }
+
+            setTimeout(() => {
+                try {
+                    window.focus();
+                } catch (error) {
+                    console.debug('Unable to refocus window:', error);
+                }
+                const customInput = document.getElementById('help-request-input');
+                if (customInput) {
+                    customInput.focus();
+                }
+            }, 120);
+        }
+
+        function hideHelpRequestModal() {
+            const modal = document.getElementById('help-request-modal');
+            if (modal) {
+                modal.classList.remove('show');
+            }
+        }
+
+        function clearPendingHelpRequest() {
+            pendingHelpRequest = null;
+            clearHelpAttention();
+            hideHelpRequestModal();
+        }
+
+        function showHelpRequestModal(request, options = {}) {
+            if (
+                pendingHelpRequest &&
+                pendingHelpRequest.toolCallId &&
+                request.toolCallId &&
+                pendingHelpRequest.toolCallId === request.toolCallId
+            ) {
+                if (!options.silent) {
+                    focusHelpRequestUi();
+                }
+                return;
+            }
+
+            pendingHelpRequest = request;
+
+            const modal = document.getElementById('help-request-modal');
+            const messageNode = document.getElementById('help-request-message');
+            const conversationNode = document.getElementById('help-request-conversation');
+            const inputNode = document.getElementById('help-request-input');
+
+            if (!modal || !messageNode || !conversationNode || !inputNode) {
+                return;
+            }
+
+            messageNode.textContent = request.message;
+            conversationNode.textContent = shortDisplay(
+                request.conversationId || currentConversationId,
+                'Current conversation'
+            );
+            inputNode.value = request.customReply || '';
+            modal.classList.add('show');
+            startHelpAttention();
+
+            if (!options.silent) {
+                focusHelpRequestUi();
+            }
+
+            if (sisyphusRunning) {
+                stopSisyphus();
+                addEvent({
+                    type: 'SystemEvent',
+                    text: 'Sisyphus stopped because the agent requested human help.',
+                    timestamp: new Date().toISOString()
+                });
+            }
+        }
+
+        async function resolvePendingHelpRequest(messageText) {
+            if (!pendingHelpRequest) {
+                return;
+            }
+
+            const trimmed = (messageText || '').trim();
+            if (!trimmed) {
+                addEvent({
+                    type: 'ErrorEvent',
+                    text: 'Please enter a reply for the agent, or use “I\'ve resolved it”.',
+                    timestamp: new Date().toISOString()
+                });
+                return;
+            }
+
+            const request = pendingHelpRequest;
+            clearPendingHelpRequest();
+            await waitForProcessingToFinish();
+
+            if (isProcessing) {
+                showHelpRequestModal(request, { silent: true });
+                addEvent({
+                    type: 'ErrorEvent',
+                    text: 'The previous turn is still closing. Please try again in a moment.',
+                    timestamp: new Date().toISOString()
+                });
+                return;
+            }
+
+            if (request.conversationId && currentConversationId && request.conversationId !== currentConversationId) {
+                addEvent({
+                    type: 'ErrorEvent',
+                    text: 'The pending help request belongs to a different conversation.',
+                    timestamp: new Date().toISOString()
+                });
+                return;
+            }
+
+            const input = document.getElementById('command-input');
+            if (input) {
+                input.value = trimmed;
+                syncCommandInputHeight();
+            }
+
+            await sendCommandText(trimmed, { preserveInput: false });
+        }
+
+        async function resolveHelpWithDefaultReply() {
+            await resolvePendingHelpRequest(DEFAULT_HELP_RESUME_MESSAGE);
+        }
+
+        function isHelpRequestEvent(event) {
+            return (
+                event.type === 'ActionEvent' &&
+                event.tool_name === 'please_help_me'
+            );
+        }
+
+        function normalizeFrontendEvent(data, fallback = {}) {
+            const eventType = data.type || fallback.type || 'SystemEvent';
+            let source = data.source || fallback.source || 'agent';
+
+            if (eventType === 'MessageEvent') {
+                source = data.source || fallback.source || (data.role === 'user' ? 'user' : 'agent');
+            }
+
+            return {
+                id: fallback.id || `event-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`,
+                type: eventType,
+                text: data.text || fallback.text || '',
+                timestamp: data.timestamp || fallback.timestamp || new Date().toISOString(),
+                image: data.image || fallback.image || null,
+                success: data.success,
+                message: data.message,
+                error: data.error,
+                role: data.role || fallback.role || null,
+                activated_skills: data.activated_skills || fallback.activated_skills || null,
+                sender: data.sender || fallback.sender || null,
+                source: source,
+                summary: data.summary || fallback.summary || null,
+                tool_name: data.tool_name || fallback.tool_name || null,
+                tool_call_id: data.tool_call_id || fallback.tool_call_id || null,
+                help_request: data.help_request || fallback.help_request || null,
+                awaiting_user_help: Boolean(data.awaiting_user_help || fallback.awaiting_user_help)
+            };
+        }
+
         function truncateForSummary(value, maxLength = 140) {
             const normalized = (value || '').replace(/\s+/g, ' ').trim();
             if (!normalized) {
@@ -3217,6 +3584,10 @@ <h1>Sisyphus</h1>
         }
 
         function getAgentVoiceFallback() {
+            if (pendingHelpRequest) {
+                return pendingHelpRequest.message;
+            }
+
             if (isProcessing) {
                 return 'The agent is working. Follow the browser stage and the step feed below.';
             }
@@ -3390,6 +3761,20 @@ <h1>Sisyphus</h1>
                 latestAgentTimestamp = timestamp;
                 shouldResetStageScroll = true;
             } else if (event.type === 'ActionEvent') {
+                if (event.tool_name === 'please_help_me') {
+                    const helpText = getHelpRequestText(event);
+                    latestAgentStep = 'Needs human help';
+                    latestAgentThinking = truncateForSummary(helpText, 220);
+                    latestAgentMessage = helpText;
+                    latestAgentTimestamp = timestamp;
+                    shouldResetStageScroll = true;
+                    renderAgentVoice();
+                    if (shouldResetStageScroll) {
+                        resetStageConversationScroll();
+                    }
+                    return;
+                }
+
                 const parsed = parseActionSections(content);
                 latestAgentStep = truncateForSummary(
                     event.summary || parsed.summary || content,
@@ -3483,6 +3868,8 @@ <h1>Sisyphus</h1>
 
             if (event.type === 'MessageEvent' && event.source === 'user') {
                 label = 'User task';
+            } else if (event.tool_name === 'please_help_me') {
+                label = 'Help';
             } else if (event.type === 'SystemEvent') {
                 label = 'System';
             }
@@ -3512,6 +3899,8 @@ <h1>Sisyphus</h1>
             let nextStep = 'Paste the browser UUID to connect this browser window.';
             if (!appConfig?.is_configured) {
                 nextStep = 'Open Settings and add a model before running tasks.';
+            } else if (pendingHelpRequest) {
+                nextStep = 'Human help requested. Complete the manual step and resume the same conversation.';
             } else if (sisyphusEnabled && sisyphusRunning) {
                 nextStep = 'Sisyphus is running. Watch the browser stage or stop the loop.';
             } else if (sisyphusEnabled) {
@@ -3590,6 +3979,7 @@ <h1>Sisyphus</h1>
                 currentConversationId
             ) {
                 const oldConversationId = currentConversationId;
+                clearPendingHelpRequest();
                 currentConversationId = null;
                 sisyphusCurrentConversationId = null;
                 updateConversationIdDisplay();
@@ -3697,6 +4087,7 @@ <h1>Sisyphus</h1>
             }
 
             const oldConversationId = currentConversationId;
+            clearPendingHelpRequest();
             currentConversationId = null;
             sisyphusCurrentConversationId = null;
             updateConversationIdDisplay();
@@ -4305,6 +4696,7 @@ <h1>Sisyphus</h1>
                 
                 // If deleted session was current, clear it
                 if (sessionId === currentConversationId) {
+                    clearPendingHelpRequest();
                     currentConversationId = null;
                     localStorage.removeItem('openbrowser_conversation_id');
                     updateConversationIdDisplay();
@@ -4391,6 +4783,9 @@ <h1>Sisyphus</h1>
                     resetEventCounters();
                     resetRecentActivity();
                     resetAgentVoice();
+                    clearPendingHelpRequest();
+
+                    let unresolvedHelpRequest = null;
                     
                     // Replay events
                     for (const event of data.events) {
@@ -4402,15 +4797,27 @@ <h1>Sisyphus</h1>
                         }
                         
                         // Convert to frontend event format
-                        const frontendEvent = {
-                            type: eventData.type || event.event_type,
-                            text: eventData.text || eventData.action || eventData.observation || '',
-                            timestamp: eventData.timestamp || event.created_at,
-                            role: eventData.role
-                        };
+                        const frontendEvent = normalizeFrontendEvent(eventData, {
+                            type: event.event_type,
+                            timestamp: event.created_at
+                        });
                         
                         updateEventCounters(frontendEvent.type);
                         addEvent(frontendEvent);
+
+                        if (frontendEvent.type === 'MessageEvent' && frontendEvent.source === 'user') {
+                            unresolvedHelpRequest = null;
+                        } else if (isHelpRequestEvent(frontendEvent)) {
+                            unresolvedHelpRequest = {
+                                conversationId: conversationId,
+                                toolCallId: frontendEvent.tool_call_id,
+                                message: getHelpRequestText(frontendEvent)
+                            };
+                        }
+                    }
+
+                    if (unresolvedHelpRequest) {
+                        showHelpRequestModal(unresolvedHelpRequest, { silent: true });
                     }
                     
                     console.log(`Loaded ${data.events.length} events for session ${conversationId}`);
@@ -4573,6 +4980,7 @@ <h1>Sisyphus</h1>
                 // When cwd changes, clear current conversation to force new creation
                 if (currentConversationId) {
                     console.log(`[Frontend] CWD changed to "${this.value}", clearing current conversation`);
+                    clearPendingHelpRequest();
                     currentConversationId = null;
                     localStorage.removeItem('openbrowser_conversation_id');
                     updateConversationIdDisplay();
@@ -4640,15 +5048,25 @@ <h1>Sisyphus</h1>
         // Send command to agent
         async function sendCommand() {
             const input = document.getElementById('command-input');
-            const command = input.value.trim();
-            
+            return sendCommandText(input.value.trim(), { inputElement: input });
+        }
+
+        async function sendCommandText(command, options = {}) {
+            const input = options.inputElement || document.getElementById('command-input');
+
             if (!command || isProcessing) {
                 return;
             }
-            
+
+            if (pendingHelpRequest) {
+                clearPendingHelpRequest();
+            }
+
             // Clear input
-            input.value = '';
-            syncCommandInputHeight();
+            if (!options.preserveInput && input) {
+                input.value = '';
+                syncCommandInputHeight();
+            }
             
             // Add user command to terminal (always show user message)
             console.log(`[Frontend] Sending command: "${command}"`);
@@ -4828,9 +5246,10 @@ <h1>Sisyphus</h1>
                         processEvent(data);
                     } else if (eventType === 'complete') {
                         let message = 'Conversation completed';
+                        let completionData = null;
                         try {
-                            const data = JSON.parse(eventData);
-                            message = data.message || message;
+                            completionData = JSON.parse(eventData);
+                            message = completionData.message || message;
                         } catch (e) {
                             // Use default message
                         }
@@ -4851,6 +5270,14 @@ <h1>Sisyphus</h1>
                         isProcessing = false;
                         isConversationRunning = false;
                         updateGuidedPanels();
+
+                        if (completionData && completionData.needs_user_help) {
+                            showHelpRequestModal({
+                                conversationId: completionData.conversation_id || currentConversationId,
+                                toolCallId: completionData.tool_call_id || null,
+                                message: completionData.help_request || message
+                            });
+                        }
                     } else if (eventType === 'error') {
                         let errorMessage = 'SSE connection error';
                         try {
@@ -5108,32 +5535,7 @@ <h1>Sisyphus</h1>
                 return;
             }
             
-            // Determine source based on role or event type
-            let source = 'agent'; // default
-            if (eventType === 'MessageEvent') {
-                // Use role field for MessageEvent
-                source = data.role === 'user' ? 'user' : 'agent';
-            } else if (data.text && data.text.includes('user:')) {
-                // Fallback for other event types that might have user text
-                source = 'user';
-            }
-            
-            // Create event object
-            const event = {
-                id: `event-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`,
-                type: eventType,
-                text: data.text || '',
-                timestamp: data.timestamp || new Date().toISOString(),
-                image: data.image || null,
-                success: data.success,
-                message: data.message,
-                error: data.error,
-                role: data.role || null, // Include role field
-                activated_skills: data.activated_skills || null,
-                sender: data.sender || null,
-                source: source,
-                summary: data.summary || null // Include summary field for collapsed display
-            };
+            const event = normalizeFrontendEvent(data);
             
             console.log(`[Frontend] Created event:`, event);
             
@@ -5150,6 +5552,16 @@ <h1>Sisyphus</h1>
             
             // Render event (without image in the event line)
             addEvent(event);
+
+            if (isHelpRequestEvent(event)) {
+                if (!pendingHelpRequest || pendingHelpRequest.toolCallId !== event.tool_call_id) {
+                    showHelpRequestModal({
+                        conversationId: currentConversationId,
+                        toolCallId: event.tool_call_id,
+                        message: getHelpRequestText(event)
+                    });
+                }
+            }
         }
         
         // Update event counters and display
@@ -5216,6 +5628,11 @@ <h1>Sisyphus</h1>
                 typeColor = '#ffb8b8';
                 typeLabel = 'ERROR';
             }
+
+            if (event.tool_name === 'please_help_me') {
+                typeColor = '#ffd28c';
+                typeLabel = 'HELP';
+            }
             
             // Format timestamp
             const timestamp = new Date(event.timestamp);
@@ -5225,9 +5642,13 @@ <h1>Sisyphus</h1>
             let content = event.text || event.message || '';
             if (event.error) {
                 content = `ERROR: ${event.error}`;
+            } else if (event.tool_name === 'please_help_me') {
+                content = getHelpRequestText(event);
             }
             const headerSummary = event.summary || (
-                event.type === 'MessageEvent'
+                event.tool_name === 'please_help_me'
+                    ? truncateForSummary(getHelpRequestText(event), 96)
+                    : event.type === 'MessageEvent'
                     ? truncateForSummary(content, 72)
                     : null
             );
@@ -5446,6 +5867,7 @@ <h1>Sisyphus</h1>
         async function refreshConversationId() {
             // Clear current conversation ID to force creation of a new one on next command
             const oldConversationId = currentConversationId;
+            clearPendingHelpRequest();
             currentConversationId = null;
             localStorage.removeItem('openbrowser_conversation_id');
             updateConversationIdDisplay();
@@ -5708,6 +6130,7 @@ <h1>Sisyphus</h1>
 
                 if (currentConversationId) {
                     const oldConversationId = currentConversationId;
+                    clearPendingHelpRequest();
                     currentConversationId = null;
                     sisyphusCurrentConversationId = null;
                     updateConversationIdDisplay();
diff --git a/pyproject.toml b/pyproject.toml
index 8399707..cc66190 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,8 +17,8 @@ dependencies = [
     "pillow>=10.0.0",
     "numpy>=1.24.0",
     "requests>=2.31.0",
-    "openhands-sdk @ git+https://github.com/softpudding/agent-sdk.git@4e6dab44f253b3c03c46f7469dc58aba97540c88#subdirectory=openhands-sdk",
-    "openhands-tools @ git+https://github.com/softpudding/agent-sdk.git@4e6dab44f253b3c03c46f7469dc58aba97540c88#subdirectory=openhands-tools",
+    "openhands-sdk @ git+https://github.com/softpudding/agent-sdk.git@e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5#subdirectory=openhands-sdk",
+    "openhands-tools @ git+https://github.com/softpudding/agent-sdk.git@e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5#subdirectory=openhands-tools",
     "litellm @ git+https://github.com/softpudding/litellm.git@bfba5e3889829067baeab3b12d38008360913771",
 ]
 
diff --git a/server/agent/api.py b/server/agent/api.py
index 5a67545..6e07fc6 100644
--- a/server/agent/api.py
+++ b/server/agent/api.py
@@ -16,6 +16,7 @@
 from server.api.sse import SSEEvent
 from server.agent.manager import agent_manager
 from server.agent.conversation import ConversationState
+from server.agent.user_help import build_completion_event_payload
 from server.core.session_manager import session_manager, SessionStatus
 
 logger = logging.getLogger(__name__)
@@ -280,10 +281,10 @@ def run_conversation():
             event_queue.put(
                 SSEEvent(
                     "complete",
-                    {
-                        "conversation_id": conversation_id,
-                        "message": "Conversation completed",
-                    },
+                    build_completion_event_payload(
+                        conversation_id,
+                        conv_state.conversation.state.events,
+                    ),
                 )
             )
             logger.debug(f"DEBUG: Complete event put into queue")
diff --git a/server/agent/manager.py b/server/agent/manager.py
index c05d140..b046cec 100644
--- a/server/agent/manager.py
+++ b/server/agent/manager.py
@@ -30,6 +30,8 @@
 from server.api.sse import SSEEvent
 from server.agent.visualizer import QueueVisualizer
 from server.agent.conversation import ConversationState
+from server.agent.user_help import PLEASE_HELP_ME_TOOL_NAME
+import server.agent.tools.help_tool  # noqa: F401
 from server.agent.tools.browser_executor import remove_browser_executor
 from server.core.ipc_router import IPCRouter
 from server.core.ipc_types import BrowserCommandMessage
@@ -97,11 +99,26 @@ def __init__(self, multi_process_mode: bool = False):
             Tool(name="javascript"),  # JavaScript execution fallback
         ]
         self.general_tools = [
+            Tool(name=PLEASE_HELP_ME_TOOL_NAME),
             Tool(name=TerminalTool.name),  # Terminal access
             Tool(name=FileEditorTool.name),  # File editing
             Tool(name=TaskTrackerTool.name),  # Task tracking
         ]
 
+    def _build_agent_context(self) -> AgentContext:
+        """Create the shared AgentContext used for OpenBrowser conversations."""
+        help_suffix = (
+            "When you need a human to complete a manual step, do not only say it in "
+            "assistant text. Call `please_help_me` with a concise, actionable "
+            "message, then stop and wait for the user's next message. Use this for "
+            "CAPTCHA, MFA, approvals, account-specific choices, or any step that "
+            "requires the real human user."
+        )
+        return AgentContext(
+            current_datetime=datetime.now(),
+            system_message_suffix=help_suffix,
+        )
+
     def _resolve_llm_settings(
         self,
         model: Optional[str] = None,
@@ -282,7 +299,7 @@ def _create_conversation_in_process(
             The conversation ID
         """
         # Create agent with tools
-        agent_context = AgentContext(current_datetime=datetime.now())
+        agent_context = self._build_agent_context()
         llm_instance = self._create_llm_from_config(model, base_url, model_alias)
         tools = self._get_tools_for_model(model, model_alias)
         agent = Agent(
@@ -508,7 +525,7 @@ def get_or_create_conversation(
 
         # Create new conversation with the given ID
         # Create agent with tools
-        agent_context = AgentContext(current_datetime=datetime.now())
+        agent_context = self._build_agent_context()
         llm_instance = self._create_llm_from_config(model, base_url, model_alias)
         tools = self._get_tools_for_model(model, model_alias)
         agent = Agent(
diff --git a/server/agent/tools/help_tool.py b/server/agent/tools/help_tool.py
new file mode 100644
index 0000000..1a87881
--- /dev/null
+++ b/server/agent/tools/help_tool.py
@@ -0,0 +1,111 @@
+"""Tool for explicit agent-to-human handoff requests."""
+
+from collections.abc import Sequence
+from typing import Any
+
+from pydantic import Field
+from rich.text import Text
+
+from openhands.sdk import Action, TextContent
+from openhands.sdk.conversation.state import ConversationExecutionStatus
+from openhands.sdk.tool import (
+    ToolAnnotations,
+    ToolDefinition,
+    ToolExecutor,
+    register_tool,
+)
+
+from server.agent.user_help import PLEASE_HELP_ME_TOOL_NAME
+from server.agent.tools.base import OpenBrowserObservation
+
+
+class PleaseHelpMeAction(Action):
+    """Action asking the human user to complete a manual step."""
+
+    message: str = Field(
+        description=(
+            "The exact message to show the human user, describing what they need "
+            "to do manually before you can continue."
+        )
+    )
+
+    @property
+    def visualize(self) -> Text:
+        content = Text()
+        content.append("Please help me:\n", style="bold yellow")
+        content.append(self.message)
+        return content
+
+
+class PleaseHelpMeObservation(OpenBrowserObservation):
+    """Observation shown after a help request is emitted."""
+
+    help_message: str = Field(
+        description="The help request shown to the user for this pending turn."
+    )
+    pending_user_decision: bool = Field(
+        default=True,
+        description="Whether the agent is waiting for the user's manual help.",
+    )
+
+    @property
+    def visualize(self) -> Text:
+        return Text("Pending user's decision")
+
+    @property
+    def to_llm_content(self) -> Sequence[TextContent]:
+        return [TextContent(text="Pending user's decision")]
+
+
+class PleaseHelpMeExecutor(
+    ToolExecutor[PleaseHelpMeAction, PleaseHelpMeObservation]
+):
+    """Executor that pauses the current turn and surfaces a user-help request."""
+
+    def __call__(
+        self, action: PleaseHelpMeAction, conversation: Any = None
+    ) -> PleaseHelpMeObservation:
+        if conversation is not None:
+            conversation.state.execution_status = ConversationExecutionStatus.FINISHED
+
+        return PleaseHelpMeObservation(
+            success=True,
+            message="Pending user's decision",
+            help_message=action.message,
+            pending_user_decision=True,
+        )
+
+
+class PleaseHelpMeTool(
+    ToolDefinition[PleaseHelpMeAction, PleaseHelpMeObservation]
+):
+    """Tool for requesting human intervention."""
+
+    name = PLEASE_HELP_ME_TOOL_NAME
+
+    @classmethod
+    def create(cls, conv_state=None, **params) -> Sequence["PleaseHelpMeTool"]:
+        if params:
+            raise ValueError("PleaseHelpMeTool doesn't accept parameters")
+
+        return [
+            cls(
+                description=(
+                    "Ask the human user to complete a manual step such as CAPTCHA, "
+                    "MFA, approval, or any browser action you cannot safely do."
+                ),
+                action_type=PleaseHelpMeAction,
+                observation_type=PleaseHelpMeObservation,
+                executor=PleaseHelpMeExecutor(),
+                annotations=ToolAnnotations(
+                    title=PLEASE_HELP_ME_TOOL_NAME,
+                    readOnlyHint=True,
+                    destructiveHint=False,
+                    idempotentHint=True,
+                    openWorldHint=False,
+                ),
+            )
+        ]
+
+
+register_tool(PLEASE_HELP_ME_TOOL_NAME, PleaseHelpMeTool.create)
diff --git a/server/agent/user_help.py b/server/agent/user_help.py
new file mode 100644
index 0000000..7e3c29d
--- /dev/null
+++ b/server/agent/user_help.py
@@ -0,0 +1,54 @@
+"""Helpers for explicit agent-to-human help requests."""
+
+from collections.abc import Sequence
+from typing import Any
+
+from openhands.sdk.event import ActionEvent, MessageEvent
+from openhands.sdk.event.base import Event
+
+
+PLEASE_HELP_ME_TOOL_NAME = "please_help_me"
+
+
+def get_pending_user_help_request(events: Sequence[Event]) -> dict[str, Any] | None:
+    """Return the latest unresolved help request for the current turn, if any."""
+    for event in reversed(events):
+        if isinstance(event, MessageEvent) and event.source == "user":
+            return None
+
+        if (
+            isinstance(event, ActionEvent)
+            and event.source == "agent"
+            and event.tool_name == PLEASE_HELP_ME_TOOL_NAME
+        ):
+            action = event.action
+            message = getattr(action, "message", None)
+            if not isinstance(message, str) or not message.strip():
+                message = "The agent needs your help before it can continue."
+
+            return {
+                "message": message,
+                "tool_call_id": event.tool_call_id,
+            }
+
+    return None
+
+
+def build_completion_event_payload(
+    conversation_id: str, events: Sequence[Event]
+) -> dict[str, Any]:
+    """Build the SSE completion payload for a finished run."""
+    help_request = get_pending_user_help_request(events)
+    if help_request is not None:
+        return {
+            "conversation_id": conversation_id,
+            "message": "Agent is waiting for user help",
+            "needs_user_help": True,
+            "help_request": help_request["message"],
+            "tool_call_id": help_request["tool_call_id"],
+        }
+
+    return {
+        "conversation_id": conversation_id,
+        "message": "Conversation completed",
+    }
diff --git a/server/agent/visualizer.py b/server/agent/visualizer.py
index a037fc9..d5671d6 100644
--- a/server/agent/visualizer.py
+++ b/server/agent/visualizer.py
@@ -19,6 +19,7 @@
 )
 
 from server.api.sse import SSEEvent
+from server.agent.user_help import PLEASE_HELP_ME_TOOL_NAME
 from server.core.session_manager import session_manager
 
 logger = logging.getLogger(__name__)
@@ -88,6 +89,7 @@ def on_event(self, event: Event) -> None:
                 "type": event_type,
                 "text": text_content,
                 "timestamp": getattr(event, "timestamp", None),
+                "source": getattr(event, "source", None),
             }
 
             # Handle different event types using isinstance for clarity
@@ -97,13 +99,22 @@ def on_event(self, event: Event) -> None:
 
             # Process specific event types (mutually exclusive)
             if isinstance(event, ActionEvent):
+                sse_data["tool_name"] = event.tool_name
+                sse_data["tool_call_id"] = event.tool_call_id
                 # ActionEvent has action attribute
                 if event.action:
                     sse_data["action"] = str(event.action)
                 if event.summary:
                     sse_data["summary"] = str(event.summary)
+                if event.tool_name == PLEASE_HELP_ME_TOOL_NAME and event.action:
+                    help_request = getattr(event.action, "message", None)
+                    if isinstance(help_request, str) and help_request.strip():
+                        sse_data["help_request"] = help_request
+                        sse_data["awaiting_user_help"] = True
 
             elif isinstance(event, ObservationEvent):
+                sse_data["tool_name"] = event.tool_name
+                sse_data["tool_call_id"] = event.tool_call_id
                 # ObservationEvent has observation attribute with possible image content
                 obs = event.observation
                 # Extract observation properties (same as original hasattr checks)
@@ -121,6 +132,11 @@ def on_event(self, event: Event) -> None:
                     sse_data["image"] = obs.image_url
                 elif hasattr(obs, "image") and obs.image:
                     sse_data["image"] = obs.image
+                if event.tool_name == PLEASE_HELP_ME_TOOL_NAME:
+                    sse_data["awaiting_user_help"] = True
+                    help_request = getattr(obs, "help_message", None)
+                    if isinstance(help_request, str) and help_request.strip():
+                        sse_data["help_request"] = help_request
 
             elif isinstance(event, MessageEvent):
                 # MessageEvent has llm_message with role information
diff --git a/server/core/browser_executor_bundle.py b/server/core/browser_executor_bundle.py
index 9391570..da1b78b 100644
--- a/server/core/browser_executor_bundle.py
+++ b/server/core/browser_executor_bundle.py
@@ -19,6 +19,7 @@
 
 from server.agent.manager import OpenBrowserAgentManager
 from server.api.sse import SSEEvent
+from server.agent.user_help import build_completion_event_payload
 from server.core.processor import CommandProcessor
 from server.models.commands import parse_command, CommandResponse
 
@@ -300,10 +301,10 @@ async def execute_agent_message(
             event_queue.put(
                 SSEEvent(
                     "complete",
-                    {
-                        "conversation_id": self.conversation_id,
-                        "message": "Conversation completed",
-                    },
+                    build_completion_event_payload(
+                        self.conversation_id,
+                        conv_state.conversation.state.events,
+                    ),
                 )
             )
 
diff --git a/server/tests/unit/test_agent_manager_process.py b/server/tests/unit/test_agent_manager_process.py
index 3af806f..6a0166e 100644
--- a/server/tests/unit/test_agent_manager_process.py
+++ b/server/tests/unit/test_agent_manager_process.py
@@ -65,9 +65,8 @@ def test_multi_process_mode_initializes_infrastructure(self) -> None:
 
     def test_large_models_keep_full_browser_toolset(self) -> None:
         """Large models should keep javascript plus general tools."""
-        manager = OpenBrowserAgentManager()
-
         with patch("server.agent.manager.llm_config_manager") as mock_llm_config:
+            manager = OpenBrowserAgentManager()
             mock_llm_config.reload_config.return_value = MagicMock()
             mock_llm_config.get_llm_config.return_value = MagicMock(
                 model="dashscope/qwen3.5-plus",
@@ -86,6 +85,7 @@ def test_large_models_keep_full_browser_toolset(self) -> None:
             "element_interaction",
             "dialog",
             "javascript",
+            "please_help_me",
             "terminal",
             "file_editor",
             "task_tracker",
@@ -93,9 +93,8 @@ def test_large_models_keep_full_browser_toolset(self) -> None:
 
     def test_small_models_drop_javascript_only(self) -> None:
         """Small models keep general tools but lose javascript."""
-        manager = OpenBrowserAgentManager()
-
         with patch("server.agent.manager.llm_config_manager") as mock_llm_config:
+            manager = OpenBrowserAgentManager()
             mock_llm_config.reload_config.return_value = MagicMock()
             mock_llm_config.get_llm_config.return_value = MagicMock(
                 model="dashscope/qwen3.5-flash",
@@ -113,6 +112,7 @@ def test_small_models_drop_javascript_only(self) -> None:
             "highlight",
             "element_interaction",
             "dialog",
+            "please_help_me",
             "terminal",
             "file_editor",
             "task_tracker",
@@ -120,9 +120,8 @@ def test_small_models_drop_javascript_only(self) -> None:
 
     def test_unknown_models_default_to_large_profile(self) -> None:
         """Unconfigured models should keep the large-model toolset."""
-        manager = OpenBrowserAgentManager()
-
         with patch("server.agent.manager.llm_config_manager") as mock_llm_config:
+            manager = OpenBrowserAgentManager()
             mock_llm_config.reload_config.return_value = MagicMock()
             mock_llm_config.get_llm_config.return_value = MagicMock(
                 model="dashscope/qwen3.5-plus",
@@ -135,6 +134,7 @@ def test_unknown_models_default_to_large_profile(self) -> None:
             ]
 
         assert "javascript" in tool_names
+        assert "please_help_me" in tool_names
 
     def test_system_prompt_kwargs_follow_large_model_profile(self) -> None:
         """Large models should advertise full browser freedom in system prompt."""
diff --git a/server/tests/unit/test_user_help.py b/server/tests/unit/test_user_help.py
new file mode 100644
index 0000000..c4c185a
--- /dev/null
+++ b/server/tests/unit/test_user_help.py
@@ -0,0 +1,80 @@
+"""Tests for explicit user-help requests."""
+
+import queue
+from unittest.mock import patch
+
+from openhands.sdk.event import ActionEvent, MessageEvent
+from openhands.sdk.llm import Message, MessageToolCall, TextContent
+
+from server.agent.user_help import build_completion_event_payload
+from server.agent.visualizer import QueueVisualizer
+from server.agent.tools.help_tool import PleaseHelpMeAction
+
+
+def _help_action_event(message: str) -> ActionEvent:
+    tool_call = MessageToolCall(
+        id="call-help-1",
+        name="please_help_me",
+        arguments='{"message": "ignored"}',
+        origin="completion",
+    )
+    return ActionEvent(
+        source="agent",
+        thought=[TextContent(text="Need help")],
+        action=PleaseHelpMeAction(message=message),
+        tool_name="please_help_me",
+        tool_call_id="call-help-1",
+        tool_call=tool_call,
+        llm_response_id="resp-help-1",
+    )
+
+
+def test_build_completion_event_payload_marks_pending_user_help() -> None:
+    payload = build_completion_event_payload(
+        "conv-help-1",
+        [_help_action_event("Please solve the CAPTCHA, then reply ok.")],
+    )
+
+    assert payload == {
+        "conversation_id": "conv-help-1",
+        "message": "Agent is waiting for user help",
+        "needs_user_help": True,
+        "help_request": "Please solve the CAPTCHA, then reply ok.",
+        "tool_call_id": "call-help-1",
+    }
+
+
+def test_build_completion_event_payload_clears_after_user_reply() -> None:
+    payload = build_completion_event_payload(
+        "conv-help-2",
+        [
+            _help_action_event("Please approve the login."),
+            MessageEvent(
+                source="user",
+                llm_message=Message(role="user", content=[TextContent(text="done")]),
+            ),
+        ],
+    )
+
+    assert payload == {
+        "conversation_id": "conv-help-2",
+        "message": "Conversation completed",
+    }
+
+
+def test_visualizer_surfaces_structured_help_request_fields() -> None:
+    event_queue: queue.Queue = queue.Queue()
+    visualizer = QueueVisualizer(event_queue=event_queue, conversation_id="conv-help-3")
+
+    with patch("server.agent.visualizer.session_manager.save_event", return_value=True):
+        visualizer.on_event(_help_action_event("Please complete MFA and reply done."))
+
+    sse_event = event_queue.get_nowait()
+
+    assert sse_event.event_type == "agent_event"
+    assert sse_event.data["type"] == "ActionEvent"
+    assert sse_event.data["tool_name"] == "please_help_me"
+    assert sse_event.data["tool_call_id"] == "call-help-1"
+    assert sse_event.data["help_request"] == "Please complete MFA and reply done."
+    assert sse_event.data["awaiting_user_help"] is True
+    assert sse_event.data["source"] == "agent"
diff --git a/uv.lock b/uv.lock
index 9eeff06..61dda0c 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1675,8 +1675,8 @@ requires-dist = [
     { name = "litellm", git = "https://github.com/softpudding/litellm.git?rev=bfba5e3889829067baeab3b12d38008360913771" },
     { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.7.0" },
     { name = "numpy", specifier = ">=1.24.0" },
-    { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=4e6dab44f253b3c03c46f7469dc58aba97540c88" },
-    { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=4e6dab44f253b3c03c46f7469dc58aba97540c88" },
+    { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5" },
+    { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5" },
     { name = "pillow", specifier = ">=10.0.0" },
     { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.0" },
     { name = "pydantic", specifier = ">=2.5.0" },
@@ -2221,7 +2221,7 @@ wheels = [
 [[package]]
 name = "openhands-sdk"
 version = "1.12.0"
-source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=4e6dab44f253b3c03c46f7469dc58aba97540c88#4e6dab44f253b3c03c46f7469dc58aba97540c88" }
+source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5#e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5" }
 dependencies = [
     { name = "agent-client-protocol" },
     { name = "deprecation" },
@@ -2241,7 +2241,7 @@ dependencies = [
 [[package]]
 name = "openhands-tools"
 version = "1.12.0"
-source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=4e6dab44f253b3c03c46f7469dc58aba97540c88#4e6dab44f253b3c03c46f7469dc58aba97540c88" }
+source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5#e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5" }
 dependencies = [
     { name = "bashlex" },
     { name = "binaryornot" },

From 66249e0384d2120982401d99e2ba0af020525f79 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Thu, 26 Mar 2026 13:30:43 +0800
Subject: [PATCH 3/7] fix(extension): keep semantic controls clickable in swipe
 regions

---
 .../src/__tests__/highlight-detection.test.ts |  6 +++-
 .../commands/highlight-detection.injected.js  | 31 +++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/extension/src/__tests__/highlight-detection.test.ts b/extension/src/__tests__/highlight-detection.test.ts
index 4c16134..eb0c6c6 100644
--- a/extension/src/__tests__/highlight-detection.test.ts
+++ b/extension/src/__tests__/highlight-detection.test.ts
@@ -213,15 +213,19 @@ describe('highlight-detection helpers', () => {
     expect(script).toContain('interactionHints');
   });
 
-  test('buildHighlightDetectionScript hides clickable when swipe or scroll affordances are present', () => {
+  test('buildHighlightDetectionScript keeps semantic controls clickable in swipe regions', () => {
     const script = buildHighlightDetectionScript({ elementType: 'any' });
     const start = script.indexOf('function toInteractiveElement');
     const end = script.indexOf('function countVisibleClickableCandidates', start);
     const toInteractiveElementSource = script.slice(start, end);
 
+    expect(script).toContain('function isSemanticControlElement');
     expect(toInteractiveElementSource).toContain(
       "candidate.type === 'clickable'",
     );
+    expect(toInteractiveElementSource).toContain(
+      '!isSemanticControlElement(candidate.element)',
+    );
     expect(toInteractiveElementSource).toContain(
       "interactionHints.includes('swipable')",
     );
diff --git a/extension/src/commands/highlight-detection.injected.js b/extension/src/commands/highlight-detection.injected.js
index 80d16f7..a499e71 100644
--- a/extension/src/commands/highlight-detection.injected.js
+++ b/extension/src/commands/highlight-detection.injected.js
@@ -450,6 +450,36 @@ function getSemanticClickableSignal(el) {
   return null;
 }
 
+function isSemanticControlElement(el) {
+  if (!(el instanceof HTMLElement)) {
+    return false;
+  }
+
+  const tag = el.tagName.toLowerCase();
+  const role = (el.getAttribute('role') || '').toLowerCase();
+
+  if (tag === 'button' || tag === 'summary') {
+    return true;
+  }
+
+  if (tag === 'a' && (el.getAttribute('href') || el.hasAttribute('target'))) {
+    return true;
+  }
+
+  if (tag === 'input') {
+    const inputType = (el.getAttribute('type') || 'text').toLowerCase();
+    if (
+      ['button', 'submit', 'reset', 'image', 'checkbox', 'radio'].includes(
+        inputType,
+      )
+    ) {
+      return true;
+    }
+  }
+
+  return POINTER_ROLE_SET.has(role);
+}
+
 function isMeaningfulPointerCandidate(el) {
   if (!(el instanceof HTMLElement)) {
     return false;
@@ -1439,6 +1469,7 @@ function toInteractiveElement(candidate) {
   const interactionHints = getInteractionHints(candidate.element);
   const displayType =
     candidate.type === 'clickable' &&
+    !isSemanticControlElement(candidate.element) &&
     (interactionHints.includes('swipable') ||
       isScrollableCandidate(candidate.element))
       ? 'scrollable'

From 69c8c38b9ec31fb0e69ce0d4b79562581bf4eee2 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Thu, 26 Mar 2026 21:20:14 +0800
Subject: [PATCH 4/7] Remove JavaScript browser tool and pin agent-sdk

---
 AGENTS.md                                     |  38 +-----
 pyproject.toml                                |   4 +-
 server/AGENTS.md                              |  16 +--
 server/agent/api.py                           |   4 +-
 server/agent/manager.py                       |  21 +---
 .../big_model/element_interaction_tool.j2     |   6 +-
 .../prompts/big_model/javascript_tool.j2      |  60 ----------
 .../prompts/small_model/javascript_tool.j2    |  60 ----------
 server/agent/tools/browser_executor.py        |  52 +-------
 server/agent/tools/javascript_tool.py         | 111 ------------------
 server/agent/tools/prompt_context.py          |   6 -
 server/agent/tools/toolset.py                 |  21 ++--
 .../integration/test_toolset_integration.py   |  33 ++++--
 .../tests/unit/test_agent_manager_process.py  |  19 ++-
 server/tests/unit/test_prompt_context.py      |   1 -
 server/tests/unit/test_toolset.py             |   9 +-
 uv.lock                                       |   8 +-
 17 files changed, 64 insertions(+), 405 deletions(-)
 delete mode 100644 server/agent/prompts/big_model/javascript_tool.j2
 delete mode 100644 server/agent/prompts/small_model/javascript_tool.j2
 delete mode 100644 server/agent/tools/javascript_tool.py

diff --git a/AGENTS.md b/AGENTS.md
index f7226fe..ad00f55 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -39,8 +39,7 @@ OpenBrowser/
 | Highlight tool | `server/agent/tools/highlight_tool.py` | HighlightTool for element discovery |
 | Element interaction | `server/agent/tools/element_interaction_tool.py` | ElementInteractionTool with 2PC flow |
 | Dialog tool | `server/agent/tools/dialog_tool.py` | DialogTool for dialog handling |
-| JavaScript tool | `server/agent/tools/javascript_tool.py` | JavaScriptTool for fallback execution |
-| ToolSet aggregator | `server/agent/tools/toolset.py` | OpenBrowserToolSet aggregates all 5 tools |
+| ToolSet aggregator | `server/agent/tools/toolset.py` | OpenBrowserToolSet aggregates all 4 tools |
 | Extension entry | `extension/src/background/index.ts` | Command handler, dialog processing |
 | Dialog manager | `extension/src/commands/dialog.ts` | CDP dialog events, cascading |
 | JavaScript execution | `extension/src/commands/javascript.ts` | CDP Runtime.evaluate, dialog race |
@@ -154,26 +153,15 @@ OpenBrowser uses Jinja2 templates for agent prompts, enabling dynamic content in
 ### Template Structure
 - **Location**: `server/agent/prompts/` directory
 - **Format**: `.j2` extension with Jinja2 syntax
-- **5 Tool Templates**: Each of the 5 focused tools has its own template:
+- **4 Tool Templates**: Each of the 4 focused tools has its own template:
   - `tab_tool.j2` - Tab management documentation
   - `highlight_tool.j2` - Element discovery with color coding
   - `element_interaction_tool.j2` - 2PC flow with orange confirmations
   - `dialog_tool.j2` - Dialog handling
-  - `javascript_tool.j2` - JavaScript fallback
-
-### Dynamic JavaScript Control
-The `javascript_execute` command can be disabled via environment variable:
-```bash
-export OPEN_BROWSER_DISABLE_JAVASCRIPT_EXECUTE=1
-```
-When disabled:
-- Template removes all `javascript_execute` references using `{% if not disable_javascript %}` conditionals
-- `OpenBrowserAction.type` description excludes `'javascript_execute'`
-- Command execution returns error if attempted
 
 ### Template Features
 - **Conditional rendering**: Use `{% if %}` blocks for configurable sections
-- **Variable injection**: Pass context variables like `disable_javascript` at render time
+- **Variable injection**: Pass context variables like model profile flags at render time
 - **Clean output**: `trim_blocks=True` and `lstrip_blocks=True` remove extra whitespace
 - **Caching**: Templates are cached after first load for performance
 
@@ -246,8 +234,8 @@ Elements are identified by a 6-character hash string:
 | `scroll_element` | Scroll by element ID | `{element_id: "m5k2p8", direction: "down"}` |
 | `keyboard_input` | Type into element | `{element_id: "j4n7q1", text: "hello"}` |
 
-### Tool Mapping (5-Tool Architecture)
-The visual interaction workflow is implemented across 5 focused tools:
+### Tool Mapping (4-Tool Architecture)
+The visual interaction workflow is implemented across 4 focused tools:
 
 | Tool | Commands | Purpose |
 |------|----------|---------|
@@ -255,25 +243,9 @@ The visual interaction workflow is implemented across 5 focused tools:
 | `highlight` | `highlight_elements` | Element discovery with blue overlays |
 | `element_interaction` | `click_element`, `confirm_click_element`, `hover_element`, `scroll_element`, `keyboard_input`, `confirm_keyboard_input`, `select_element` | Element interaction with 2PC only for click and keyboard input |
 | `dialog` | `handle_dialog` | Dialog handling (accept/dismiss) |
-| `javascript` | `javascript_execute` | JavaScript fallback execution |
 
 ## UNIQUE PATTERNS
 
-### JavaScript-First Automation (Fallback)
-For complex interactions not covered by visual commands:
-```javascript
-// Click by visible text (universal pattern)
-(() => {
-    const text = 'YOUR_TEXT';
-    const leaf = Array.from(document.querySelectorAll('*'))
-        .find(el => el.children.length === 0 && el.textContent.includes(text));
-    if (!leaf) return 'not found';
-    const target = leaf.closest('a, button, [role="button"]') || leaf;
-    target.click();
-    return 'clicked: ' + target.tagName;
-})()
-```
-
 ### Multi-Session Tab Isolation
 - `tab init <url>` creates managed session with tab group
 - `conversation_id` ties all commands to session
diff --git a/pyproject.toml b/pyproject.toml
index cc66190..b955a1b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,8 +17,8 @@ dependencies = [
     "pillow>=10.0.0",
     "numpy>=1.24.0",
     "requests>=2.31.0",
-    "openhands-sdk @ git+https://github.com/softpudding/agent-sdk.git@e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5#subdirectory=openhands-sdk",
-    "openhands-tools @ git+https://github.com/softpudding/agent-sdk.git@e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5#subdirectory=openhands-tools",
+    "openhands-sdk @ git+https://github.com/softpudding/agent-sdk.git@56c315b34a8d3f0b3d1c89cba57115dd859edbb0#subdirectory=openhands-sdk",
+    "openhands-tools @ git+https://github.com/softpudding/agent-sdk.git@56c315b34a8d3f0b3d1c89cba57115dd859edbb0#subdirectory=openhands-tools",
     "litellm @ git+https://github.com/softpudding/litellm.git@bfba5e3889829067baeab3b12d38008360913771",
 ]
 
diff --git a/server/AGENTS.md b/server/AGENTS.md
index 9f085fe..5e620ca 100644
--- a/server/AGENTS.md
+++ b/server/AGENTS.md
@@ -22,7 +22,6 @@ FastAPI backend handling REST API, WebSocket communication with Chrome extension
 | Highlight tool | `agent/tools/highlight_tool.py` | `HighlightTool`, `HighlightAction` |
 | Element interaction | `agent/tools/element_interaction_tool.py` | `ElementInteractionTool`, `ElementInteractionAction` |
 | Dialog tool | `agent/tools/dialog_tool.py` | `DialogTool`, `DialogHandleAction` |
-| JavaScript tool | `agent/tools/javascript_tool.py` | `JavaScriptTool`, `JavaScriptAction` |
 | ToolSet aggregator | `agent/tools/toolset.py` | `OpenBrowserToolSet` |
 ## STRUCTURE
 
@@ -209,9 +208,9 @@ After the screenshot refactor, the server layer no longer proactively triggers s
 
 **Best Practice:** Use `tab view` when you need a clean screenshot after navigation or JavaScript execution.
 
-## 5-TOOL ARCHITECTURE
+## 4-TOOL ARCHITECTURE
 
-OpenBrowser now uses 5 focused tools instead of a single monolithic tool:
+OpenBrowser now uses 4 focused tools instead of a single monolithic tool:
 
 ### 1. Tab Tool (`tab`)
 - **Purpose**: Browser tab management with session isolation
@@ -235,22 +234,17 @@ OpenBrowser now uses 5 focused tools instead of a single monolithic tool:
 - **Dialog types**: `alert` (auto-accepted), `confirm`, `prompt`, `beforeunload`
 - **Required**: Handle dialogs before continuing browser operations
 
-### 5. JavaScript Tool (`javascript`)
-- **Purpose**: Custom JavaScript execution as fallback mechanism
-- **When to use**: When visual commands fail (2-Strike Rule) or for complex DOM manipulation
-- **Guidelines**: Return JSON-serializable values, 30-second timeout
-
 ### Shared Architecture
-- **Shared executor**: All 5 tools share executor for 2PC state management
+- **Shared executor**: All 4 tools share executor for 2PC state management
 - **Conversation isolation**: Each conversation has isolated state
 - **Backward compatibility**: `OpenBrowserTool` still available with deprecation warning
-- **ToolSet**: `OpenBrowserToolSet` aggregates all 5 tools for registration
+- **ToolSet**: `OpenBrowserToolSet` aggregates all 4 tools for registration
 
 ## NOTES
 
 - WebSocket runs on port 8766, HTTP on 8765
 - `conversation_id` links all commands to session context
-- Agent uses OpenHands SDK with 5 focused tools (tab, highlight, element_interaction, dialog, javascript)
+- Agent uses OpenHands SDK with 4 focused tools (tab, highlight, element_interaction, dialog)
 - Dialogs block screenshot/JS until handled
 
 ## ANTI-PATTERNS
diff --git a/server/agent/api.py b/server/agent/api.py
index 6e07fc6..33c7d92 100644
--- a/server/agent/api.py
+++ b/server/agent/api.py
@@ -437,10 +437,8 @@ def initialize_agent():
         from .tools.highlight_tool import HighlightTool
         from .tools.element_interaction_tool import ElementInteractionTool
         from .tools.dialog_tool import DialogTool
-        from .tools.javascript_tool import JavaScriptTool
-
         logger.info(
-            "5 focused OpenBrowser tools registered: tab, highlight, element_interaction, dialog, javascript"
+            "4 focused OpenBrowser tools registered: tab, highlight, element_interaction, dialog"
         )
 
     except Exception as e:
diff --git a/server/agent/manager.py b/server/agent/manager.py
index b046cec..eff2b1d 100644
--- a/server/agent/manager.py
+++ b/server/agent/manager.py
@@ -96,7 +96,6 @@ def __init__(self, multi_process_mode: bool = False):
             Tool(name="highlight"),  # Element discovery with visual overlays
             Tool(name="element_interaction"),  # Click/input with 2PC, others direct
             Tool(name="dialog"),  # Browser dialog handling
-            Tool(name="javascript"),  # JavaScript execution fallback
         ]
         self.general_tools = [
             Tool(name=PLEASE_HELP_ME_TOOL_NAME),
@@ -135,22 +134,9 @@ def _resolve_llm_settings(
     def _get_tools_for_model(
         self, model: Optional[str] = None, model_alias: Optional[str] = None
     ) -> list[Tool]:
-        """Return the tool list for a model tier.
-
-        Small models keep the general agentic tools but use a narrower browser
-        toolset to reduce risky action branching.
-        """
-        resolved_model, _, _ = self._resolve_llm_settings(
-            model=model, model_alias=model_alias
-        )
-
-        browser_tools = list(self.browser_tools)
-        if is_small_model(resolved_model):
-            browser_tools = [
-                tool for tool in browser_tools if tool.name != "javascript"
-            ]
-
-        return browser_tools + list(self.general_tools)
+        """Return the tool list for a model tier."""
+        self._resolve_llm_settings(model=model, model_alias=model_alias)
+        return list(self.browser_tools) + list(self.general_tools)
 
     def _create_llm_from_config(
         self,
@@ -196,7 +182,6 @@ def _get_system_prompt_kwargs(
         return {
             "model_profile": get_model_profile(resolved_model),
             "small_model": small_model,
-            "javascript_available": not small_model,
         }
 
     def _build_session_metadata(
diff --git a/server/agent/prompts/big_model/element_interaction_tool.j2 b/server/agent/prompts/big_model/element_interaction_tool.j2
index 4a2e901..92ecf66 100644
--- a/server/agent/prompts/big_model/element_interaction_tool.j2
+++ b/server/agent/prompts/big_model/element_interaction_tool.j2
@@ -116,11 +116,7 @@ Use this to:
 - If highlight shows a target `scrollable` region, use `scroll` on that region rather than clicking nearby paging controls
 - If highlight shows the `swipable` hint, treat the region as a carousel/slider and use `swipe` over `scroll`
 - Do not compensate for a missing highlight target by guessing `next`, `prev`, `close`, or similar keywords
-{% if not disable_javascript %}
-- If scroll/swipe fails twice and the task still requires progress, use the JavaScript tool only as a last-resort fallback for this system
-{% else %}
-- If scroll/swipe fails repeatedly, re-orient around currently visible items because JavaScript fallback is disabled in this environment
-{% endif %}
+- If scroll/swipe fails repeatedly, re-orient around currently visible items and keep recovery inside the visual browser tools
 
 ### swipe
 Swipe a carousel / gallery / swiper region by its visual ID.
diff --git a/server/agent/prompts/big_model/javascript_tool.j2 b/server/agent/prompts/big_model/javascript_tool.j2
deleted file mode 100644
index 3431ab0..0000000
--- a/server/agent/prompts/big_model/javascript_tool.j2
+++ /dev/null
@@ -1,60 +0,0 @@
-# JavaScript Tool
-
-Custom JavaScript execution for system-specific fallback only.
-
-## Overview
-
-The JavaScript Tool executes arbitrary JavaScript code in the browser context. Use this only as a **fallback** when OpenBrowser's visual tools cannot accomplish the task. The prompt should teach system-specific constraints, not generic JavaScript techniques you already know.
-
-## When to Use JavaScript
-
-**Use visual commands first**:
-1. `highlight` to identify the target
-2. If the page state is unchanged and the target is still missing, your first reaction should be to continue `element_type: "any"` pagination
-3. `element_interaction` to click / hover / scroll / swipe / input / select
-4. `dialog` immediately if JavaScript or interaction opens a blocking dialog
-
-**Switch to JavaScript only when**:
-- Complex DOM manipulation needed
-- Visual commands fail twice (2-Strike Rule)
-- Special browser APIs or framework APIs are required
-- You need to inspect page state that is not exposed by visual tools
-- You have already exhausted the reliable highlight-driven discovery flow rather than abandoning it after the first page
-
-## Commands
-
-### javascript
-Execute arbitrary JavaScript.
-
-```json
-{ "script": "(() => { return document.title; })()" }
-```
-
-**Parameters**:
-- `script`: JavaScript code to execute
-
-**System-Specific Guidelines**:
-- **30-second timeout**: Scripts timing out will return error
-- **Return JSON-serializable values only**: No DOM nodes, functions, or circular structures
-- **Use IIFE**: Wrap in `(() => { ... })()` for explicit return statements
-- **Console output captured**: `console.log()` output is included in response
-- **Dialog detection**: Dialogs triggered during execution are automatically detected and require the dialog tool
-- **Prefer framework-safe events**: If a page action must be simulated, dispatch the event sequence the app expects rather than relying on a brittle shortcut
-- **Rebuild discovery after state-changing JS**: JavaScript does not return a screenshot, so after any page-changing script the next discovery step must be `highlight` with `element_type: "any"` before choosing the next element
-- **Use JavaScript to complement, not replace, the visual workflow**: Prefer reading state, calling a known page API, or performing a narrow fallback step rather than re-implementing the whole task in JavaScript
-- **Do not use JavaScript just because highlight page 1 did not show the target**: On the same unchanged page state, continue `highlight` pagination with `element_type: "any"` first
-
-### Minimal example
-```javascript
-(() => {
-    return { title: document.title, url: location.href };
-})()
-```
-
-## Error Handling
-
-- **Syntax error**: Returns error with stack trace
-- **Timeout**: Returns error after 30 seconds
-- **Dialog opened**: Returns dialog info, requiring the dialog tool
-- **Non-serializable return**: Returns error
-- **Security violation**: Returns error (cross-origin access, etc.)
diff --git a/server/agent/prompts/small_model/javascript_tool.j2 b/server/agent/prompts/small_model/javascript_tool.j2
deleted file mode 100644
index 3431ab0..0000000
--- a/server/agent/prompts/small_model/javascript_tool.j2
+++ /dev/null
@@ -1,60 +0,0 @@
-# JavaScript Tool
-
-Custom JavaScript execution for system-specific fallback only.
-
-## Overview
-
-The JavaScript Tool executes arbitrary JavaScript code in the browser context. Use this only as a **fallback** when OpenBrowser's visual tools cannot accomplish the task. The prompt should teach system-specific constraints, not generic JavaScript techniques you already know.
-
-## When to Use JavaScript
-
-**Use visual commands first**:
-1. `highlight` to identify the target
-2. If the page state is unchanged and the target is still missing, your first reaction should be to continue `element_type: "any"` pagination
-3. `element_interaction` to click / hover / scroll / swipe / input / select
-4. `dialog` immediately if JavaScript or interaction opens a blocking dialog
-
-**Switch to JavaScript only when**:
-- Complex DOM manipulation needed
-- Visual commands fail twice (2-Strike Rule)
-- Special browser APIs or framework APIs are required
-- You need to inspect page state that is not exposed by visual tools
-- You have already exhausted the reliable highlight-driven discovery flow rather than abandoning it after the first page
-
-## Commands
-
-### javascript
-Execute arbitrary JavaScript.
-
-```json
-{ "script": "(() => { return document.title; })()" }
-```
-
-**Parameters**:
-- `script`: JavaScript code to execute
-
-**System-Specific Guidelines**:
-- **30-second timeout**: Scripts timing out will return error
-- **Return JSON-serializable values only**: No DOM nodes, functions, or circular structures
-- **Use IIFE**: Wrap in `(() => { ... })()` for explicit return statements
-- **Console output captured**: `console.log()` output is included in response
-- **Dialog detection**: Dialogs triggered during execution are automatically detected and require the dialog tool
-- **Prefer framework-safe events**: If a page action must be simulated, dispatch the event sequence the app expects rather than relying on a brittle shortcut
-- **Rebuild discovery after state-changing JS**: JavaScript does not return a screenshot, so after any page-changing script the next discovery step must be `highlight` with `element_type: "any"` before choosing the next element
-- **Use JavaScript to complement, not replace, the visual workflow**: Prefer reading state, calling a known page API, or performing a narrow fallback step rather than re-implementing the whole task in JavaScript
-- **Do not use JavaScript just because highlight page 1 did not show the target**: On the same unchanged page state, continue `highlight` pagination with `element_type: "any"` first
-
-### Minimal example
-```javascript
-(() => {
-    return { title: document.title, url: location.href };
-})()
-```
-
-## Error Handling
-
-- **Syntax error**: Returns error with stack trace
-- **Timeout**: Returns error after 30 seconds
-- **Dialog opened**: Returns dialog info, requiring the dialog tool
-- **Non-serializable return**: Returns error
-- **Security violation**: Returns error (cross-origin access, etc.)
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index dac0f4b..1927714 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -1,12 +1,11 @@
 """
-BrowserExecutor - Unified executor for handling all 5 OpenBrowser tool actions.
+BrowserExecutor - Unified executor for handling all 4 OpenBrowser tool actions.
 
-This executor can handle actions from all 5 focused tools:
+This executor can handle actions from all 4 focused tools:
 - TabAction (from tab_tool.py)
 - BaseHighlightAction (from highlight_tool.py)
 - ElementInteractionAction (from element_interaction_tool.py)
 - DialogHandleAction (from dialog_tool.py)
-- JavaScriptAction (from javascript_tool.py)
 
 All actions inherit from OpenBrowserAction and share common conversation_id.
 This executor provides consistent command execution and pending confirmation state.
@@ -24,7 +23,6 @@
 from server.models.commands import (
     TabCommand,
     GetTabsCommand,
-    JavascriptExecuteCommand,
     HandleDialogCommand,
     DialogAction,
     TabAction as TabActionEnum,
@@ -45,10 +43,6 @@
 from server.agent.tools.highlight_tool import BaseHighlightAction
 from server.agent.tools.element_interaction_tool import ElementInteractionAction
 from server.agent.tools.dialog_tool import DialogHandleAction
-from server.agent.tools.javascript_tool import (
-    JavaScriptAction,
-    DISABLE_JAVASCRIPT_EXECUTE,
-)
 
 from server.agent.tools.base import OpenBrowserAction, OpenBrowserObservation
 
@@ -85,7 +79,7 @@ def remove_browser_executor(conversation_id: str) -> None:
 
 
 class BrowserExecutor(ToolExecutor[OpenBrowserAction, OpenBrowserObservation]):
-    """Unified executor for all 5 OpenBrowser tool actions.
+    """Unified executor for all 4 OpenBrowser tool actions.
 
     This executor can handle any action that inherits from OpenBrowserAction,
     providing consistent pending confirmation state and command execution
@@ -138,7 +132,7 @@ def _execute_action_sync(self, action: Any) -> OpenBrowserObservation:
         Args:
             action: Any action that inherits from OpenBrowserAction
                    (TabAction, BaseHighlightAction, ElementInteractionAction,
-                   DialogHandleAction, JavaScriptAction)
+                   DialogHandleAction)
 
         Returns:
             OpenBrowserObservation with results of the operation
@@ -174,8 +168,6 @@ def _execute_action_sync(self, action: Any) -> OpenBrowserObservation:
                 return self._execute_element_interaction_action(action)
             elif isinstance(action, DialogHandleAction):
                 return self._execute_dialog_action(action)
-            elif isinstance(action, JavaScriptAction):
-                return self._execute_javascript_action(action)
             else:
                 raise ValueError(f"Unknown action type: {type(action).__name__}")
 
@@ -620,42 +612,6 @@ def _execute_dialog_action(
         message = f"Dialog handled: {dialog_action_str}"
         return self._build_observation_from_result(result_dict, message)
 
-    def _execute_javascript_action(
-        self, action: JavaScriptAction
-    ) -> OpenBrowserObservation:
-        """Execute a JavaScript execution action."""
-        logger.debug(
-            f"DEBUG: _execute_javascript_action called with script length={len(action.script)}"
-        )
-
-        # Check if javascript_execute is disabled via environment variable
-        if DISABLE_JAVASCRIPT_EXECUTE:
-            return OpenBrowserObservation(
-                success=False,
-                error="javascript_execute command is disabled via OPEN_BROWSER_DISABLE_JAVASCRIPT_EXECUTE environment variable",
-                tabs=[],
-                screenshot_data_url=None,
-            )
-
-        # Validate required parameters
-        if not action.script:
-            raise ValueError("javascript requires script parameter")
-
-        command = JavascriptExecuteCommand(
-            script=action.script, conversation_id=self.conversation_id
-        )
-        result_dict = self._execute_command_sync(command)
-
-        # Truncate long scripts for message
-        script = action.script
-        if len(script) > 50:
-            message = f"Executed JavaScript: '{script[:50]}...'"
-        else:
-            message = f"Executed JavaScript: '{script}'"
-
-        observation = self._build_observation_from_result(result_dict, message)
-        return observation
-
     # ========== 2PC State Management Methods ==========
 
     def _clear_pending_confirmation(self):
diff --git a/server/agent/tools/javascript_tool.py b/server/agent/tools/javascript_tool.py
deleted file mode 100644
index d094116..0000000
--- a/server/agent/tools/javascript_tool.py
+++ /dev/null
@@ -1,111 +0,0 @@
-"""
-JavaScriptTool - AI tool for executing JavaScript in the browser.
-
-This tool serves as a fallback mechanism for complex browser interactions
-that are not covered by visual commands (highlight, click, hover, scroll, keyboard_input).
-
-Key characteristics:
-- Results must be JSON-serializable (no DOM nodes)
-- For React/Vue apps, dispatch full event sequence (not just .click())
-- Call `screenshot` command after if visual feedback is needed
-- Can be disabled via OPEN_BROWSER_DISABLE_JAVASCRIPT_EXECUTE environment variable
-"""
-
-import os
-import logging
-from collections.abc import Sequence
-
-from pydantic import Field
-from openhands.sdk.tool import (
-    ToolDefinition,
-    ToolAnnotations,
-    register_tool,
-)
-
-from server.agent.tools.base import OpenBrowserAction, OpenBrowserObservation
-from server.agent.tools.prompt_loader import render_tool_prompt
-
-logger = logging.getLogger(__name__)
-
-# Environment variable to disable javascript tool
-DISABLE_JAVASCRIPT_EXECUTE = os.getenv(
-    "OPEN_BROWSER_DISABLE_JAVASCRIPT_EXECUTE", ""
-).lower() in ("1", "true", "yes")
-
-def get_javascript_tool_description(conv_state=None) -> str:
-    """Get the JavaScriptTool description, rendered from Jinja2 template."""
-    return render_tool_prompt(
-        "javascript_tool.j2",
-        conv_state,
-        disable_javascript=DISABLE_JAVASCRIPT_EXECUTE
-    )
-
-
-class JavaScriptAction(OpenBrowserAction):
-    """Action for executing JavaScript code in the browser."""
-
-    script: str = Field(
-        description="JavaScript code to execute. Must return a JSON-serializable value."
-    )
-
-
-class JavaScriptTool(ToolDefinition[JavaScriptAction, OpenBrowserObservation]):
-    """Tool for executing JavaScript in the browser.
-
-    This tool provides a dedicated interface for JavaScript execution,
-    following the same pattern as other tools in the system.
-    """
-
-    name = "javascript"
-
-    @classmethod
-    def create(cls, conv_state, terminal_executor=None) -> Sequence["JavaScriptTool"]:
-        """Create JavaScriptTool instance.
-
-        Args:
-            conv_state: Conversation state for session isolation.
-            terminal_executor: Optional BrowserExecutor instance for handling commands.
-                             If None, creates a new BrowserExecutor.
-
-        Returns:
-            List containing the JavaScriptTool instance
-        """
-        # Check if disabled via environment variable
-        if DISABLE_JAVASCRIPT_EXECUTE:
-            logger.info(
-                "JavaScriptTool is disabled via OPEN_BROWSER_DISABLE_JAVASCRIPT_EXECUTE"
-            )
-            return []
-
-        # Use provided executor or get shared executor for conversation
-        if terminal_executor is not None:
-            executor = terminal_executor
-        else:
-            # Try to get conversation ID from conv_state to share executor across tools
-            # conv_state: openhands-sdk ConversationState
-            conversation_id = getattr(conv_state, "id", None)
-
-            # Get shared executor for this conversation (or create new if no conversation_id)
-            from server.agent.tools.browser_executor import get_browser_executor
-
-            executor = get_browser_executor(conversation_id)
-
-        return [
-            cls(
-                description=get_javascript_tool_description(conv_state),
-                action_type=JavaScriptAction,
-                observation_type=OpenBrowserObservation,
-                annotations=ToolAnnotations(
-                    title="JavaScript",
-                    readOnlyHint=False,
-                    destructiveHint=False,
-                    idempotentHint=False,
-                    openWorldHint=True,
-                ),
-                executor=executor,
-            )
-        ]
-
-
-# Register the tool
-register_tool("javascript", JavaScriptTool.create)
diff --git a/server/agent/tools/prompt_context.py b/server/agent/tools/prompt_context.py
index 0609bec..dd68b65 100644
--- a/server/agent/tools/prompt_context.py
+++ b/server/agent/tools/prompt_context.py
@@ -2,7 +2,6 @@
 
 from __future__ import annotations
 
-import os
 from typing import Any
 
 from server.core.llm_config import llm_config_manager
@@ -41,13 +40,8 @@ def get_prompt_render_context(conv_state: Any = None) -> dict[str, Any]:
                         model_name = None
 
     profile = get_model_profile(model_name)
-    disable_javascript = os.getenv(
-        "OPEN_BROWSER_DISABLE_JAVASCRIPT_EXECUTE", ""
-    ).lower() in ("1", "true", "yes")
-
     return {
         "model_name": model_name,
         "model_profile": profile,
         "small_model": is_small_model(model_name),
-        "disable_javascript": disable_javascript,
     }
diff --git a/server/agent/tools/toolset.py b/server/agent/tools/toolset.py
index 81bea27..0b8694e 100644
--- a/server/agent/tools/toolset.py
+++ b/server/agent/tools/toolset.py
@@ -1,8 +1,8 @@
 """
-OpenBrowserToolSet - Aggregates all 5 OpenBrowser tools into a unified toolset.
+OpenBrowserToolSet - Aggregates all 4 OpenBrowser tools into a unified toolset.
 
 This module provides the OpenBrowserToolSet class that creates and manages
-all 5 focused OpenBrowser tools with a shared executor for state consistency.
+all 4 focused OpenBrowser tools with a shared executor for state consistency.
 
 Following the OpenHands SDK ToolSet pattern from browser_use.definition.BrowserToolSet.
 """
@@ -16,12 +16,11 @@
 from server.agent.tools.dialog_tool import DialogTool
 from server.agent.tools.element_interaction_tool import ElementInteractionTool
 from server.agent.tools.highlight_tool import HighlightTool
-from server.agent.tools.javascript_tool import JavaScriptTool
 from server.agent.tools.tab_tool import TabTool
 
 
 class OpenBrowserToolSet(ToolDefinition):
-    """Aggregates all 5 OpenBrowser tools with shared executor.
+    """Aggregates all 4 OpenBrowser tools with shared executor.
 
     This toolset provides a unified interface for registering all OpenBrowser
     tools while ensuring they share the same executor instance. This is critical
@@ -33,14 +32,13 @@ class OpenBrowserToolSet(ToolDefinition):
         - HighlightTool: Element discovery with collision-free visual overlays
         - ElementInteractionTool: Click/input with 2PC, hover/scroll/swipe/select direct
         - DialogTool: Browser dialog (alert/confirm/prompt/beforeunload) handling
-        - JavaScriptTool: Custom JavaScript execution for complex interactions
 
     Example:
         >>> tools = OpenBrowserToolSet.create(None)
         >>> len(tools)
-        5
+        4
         >>> [tool.name for tool in tools]
-        ['tab', 'highlight', 'element_interaction', 'dialog', 'javascript']
+        ['tab', 'highlight', 'element_interaction', 'dialog']
     """
 
     @classmethod
@@ -49,7 +47,7 @@ def create(
         conv_state,
         executor: Optional[BrowserExecutor] = None,
     ) -> Sequence[ToolDefinition]:
-        """Create all 5 OpenBrowser tools with shared executor.
+        """Create all 4 OpenBrowser tools with shared executor.
 
         Args:
             executor: Optional BrowserExecutor instance for handling commands.
@@ -57,9 +55,9 @@ def create(
                      registration in production use).
 
         Returns:
-            List of 5 ToolDefinition instances (TabTool, HighlightTool,
-            ElementInteractionTool, DialogTool, JavaScriptTool), all sharing
-            the same executor instance.
+            List of 4 ToolDefinition instances (TabTool, HighlightTool,
+            ElementInteractionTool, DialogTool), all sharing the same executor
+            instance.
 
         Note:
             The executor must be shared across all tools to enable:
@@ -80,7 +78,6 @@ def create(
             HighlightTool,
             ElementInteractionTool,
             DialogTool,
-            JavaScriptTool,
         ]:
             tools.extend(tool_class.create(conv_state, executor))
 
diff --git a/server/tests/integration/test_toolset_integration.py b/server/tests/integration/test_toolset_integration.py
index b45a028..08f5726 100644
--- a/server/tests/integration/test_toolset_integration.py
+++ b/server/tests/integration/test_toolset_integration.py
@@ -48,7 +48,6 @@ class MockTaskTrackerTool:
 tools_module.__path__ = [str(tools_dir)]
 sys.modules.setdefault("server.agent.tools", tools_module)
 
-from server.agent.tools import javascript_tool
 from server.agent.tools.toolset import OpenBrowserToolSet
 
 
@@ -60,6 +59,7 @@ def test_pending_confirmation_is_shared_across_tool_instances(self) -> None:
 
         shared_executor._set_pending_confirmation(
             element_id="abc123",
+            highlight_snapshot_id=17,
             action_type="click",
             full_html="<button>Delete</button>",
         )
@@ -72,29 +72,36 @@ def test_pending_confirmation_is_shared_across_tool_instances(self) -> None:
             tools["tab"].executor._get_pending_confirmation()["action_type"] == "click"
         )
 
-    def test_confirmed_element_cache_is_isolated_by_conversation(self) -> None:
+    def test_pending_confirmation_is_isolated_by_conversation(self) -> None:
         tools = {tool.name: tool for tool in OpenBrowserToolSet.create(None)}
         executor = tools["element_interaction"].executor
 
         executor.conversation_id = "conv-1"
-        executor._add_confirmed_element("elem-1")
+        executor._set_pending_confirmation(
+            element_id="elem-1",
+            highlight_snapshot_id=11,
+            action_type="click",
+            full_html="<button>First</button>",
+        )
 
         executor.conversation_id = "conv-2"
 
-        assert executor._is_element_confirmed("elem-1") is False
+        assert executor._get_pending_confirmation() is None
 
-        executor._add_confirmed_element("elem-2")
-        assert executor._is_element_confirmed("elem-2") is True
+        executor._set_pending_confirmation(
+            element_id="elem-2",
+            highlight_snapshot_id=22,
+            action_type="keyboard_input",
+            full_html='<input value="second" />',
+        )
+        assert executor._get_pending_confirmation()["element_id"] == "elem-2"
+        assert executor._get_pending_confirmation()["highlight_snapshot_id"] == 22
 
         executor.conversation_id = "conv-1"
-        assert executor._is_element_confirmed("elem-1") is True
-        assert executor._is_element_confirmed("elem-2") is False
-
-    def test_disabling_javascript_keeps_core_workflow_tools_available(
-        self, monkeypatch
-    ) -> None:
-        monkeypatch.setattr(javascript_tool, "DISABLE_JAVASCRIPT_EXECUTE", True)
+        assert executor._get_pending_confirmation()["element_id"] == "elem-1"
+        assert executor._get_pending_confirmation()["highlight_snapshot_id"] == 11
 
+    def test_toolset_keeps_core_workflow_tools_available(self) -> None:
         tools = OpenBrowserToolSet.create(None)
 
         assert [tool.name for tool in tools] == [
diff --git a/server/tests/unit/test_agent_manager_process.py b/server/tests/unit/test_agent_manager_process.py
index 6a0166e..d031e78 100644
--- a/server/tests/unit/test_agent_manager_process.py
+++ b/server/tests/unit/test_agent_manager_process.py
@@ -63,8 +63,8 @@ def test_multi_process_mode_initializes_infrastructure(self) -> None:
         assert manager._process_manager is not None
         assert manager._ipc_router is not None
 
-    def test_large_models_keep_full_browser_toolset(self) -> None:
-        """Large models should keep javascript plus general tools."""
+    def test_large_models_keep_core_browser_toolset(self) -> None:
+        """Large models should expose the four browser tools plus general tools."""
         with patch("server.agent.manager.llm_config_manager") as mock_llm_config:
             manager = OpenBrowserAgentManager()
             mock_llm_config.reload_config.return_value = MagicMock()
@@ -84,15 +84,14 @@ def test_large_models_keep_full_browser_toolset(self) -> None:
             "highlight",
             "element_interaction",
             "dialog",
-            "javascript",
             "please_help_me",
             "terminal",
             "file_editor",
             "task_tracker",
         ]
 
-    def test_small_models_drop_javascript_only(self) -> None:
-        """Small models keep general tools but lose javascript."""
+    def test_small_models_keep_the_same_browser_toolset(self) -> None:
+        """Small models should use the same four browser tools."""
         with patch("server.agent.manager.llm_config_manager") as mock_llm_config:
             manager = OpenBrowserAgentManager()
             mock_llm_config.reload_config.return_value = MagicMock()
@@ -119,7 +118,7 @@ def test_small_models_drop_javascript_only(self) -> None:
         ]
 
     def test_unknown_models_default_to_large_profile(self) -> None:
-        """Unconfigured models should keep the large-model toolset."""
+        """Unconfigured models should keep the standard browser toolset."""
         with patch("server.agent.manager.llm_config_manager") as mock_llm_config:
             manager = OpenBrowserAgentManager()
             mock_llm_config.reload_config.return_value = MagicMock()
@@ -133,11 +132,11 @@ def test_unknown_models_default_to_large_profile(self) -> None:
                 tool.name for tool in manager._get_tools_for_model("some/new-model")
             ]
 
-        assert "javascript" in tool_names
+        assert "dialog" in tool_names
         assert "please_help_me" in tool_names
 
     def test_system_prompt_kwargs_follow_large_model_profile(self) -> None:
-        """Large models should advertise full browser freedom in system prompt."""
+        """Large models should get the large-profile system prompt kwargs."""
         manager = OpenBrowserAgentManager()
 
         with patch("server.agent.manager.llm_config_manager") as mock_llm_config:
@@ -153,11 +152,10 @@ def test_system_prompt_kwargs_follow_large_model_profile(self) -> None:
         assert kwargs == {
             "model_profile": "large",
             "small_model": False,
-            "javascript_available": True,
         }
 
     def test_system_prompt_kwargs_follow_small_model_profile(self) -> None:
-        """Small models should get the constrained system prompt variant."""
+        """Small models should get the small-profile system prompt kwargs."""
         manager = OpenBrowserAgentManager()
 
         with patch("server.agent.manager.llm_config_manager") as mock_llm_config:
@@ -173,7 +171,6 @@ def test_system_prompt_kwargs_follow_small_model_profile(self) -> None:
         assert kwargs == {
             "model_profile": "small",
             "small_model": True,
-            "javascript_available": False,
         }
 
 
diff --git a/server/tests/unit/test_prompt_context.py b/server/tests/unit/test_prompt_context.py
index 81580c8..f996258 100644
--- a/server/tests/unit/test_prompt_context.py
+++ b/server/tests/unit/test_prompt_context.py
@@ -31,7 +31,6 @@ def test_prompt_context_defaults_to_large_profile_without_conversation() -> None
         "model_name": None,
         "model_profile": "large",
         "small_model": False,
-        "disable_javascript": False,
     }
 
 
diff --git a/server/tests/unit/test_toolset.py b/server/tests/unit/test_toolset.py
index d236a59..40f8039 100644
--- a/server/tests/unit/test_toolset.py
+++ b/server/tests/unit/test_toolset.py
@@ -50,7 +50,6 @@ class MockTaskTrackerTool:
 sys.modules["openhands.tools.preset"] = types.ModuleType("openhands.tools.preset")
 
 from server.agent.tools.browser_executor import BrowserExecutor
-from server.agent.tools import javascript_tool
 from server.agent.tools.toolset import OpenBrowserToolSet
 
 
@@ -60,7 +59,7 @@ def test_create_shares_one_executor_within_single_toolset(self) -> None:
 
         executors = {tool.executor for tool in tools}
 
-        assert len(tools) == 5
+        assert len(tools) == 4
         assert len(executors) == 1
         assert isinstance(next(iter(executors)), BrowserExecutor)
 
@@ -80,13 +79,10 @@ def test_provided_executor_is_reused_for_all_tools_in_workflow_order(self) -> No
             "highlight",
             "element_interaction",
             "dialog",
-            "javascript",
         ]
         assert all(tool.executor is executor for tool in tools)
 
-    def test_disabling_javascript_removes_only_fallback_tool(self, monkeypatch) -> None:
-        monkeypatch.setattr(javascript_tool, "DISABLE_JAVASCRIPT_EXECUTE", True)
-
+    def test_create_exposes_only_core_browser_tools(self) -> None:
         tools = OpenBrowserToolSet.create(None)
 
         assert [tool.name for tool in tools] == [
@@ -103,4 +99,3 @@ def test_capability_hints_match_actual_tool_behavior(self) -> None:
         assert tools["highlight"].annotations.readOnlyHint is True
         assert tools["element_interaction"].annotations.destructiveHint is True
         assert tools["dialog"].annotations.readOnlyHint is False
-        assert tools["javascript"].annotations.openWorldHint is True
diff --git a/uv.lock b/uv.lock
index 61dda0c..88ad2be 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1675,8 +1675,8 @@ requires-dist = [
     { name = "litellm", git = "https://github.com/softpudding/litellm.git?rev=bfba5e3889829067baeab3b12d38008360913771" },
     { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.7.0" },
     { name = "numpy", specifier = ">=1.24.0" },
-    { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5" },
-    { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5" },
+    { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=56c315b34a8d3f0b3d1c89cba57115dd859edbb0" },
+    { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=56c315b34a8d3f0b3d1c89cba57115dd859edbb0" },
     { name = "pillow", specifier = ">=10.0.0" },
     { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.0" },
     { name = "pydantic", specifier = ">=2.5.0" },
@@ -2221,7 +2221,7 @@ wheels = [
 [[package]]
 name = "openhands-sdk"
 version = "1.12.0"
-source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5#e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5" }
+source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=56c315b34a8d3f0b3d1c89cba57115dd859edbb0#56c315b34a8d3f0b3d1c89cba57115dd859edbb0" }
 dependencies = [
     { name = "agent-client-protocol" },
     { name = "deprecation" },
@@ -2241,7 +2241,7 @@ dependencies = [
 [[package]]
 name = "openhands-tools"
 version = "1.12.0"
-source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5#e8ed7c2da95f861d48d624c2cb24df4fe6b84fe5" }
+source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=56c315b34a8d3f0b3d1c89cba57115dd859edbb0#56c315b34a8d3f0b3d1c89cba57115dd859edbb0" }
 dependencies = [
     { name = "bashlex" },
     { name = "binaryornot" },

From 18b76a94d193e31b8a1578c18a46d379139f23d6 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Thu, 26 Mar 2026 23:30:39 +0800
Subject: [PATCH 5/7] Update evaluation report

---
 eval/evaluation_report.json | 308 ++++++++++++++++++------------------
 1 file changed, 154 insertions(+), 154 deletions(-)

diff --git a/eval/evaluation_report.json b/eval/evaluation_report.json
index 7393560..e34b5d9 100644
--- a/eval/evaluation_report.json
+++ b/eval/evaluation_report.json
@@ -1,11 +1,11 @@
 {
   "evaluation": {
-    "timestamp": "2026-03-26 01:06:07",
-    "unix_timestamp": 1774458367.956289,
+    "timestamp": "2026-03-26 23:28:03",
+    "unix_timestamp": 1774538883.639733,
     "summary": {
       "total_tests": 22,
-      "passed_tests": 21,
-      "pass_rate": 95.45,
+      "passed_tests": 20,
+      "pass_rate": 90.91,
       "models_tested": [
         "dashscope/qwen3.5-plus",
         "dashscope/qwen3.5-flash"
@@ -13,27 +13,27 @@
     },
     "model_performance": {
       "dashscope/qwen3.5-plus": {
-        "pass_rate": 90.91,
-        "task_score": 61.0,
+        "pass_rate": 100.0,
+        "task_score": 62.5,
         "task_max_score": 62.5,
-        "efficiency_score": 7.2697,
-        "usage_score": 4.256,
-        "composite_score": 0.755,
-        "avg_duration": 165.39,
-        "avg_cost": 0.663257,
-        "passed_count": 10,
+        "efficiency_score": 7.6698,
+        "usage_score": 4.4243,
+        "composite_score": 0.8199,
+        "avg_duration": 146.25,
+        "avg_cost": 0.639522,
+        "passed_count": 11,
         "total_tests": 11
       },
       "dashscope/qwen3.5-flash": {
-        "pass_rate": 100.0,
-        "task_score": 62.5,
+        "pass_rate": 81.82,
+        "task_score": 54.0,
         "task_max_score": 62.5,
-        "efficiency_score": 7.3059,
-        "usage_score": 8.7355,
-        "composite_score": 0.8917,
-        "avg_duration": 161.03,
-        "avg_cost": 0.19694,
-        "passed_count": 11,
+        "efficiency_score": 6.0924,
+        "usage_score": 9.0375,
+        "composite_score": 0.766,
+        "avg_duration": 220.62,
+        "avg_cost": 0.185729,
+        "passed_count": 9,
         "total_tests": 11
       }
     },
@@ -42,26 +42,26 @@
         "name": "BlueBook Search And Like Test",
         "results_by_model": {
           "dashscope/qwen3.5-plus": {
-            "passed": false,
-            "task_score": 4.5,
+            "passed": true,
+            "task_score": 6.0,
             "task_max_score": 6.0,
-            "efficiency_score": 0.6649,
-            "usage_score": 0.4188,
-            "composite_score": 0.2167,
-            "total_score": 5.58,
-            "duration": 100.53,
-            "cost": 0.348724
+            "efficiency_score": 0.637,
+            "usage_score": 0.2891,
+            "composite_score": 0.7852,
+            "total_score": 6.93,
+            "duration": 108.9,
+            "cost": 0.426542
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 6.0,
             "task_max_score": 6.0,
-            "efficiency_score": 0.7076,
-            "usage_score": 0.8345,
-            "composite_score": 0.9084,
-            "total_score": 7.54,
-            "duration": 87.71,
-            "cost": 0.099273
+            "efficiency_score": 0.6236,
+            "usage_score": 0.7717,
+            "composite_score": 0.8791,
+            "total_score": 7.4,
+            "duration": 112.92,
+            "cost": 0.136968
           }
         }
       },
@@ -72,23 +72,23 @@
             "passed": true,
             "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.7872,
-            "usage_score": 0.7718,
-            "composite_score": 0.9118,
+            "efficiency_score": 0.7897,
+            "usage_score": 0.773,
+            "composite_score": 0.9125,
             "total_score": 4.56,
-            "duration": 63.85,
-            "cost": 0.182561
+            "duration": 63.1,
+            "cost": 0.181606
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.8726,
-            "usage_score": 0.9595,
-            "composite_score": 0.9664,
-            "total_score": 4.83,
-            "duration": 38.22,
-            "cost": 0.032404
+            "efficiency_score": 0.7887,
+            "usage_score": 0.9319,
+            "composite_score": 0.9441,
+            "total_score": 4.72,
+            "duration": 63.38,
+            "cost": 0.054464
           }
         }
       },
@@ -99,23 +99,23 @@
             "passed": true,
             "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.404,
-            "usage_score": 0.2516,
-            "composite_score": 0.7311,
-            "total_score": 9.66,
-            "duration": 417.19,
-            "cost": 1.49673
+            "efficiency_score": 0.5631,
+            "usage_score": 0.2639,
+            "composite_score": 0.7654,
+            "total_score": 9.83,
+            "duration": 305.82,
+            "cost": 1.472194
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.6498,
-            "usage_score": 0.8601,
-            "composite_score": 0.902,
-            "total_score": 10.51,
-            "duration": 245.11,
-            "cost": 0.279726
+            "efficiency_score": 0.6021,
+            "usage_score": 0.8314,
+            "composite_score": 0.8867,
+            "total_score": 10.43,
+            "duration": 278.5,
+            "cost": 0.337292
           }
         }
       },
@@ -126,23 +126,23 @@
             "passed": true,
             "task_score": 2.5,
             "task_max_score": 2.5,
-            "efficiency_score": 0.6196,
-            "usage_score": 0.5748,
-            "composite_score": 0.8389,
-            "total_score": 3.69,
-            "duration": 152.16,
-            "cost": 0.340137
+            "efficiency_score": 0.758,
+            "usage_score": 0.5909,
+            "composite_score": 0.8698,
+            "total_score": 3.85,
+            "duration": 96.81,
+            "cost": 0.32725
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 2.5,
             "task_max_score": 2.5,
-            "efficiency_score": 0.8196,
-            "usage_score": 0.9146,
-            "composite_score": 0.9468,
-            "total_score": 4.23,
-            "duration": 72.16,
-            "cost": 0.068325
+            "efficiency_score": 0.7436,
+            "usage_score": 0.9038,
+            "composite_score": 0.9295,
+            "total_score": 4.15,
+            "duration": 102.54,
+            "cost": 0.076921
           }
         }
       },
@@ -153,23 +153,23 @@
             "passed": true,
             "task_score": 9.5,
             "task_max_score": 9.5,
-            "efficiency_score": 0.6174,
-            "usage_score": 0.0907,
-            "composite_score": 0.7416,
-            "total_score": 10.21,
-            "duration": 191.31,
-            "cost": 0.90927
+            "efficiency_score": 0.5944,
+            "usage_score": 0.0415,
+            "composite_score": 0.7272,
+            "total_score": 10.14,
+            "duration": 202.82,
+            "cost": 0.958494
           },
           "dashscope/qwen3.5-flash": {
-            "passed": true,
-            "task_score": 9.5,
+            "passed": false,
+            "task_score": 6.0,
             "task_max_score": 9.5,
-            "efficiency_score": 0.6108,
-            "usage_score": 0.7116,
-            "composite_score": 0.8645,
-            "total_score": 10.82,
-            "duration": 194.61,
-            "cost": 0.288375
+            "efficiency_score": 0,
+            "usage_score": 0.9935,
+            "composite_score": 0.1987,
+            "total_score": 6.99,
+            "duration": 500.0,
+            "cost": 0.006517
           }
         }
       },
@@ -180,23 +180,23 @@
             "passed": true,
             "task_score": 12.0,
             "task_max_score": 12.0,
-            "efficiency_score": 0.697,
-            "usage_score": 0.3845,
-            "composite_score": 0.8163,
+            "efficiency_score": 0.6687,
+            "usage_score": 0.4162,
+            "composite_score": 0.817,
             "total_score": 13.08,
-            "duration": 151.48,
-            "cost": 0.738602
+            "duration": 165.67,
+            "cost": 0.700563
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 12.0,
             "task_max_score": 12.0,
-            "efficiency_score": 0.6493,
-            "usage_score": 0.8553,
-            "composite_score": 0.9009,
-            "total_score": 13.5,
-            "duration": 175.36,
-            "cost": 0.173623
+            "efficiency_score": 0.688,
+            "usage_score": 0.8486,
+            "composite_score": 0.9073,
+            "total_score": 13.54,
+            "duration": 156.0,
+            "cost": 0.181736
           }
         }
       },
@@ -207,23 +207,23 @@
             "passed": true,
             "task_score": 2,
             "task_max_score": 2,
-            "efficiency_score": 0.8385,
-            "usage_score": 0.762,
-            "composite_score": 0.9201,
-            "total_score": 3.6,
-            "duration": 48.45,
-            "cost": 0.119009
+            "efficiency_score": 0.8682,
+            "usage_score": 0.7719,
+            "composite_score": 0.928,
+            "total_score": 3.64,
+            "duration": 39.55,
+            "cost": 0.114047
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 2,
             "task_max_score": 2,
-            "efficiency_score": 0.8944,
-            "usage_score": 0.9556,
-            "composite_score": 0.97,
+            "efficiency_score": 0.8999,
+            "usage_score": 0.9453,
+            "composite_score": 0.969,
             "total_score": 3.85,
-            "duration": 31.69,
-            "cost": 0.022211
+            "duration": 30.03,
+            "cost": 0.027328
           }
         }
       },
@@ -234,23 +234,23 @@
             "passed": true,
             "task_score": 5.0,
             "task_max_score": 5.0,
-            "efficiency_score": 0.5267,
-            "usage_score": 0.1531,
-            "composite_score": 0.736,
-            "total_score": 5.68,
-            "duration": 189.34,
-            "cost": 0.84689
+            "efficiency_score": 0.6309,
+            "usage_score": 0.3874,
+            "composite_score": 0.8037,
+            "total_score": 6.02,
+            "duration": 147.63,
+            "cost": 0.612606
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 5.0,
             "task_max_score": 5.0,
-            "efficiency_score": 0.0609,
-            "usage_score": 0.5408,
-            "composite_score": 0.7204,
-            "total_score": 5.6,
-            "duration": 375.62,
-            "cost": 0.459171
+            "efficiency_score": 0.622,
+            "usage_score": 0.835,
+            "composite_score": 0.8914,
+            "total_score": 6.46,
+            "duration": 151.2,
+            "cost": 0.164994
           }
         }
       },
@@ -261,23 +261,23 @@
             "passed": true,
             "task_score": 3.5,
             "task_max_score": 3.5,
-            "efficiency_score": 0.737,
-            "usage_score": 0.4947,
-            "composite_score": 0.8463,
-            "total_score": 4.73,
-            "duration": 131.52,
-            "cost": 0.606343
+            "efficiency_score": 0.7444,
+            "usage_score": 0.51,
+            "composite_score": 0.8509,
+            "total_score": 4.75,
+            "duration": 127.79,
+            "cost": 0.587982
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 3.5,
             "task_max_score": 3.5,
-            "efficiency_score": 0.7542,
-            "usage_score": 0.8895,
-            "composite_score": 0.9287,
-            "total_score": 5.14,
-            "duration": 122.9,
-            "cost": 0.13258
+            "efficiency_score": 0.0687,
+            "usage_score": 0.5945,
+            "composite_score": 0.7326,
+            "total_score": 4.16,
+            "duration": 465.64,
+            "cost": 0.486634
           }
         }
       },
@@ -288,23 +288,23 @@
             "passed": true,
             "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.7219,
+            "efficiency_score": 0.7553,
             "usage_score": 0,
-            "composite_score": 0.7444,
-            "total_score": 3.72,
-            "duration": 166.87,
-            "cost": 0.73836
+            "composite_score": 0.7511,
+            "total_score": 3.76,
+            "duration": 146.83,
+            "cost": 0.723918
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.6795,
-            "usage_score": 0.4314,
-            "composite_score": 0.8222,
-            "total_score": 4.11,
-            "duration": 192.3,
-            "cost": 0.284306
+            "efficiency_score": 0.7302,
+            "usage_score": 0.6429,
+            "composite_score": 0.8746,
+            "total_score": 4.37,
+            "duration": 161.88,
+            "cost": 0.178565
           }
         }
       },
@@ -315,23 +315,23 @@
             "passed": true,
             "task_score": 7.0,
             "task_max_score": 7.0,
-            "efficiency_score": 0.6556,
-            "usage_score": 0.3539,
-            "composite_score": 0.8019,
-            "total_score": 8.01,
-            "duration": 206.63,
-            "cost": 0.969205
+            "efficiency_score": 0.6602,
+            "usage_score": 0.3803,
+            "composite_score": 0.8081,
+            "total_score": 8.04,
+            "duration": 203.86,
+            "cost": 0.929538
           },
           "dashscope/qwen3.5-flash": {
-            "passed": true,
-            "task_score": 7.0,
+            "passed": false,
+            "task_score": 2.0,
             "task_max_score": 7.0,
-            "efficiency_score": 0.6072,
-            "usage_score": 0.7824,
-            "composite_score": 0.8779,
-            "total_score": 8.39,
-            "duration": 235.67,
-            "cost": 0.326348
+            "efficiency_score": 0.3255,
+            "usage_score": 0.7389,
+            "composite_score": 0.2129,
+            "total_score": 3.06,
+            "duration": 404.7,
+            "cost": 0.3916
           }
         }
       }

From 566a9ba1a1149af4c60102717586d242f516a65a Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Fri, 27 Mar 2026 12:26:01 +0800
Subject: [PATCH 6/7] Improve confirmation shortcuts and activation handling

---
 eval/evaluation_report.json                   | 300 +++++++++---------
 .../element-actions-regression.test.ts        |  15 +
 extension/src/commands/element-actions.ts     |  37 ++-
 server/agent/tools/base.py                    |   6 -
 server/agent/tools/browser_executor.py        |  96 ++++++
 .../tests/unit/test_agent_browser_executor.py | 194 ++++++++++-
 server/tests/unit/test_base_classes.py        |   3 +-
 7 files changed, 491 insertions(+), 160 deletions(-)

diff --git a/eval/evaluation_report.json b/eval/evaluation_report.json
index e34b5d9..c59c71c 100644
--- a/eval/evaluation_report.json
+++ b/eval/evaluation_report.json
@@ -1,11 +1,11 @@
 {
   "evaluation": {
-    "timestamp": "2026-03-26 23:28:03",
-    "unix_timestamp": 1774538883.639733,
+    "timestamp": "2026-03-27 00:45:37",
+    "unix_timestamp": 1774543537.602308,
     "summary": {
       "total_tests": 22,
-      "passed_tests": 20,
-      "pass_rate": 90.91,
+      "passed_tests": 21,
+      "pass_rate": 95.45,
       "models_tested": [
         "dashscope/qwen3.5-plus",
         "dashscope/qwen3.5-flash"
@@ -16,24 +16,24 @@
         "pass_rate": 100.0,
         "task_score": 62.5,
         "task_max_score": 62.5,
-        "efficiency_score": 7.6698,
-        "usage_score": 4.4243,
-        "composite_score": 0.8199,
-        "avg_duration": 146.25,
-        "avg_cost": 0.639522,
+        "efficiency_score": 7.9068,
+        "usage_score": 4.7439,
+        "composite_score": 0.83,
+        "avg_duration": 135.49,
+        "avg_cost": 0.588661,
         "passed_count": 11,
         "total_tests": 11
       },
       "dashscope/qwen3.5-flash": {
-        "pass_rate": 81.82,
-        "task_score": 54.0,
+        "pass_rate": 90.91,
+        "task_score": 56.0,
         "task_max_score": 62.5,
-        "efficiency_score": 6.0924,
-        "usage_score": 9.0375,
-        "composite_score": 0.766,
-        "avg_duration": 220.62,
-        "avg_cost": 0.185729,
-        "passed_count": 9,
+        "efficiency_score": 7.8906,
+        "usage_score": 9.2849,
+        "composite_score": 0.8577,
+        "avg_duration": 133.22,
+        "avg_cost": 0.152816,
+        "passed_count": 10,
         "total_tests": 11
       }
     },
@@ -45,23 +45,23 @@
             "passed": true,
             "task_score": 6.0,
             "task_max_score": 6.0,
-            "efficiency_score": 0.637,
-            "usage_score": 0.2891,
-            "composite_score": 0.7852,
-            "total_score": 6.93,
-            "duration": 108.9,
-            "cost": 0.426542
+            "efficiency_score": 0.6587,
+            "usage_score": 0.2884,
+            "composite_score": 0.7894,
+            "total_score": 6.95,
+            "duration": 102.4,
+            "cost": 0.42695
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 6.0,
             "task_max_score": 6.0,
-            "efficiency_score": 0.6236,
-            "usage_score": 0.7717,
-            "composite_score": 0.8791,
-            "total_score": 7.4,
-            "duration": 112.92,
-            "cost": 0.136968
+            "efficiency_score": 0.6927,
+            "usage_score": 0.8331,
+            "composite_score": 0.9052,
+            "total_score": 7.53,
+            "duration": 92.2,
+            "cost": 0.100139
           }
         }
       },
@@ -72,23 +72,23 @@
             "passed": true,
             "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.7897,
-            "usage_score": 0.773,
-            "composite_score": 0.9125,
-            "total_score": 4.56,
-            "duration": 63.1,
-            "cost": 0.181606
+            "efficiency_score": 0.7931,
+            "usage_score": 0.7763,
+            "composite_score": 0.9139,
+            "total_score": 4.57,
+            "duration": 62.06,
+            "cost": 0.178958
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.7887,
-            "usage_score": 0.9319,
-            "composite_score": 0.9441,
-            "total_score": 4.72,
-            "duration": 63.38,
-            "cost": 0.054464
+            "efficiency_score": 0.733,
+            "usage_score": 0.8868,
+            "composite_score": 0.9239,
+            "total_score": 4.62,
+            "duration": 80.11,
+            "cost": 0.090585
           }
         }
       },
@@ -99,23 +99,23 @@
             "passed": true,
             "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.5631,
-            "usage_score": 0.2639,
-            "composite_score": 0.7654,
-            "total_score": 9.83,
-            "duration": 305.82,
-            "cost": 1.472194
+            "efficiency_score": 0.5473,
+            "usage_score": 0.2619,
+            "composite_score": 0.7618,
+            "total_score": 9.81,
+            "duration": 316.88,
+            "cost": 1.47621
           },
           "dashscope/qwen3.5-flash": {
-            "passed": true,
-            "task_score": 9.0,
+            "passed": false,
+            "task_score": 2.5,
             "task_max_score": 9.0,
-            "efficiency_score": 0.6021,
-            "usage_score": 0.8314,
-            "composite_score": 0.8867,
-            "total_score": 10.43,
-            "duration": 278.5,
-            "cost": 0.337292
+            "efficiency_score": 0.7064,
+            "usage_score": 0.8895,
+            "composite_score": 0.3192,
+            "total_score": 4.1,
+            "duration": 205.55,
+            "cost": 0.221029
           }
         }
       },
@@ -126,23 +126,23 @@
             "passed": true,
             "task_score": 2.5,
             "task_max_score": 2.5,
-            "efficiency_score": 0.758,
-            "usage_score": 0.5909,
-            "composite_score": 0.8698,
-            "total_score": 3.85,
-            "duration": 96.81,
-            "cost": 0.32725
+            "efficiency_score": 0.7329,
+            "usage_score": 0.5884,
+            "composite_score": 0.8643,
+            "total_score": 3.82,
+            "duration": 106.82,
+            "cost": 0.329246
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 2.5,
             "task_max_score": 2.5,
-            "efficiency_score": 0.7436,
-            "usage_score": 0.9038,
-            "composite_score": 0.9295,
-            "total_score": 4.15,
-            "duration": 102.54,
-            "cost": 0.076921
+            "efficiency_score": 0.7885,
+            "usage_score": 0.8918,
+            "composite_score": 0.9361,
+            "total_score": 4.18,
+            "duration": 84.61,
+            "cost": 0.08655
           }
         }
       },
@@ -153,23 +153,23 @@
             "passed": true,
             "task_score": 9.5,
             "task_max_score": 9.5,
-            "efficiency_score": 0.5944,
-            "usage_score": 0.0415,
-            "composite_score": 0.7272,
-            "total_score": 10.14,
-            "duration": 202.82,
-            "cost": 0.958494
+            "efficiency_score": 0.676,
+            "usage_score": 0.1585,
+            "composite_score": 0.7669,
+            "total_score": 10.33,
+            "duration": 162.0,
+            "cost": 0.841473
           },
           "dashscope/qwen3.5-flash": {
-            "passed": false,
-            "task_score": 6.0,
+            "passed": true,
+            "task_score": 9.5,
             "task_max_score": 9.5,
-            "efficiency_score": 0,
-            "usage_score": 0.9935,
-            "composite_score": 0.1987,
-            "total_score": 6.99,
-            "duration": 500.0,
-            "cost": 0.006517
+            "efficiency_score": 0.6157,
+            "usage_score": 0.7408,
+            "composite_score": 0.8713,
+            "total_score": 10.86,
+            "duration": 192.13,
+            "cost": 0.259165
           }
         }
       },
@@ -180,23 +180,23 @@
             "passed": true,
             "task_score": 12.0,
             "task_max_score": 12.0,
-            "efficiency_score": 0.6687,
-            "usage_score": 0.4162,
-            "composite_score": 0.817,
-            "total_score": 13.08,
-            "duration": 165.67,
-            "cost": 0.700563
+            "efficiency_score": 0.6307,
+            "usage_score": 0.3382,
+            "composite_score": 0.7938,
+            "total_score": 12.97,
+            "duration": 184.64,
+            "cost": 0.79412
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 12.0,
             "task_max_score": 12.0,
-            "efficiency_score": 0.688,
-            "usage_score": 0.8486,
-            "composite_score": 0.9073,
+            "efficiency_score": 0.7021,
+            "usage_score": 0.8414,
+            "composite_score": 0.9087,
             "total_score": 13.54,
-            "duration": 156.0,
-            "cost": 0.181736
+            "duration": 148.94,
+            "cost": 0.190283
           }
         }
       },
@@ -207,23 +207,23 @@
             "passed": true,
             "task_score": 2,
             "task_max_score": 2,
-            "efficiency_score": 0.8682,
-            "usage_score": 0.7719,
-            "composite_score": 0.928,
-            "total_score": 3.64,
-            "duration": 39.55,
-            "cost": 0.114047
+            "efficiency_score": 0.8652,
+            "usage_score": 0.7683,
+            "composite_score": 0.9267,
+            "total_score": 3.63,
+            "duration": 40.44,
+            "cost": 0.115867
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 2,
             "task_max_score": 2,
             "efficiency_score": 0.8999,
-            "usage_score": 0.9453,
-            "composite_score": 0.969,
+            "usage_score": 0.946,
+            "composite_score": 0.9692,
             "total_score": 3.85,
             "duration": 30.03,
-            "cost": 0.027328
+            "cost": 0.027012
           }
         }
       },
@@ -234,23 +234,23 @@
             "passed": true,
             "task_score": 5.0,
             "task_max_score": 5.0,
-            "efficiency_score": 0.6309,
-            "usage_score": 0.3874,
-            "composite_score": 0.8037,
-            "total_score": 6.02,
-            "duration": 147.63,
-            "cost": 0.612606
+            "efficiency_score": 0.6882,
+            "usage_score": 0.4682,
+            "composite_score": 0.8313,
+            "total_score": 6.16,
+            "duration": 124.71,
+            "cost": 0.531767
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 5.0,
             "task_max_score": 5.0,
-            "efficiency_score": 0.622,
-            "usage_score": 0.835,
-            "composite_score": 0.8914,
-            "total_score": 6.46,
-            "duration": 151.2,
-            "cost": 0.164994
+            "efficiency_score": 0.535,
+            "usage_score": 0.7822,
+            "composite_score": 0.8634,
+            "total_score": 6.32,
+            "duration": 186.0,
+            "cost": 0.217815
           }
         }
       },
@@ -261,23 +261,23 @@
             "passed": true,
             "task_score": 3.5,
             "task_max_score": 3.5,
-            "efficiency_score": 0.7444,
-            "usage_score": 0.51,
-            "composite_score": 0.8509,
-            "total_score": 4.75,
-            "duration": 127.79,
-            "cost": 0.587982
+            "efficiency_score": 0.7916,
+            "usage_score": 0.621,
+            "composite_score": 0.8825,
+            "total_score": 4.91,
+            "duration": 104.2,
+            "cost": 0.454825
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 3.5,
             "task_max_score": 3.5,
-            "efficiency_score": 0.0687,
-            "usage_score": 0.5945,
-            "composite_score": 0.7326,
-            "total_score": 4.16,
-            "duration": 465.64,
-            "cost": 0.486634
+            "efficiency_score": 0.7624,
+            "usage_score": 0.8874,
+            "composite_score": 0.9299,
+            "total_score": 5.15,
+            "duration": 118.82,
+            "cost": 0.135177
           }
         }
       },
@@ -288,23 +288,23 @@
             "passed": true,
             "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.7553,
+            "efficiency_score": 0.792,
             "usage_score": 0,
-            "composite_score": 0.7511,
-            "total_score": 3.76,
-            "duration": 146.83,
-            "cost": 0.723918
+            "composite_score": 0.7584,
+            "total_score": 3.79,
+            "duration": 124.81,
+            "cost": 0.537748
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.7302,
-            "usage_score": 0.6429,
-            "composite_score": 0.8746,
-            "total_score": 4.37,
-            "duration": 161.88,
-            "cost": 0.178565
+            "efficiency_score": 0.781,
+            "usage_score": 0.7322,
+            "composite_score": 0.9026,
+            "total_score": 4.51,
+            "duration": 131.41,
+            "cost": 0.133914
           }
         }
       },
@@ -315,23 +315,23 @@
             "passed": true,
             "task_score": 7.0,
             "task_max_score": 7.0,
-            "efficiency_score": 0.6602,
-            "usage_score": 0.3803,
-            "composite_score": 0.8081,
-            "total_score": 8.04,
-            "duration": 203.86,
-            "cost": 0.929538
+            "efficiency_score": 0.731,
+            "usage_score": 0.4746,
+            "composite_score": 0.8411,
+            "total_score": 8.21,
+            "duration": 161.37,
+            "cost": 0.78811
           },
           "dashscope/qwen3.5-flash": {
-            "passed": false,
-            "task_score": 2.0,
+            "passed": true,
+            "task_score": 7.0,
             "task_max_score": 7.0,
-            "efficiency_score": 0.3255,
-            "usage_score": 0.7389,
-            "composite_score": 0.2129,
-            "total_score": 3.06,
-            "duration": 404.7,
-            "cost": 0.3916
+            "efficiency_score": 0.674,
+            "usage_score": 0.8538,
+            "composite_score": 0.9056,
+            "total_score": 8.53,
+            "duration": 195.59,
+            "cost": 0.219313
           }
         }
       }
diff --git a/extension/src/__tests__/element-actions-regression.test.ts b/extension/src/__tests__/element-actions-regression.test.ts
index c13b04f..e1ec551 100644
--- a/extension/src/__tests__/element-actions-regression.test.ts
+++ b/extension/src/__tests__/element-actions-regression.test.ts
@@ -26,6 +26,9 @@ describe('Element action regressions', () => {
     expect(elementActionsSource).toContain(
       'function getInteractiveActivationTarget(target)',
     );
+    expect(elementActionsSource).toContain(
+      'function resolveActivationDispatchTarget(target, activationTarget)',
+    );
     expect(elementActionsSource).toContain(
       'dispatchActivationPress(activationTarget, activation.point);',
     );
@@ -34,6 +37,18 @@ describe('Element action regressions', () => {
     );
   });
 
+  test('performElementClick preserves structured interactive targets like anchors', () => {
+    expect(elementActionsSource).toContain(
+      'if (isPlaceholderCoverForInput(activationTarget, target))',
+    );
+    expect(elementActionsSource).toContain(
+      'if (isStructuredInteractiveElement(target))',
+    );
+    expect(elementActionsSource).toContain(
+      'keep dispatch on that exact element instead of drifting to a non-interactive ancestor',
+    );
+  });
+
   test('performKeyboardInput primes editable activation and beforeinput events', () => {
     expect(elementActionsSource).toContain('const alreadyFocused =');
     expect(elementActionsSource).toContain("new InputEvent('beforeinput'");
diff --git a/extension/src/commands/element-actions.ts b/extension/src/commands/element-actions.ts
index 05082e3..2871107 100644
--- a/extension/src/commands/element-actions.ts
+++ b/extension/src/commands/element-actions.ts
@@ -207,6 +207,33 @@ function buildEditableActivationHelpersScript(): string {
       return { target, point: fallbackPoint };
     }
 
+    function resolveActivationDispatchTarget(target, activationTarget) {
+      if (!(target instanceof Element)) {
+        return activationTarget instanceof Element ? activationTarget : null;
+      }
+
+      if (!(activationTarget instanceof Element)) {
+        return target;
+      }
+
+      // Placeholder covers for text inputs need the visible overlay surface.
+      if (isPlaceholderCoverForInput(activationTarget, target)) {
+        return activationTarget;
+      }
+
+      // If highlight selected a structured interactive element such as <a> or <button>,
+      // keep dispatch on that exact element instead of drifting to a non-interactive ancestor.
+      if (isStructuredInteractiveElement(target)) {
+        return target;
+      }
+
+      if (isStructuredInteractiveElement(activationTarget)) {
+        return activationTarget;
+      }
+
+      return activationTarget;
+    }
+
     function getActivationEventPoint(target, point) {
       if (
         point &&
@@ -479,7 +506,10 @@ export async function performElementClick(
       try {
         const activation = getInteractiveActivationTarget(el);
         const activationTarget =
-          activation.target instanceof Element ? activation.target : el;
+          resolveActivationDispatchTarget(
+            el,
+            activation.target instanceof Element ? activation.target : el,
+          ) || el;
 
         dispatchActivationPress(activationTarget, activation.point);
         const focused = focusInteractionTarget(
@@ -2561,7 +2591,10 @@ export async function performKeyboardInput(
       try {
         const activation = getInteractiveActivationTarget(el);
         const activationTarget =
-          activation.target instanceof Element ? activation.target : el;
+          resolveActivationDispatchTarget(
+            el,
+            activation.target instanceof Element ? activation.target : el,
+          ) || el;
         const alreadyFocused =
           (el instanceof HTMLElement && document.activeElement === el) ||
           (el instanceof HTMLElement &&
diff --git a/server/agent/tools/base.py b/server/agent/tools/base.py
index 7a04083..62615ba 100644
--- a/server/agent/tools/base.py
+++ b/server/agent/tools/base.py
@@ -120,10 +120,6 @@ def _pending_confirmation_llm_content(
         text_parts = [
             "## Pending Confirmation",
             "",
-            "Inspect the screenshot first.",
-            "Confirm only if the single highlighted element is exactly the intended target.",
-            "The pending element and highlight snapshot are already stored by the system.",
-            "",
             f"**Highlight Snapshot ID**: {highlight_snapshot_id}",
             f"**Element ID**: {element_id}",
             f"**Action Type**: {action_type}",
@@ -142,8 +138,6 @@ def _pending_confirmation_llm_content(
 
         text_parts.append("**Confirm with:**")
         text_parts.append(f"```json\n{confirm_cmd}\n```")
-        text_parts.append("")
-        text_parts.append("Use a different action to cancel.")
 
         content_items.append(TextContent(text="\n".join(text_parts)))
         return content_items
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index 1927714..65f1b8e 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -12,6 +12,7 @@
 """
 
 import asyncio
+from collections import OrderedDict
 import logging
 import threading
 from typing import Any, Dict, Optional, Union
@@ -48,6 +49,9 @@
 
 logger = logging.getLogger(__name__)
 
+CONFIRMED_CLICK_HTML_CACHE_SIZE = 10
+ELEMENT_HTML_CACHE_MISS_PLACEHOLDER = "<element not found in cache>"
+
 # Global registry for shared BrowserExecutor instances per conversation
 # Key: conversation_id (str), Value: BrowserExecutor instance
 _executor_registry: Dict[str, "BrowserExecutor"] = {}
@@ -96,6 +100,10 @@ def __init__(self):
         self.conversation_id = None
         # Pending confirmations per conversation for 2PC actions.
         self.pending_confirmations: Dict[str, Dict[str, Any]] = {}
+        # Recently confirmed element targets keyed by action_type then exact element HTML.
+        self.confirmed_action_html_lru: Dict[
+            str, Dict[str, OrderedDict[str, None]]
+        ] = {}
 
     def __call__(
         self, action: OpenBrowserAction, conversation
@@ -337,6 +345,21 @@ def _execute_element_interaction_action(
             full_html, screenshot = self._get_element_full_html(
                 action.element_id, action.highlight_snapshot_id
             )
+            if self._has_confirmed_action_html("click", full_html):
+                command = ClickElementCommand(
+                    element_id=action.element_id,
+                    highlight_snapshot_id=action.highlight_snapshot_id,
+                    conversation_id=self.conversation_id,
+                    tab_id=action.tab_id,
+                )
+                result_dict = self._execute_element_command(command, "click element")
+                self._remember_confirmed_action_html("click", full_html)
+                return self._build_observation_from_result(
+                    result_dict,
+                    f"Clicked element: {action.element_id}",
+                    element_id=action.element_id,
+                    highlight_snapshot_id=action.highlight_snapshot_id,
+                )
             self._set_pending_confirmation(
                 element_id=action.element_id,
                 highlight_snapshot_id=action.highlight_snapshot_id,
@@ -454,6 +477,22 @@ def _execute_element_interaction_action(
             full_html, screenshot = self._get_element_full_html(
                 action.element_id, action.highlight_snapshot_id
             )
+            if self._has_confirmed_action_html("keyboard_input", full_html):
+                command = KeyboardInputCommand(
+                    element_id=action.element_id,
+                    highlight_snapshot_id=action.highlight_snapshot_id,
+                    text=action.text,
+                    conversation_id=self.conversation_id,
+                    tab_id=action.tab_id,
+                )
+                result_dict = self._execute_element_command(command, "input text")
+                self._remember_confirmed_action_html("keyboard_input", full_html)
+                return self._build_observation_from_result(
+                    result_dict,
+                    f"Input text to element: {action.element_id}",
+                    element_id=action.element_id,
+                    highlight_snapshot_id=action.highlight_snapshot_id,
+                )
             self._set_pending_confirmation(
                 element_id=action.element_id,
                 highlight_snapshot_id=action.highlight_snapshot_id,
@@ -531,6 +570,7 @@ def _execute_element_interaction_action(
             if not result_dict or not result_dict.get("success"):
                 ext_error = self._extract_result_error(result_dict)
                 raise RuntimeError(f"Failed to click element: {ext_error}")
+            self._remember_confirmed_action_html("click", pending.get("full_html"))
             message = f"Confirmed and clicked element: {pending_element_id}"
             self._clear_pending_confirmation()
             return self._build_observation_from_result(
@@ -570,6 +610,9 @@ def _execute_element_interaction_action(
             if not result_dict or not result_dict.get("success"):
                 ext_error = self._extract_result_error(result_dict)
                 raise RuntimeError(f"Failed to input text: {ext_error}")
+            self._remember_confirmed_action_html(
+                "keyboard_input", pending.get("full_html")
+            )
             message = f"Confirmed and input text to element: {pending_element_id}"
             self._clear_pending_confirmation()
             return self._build_observation_from_result(
@@ -642,6 +685,59 @@ def _get_pending_confirmation(self) -> Optional[Dict[str, Any]]:
         """Get pending confirmation for current conversation"""
         return self.pending_confirmations.get(self.conversation_id)
 
+    def _normalize_confirmed_action_html(self, full_html: Optional[str]) -> Optional[str]:
+        """Normalize cached element HTML used for repeat-click shortcut matching."""
+        if not isinstance(full_html, str):
+            return None
+
+        normalized = full_html.strip()
+        if (
+            not normalized
+            or normalized == ELEMENT_HTML_CACHE_MISS_PLACEHOLDER
+        ):
+            return None
+
+        return normalized
+
+    def _get_confirmed_action_html_lru(self, action_type: str) -> OrderedDict[str, None]:
+        """Get or create the confirmed-action LRU cache for current conversation."""
+        conversation_lru = self.confirmed_action_html_lru.setdefault(
+            self.conversation_id, {}
+        )
+        return conversation_lru.setdefault(action_type, OrderedDict())
+
+    def _has_confirmed_action_html(
+        self, action_type: str, full_html: Optional[str]
+    ) -> bool:
+        """Return whether this exact HTML was recently confirmed for the action."""
+        normalized = self._normalize_confirmed_action_html(full_html)
+        if normalized is None:
+            return False
+
+        lru = self._get_confirmed_action_html_lru(action_type)
+        if normalized not in lru:
+            return False
+
+        lru.move_to_end(normalized)
+        return True
+
+    def _remember_confirmed_action_html(
+        self, action_type: str, full_html: Optional[str]
+    ) -> None:
+        """Record a confirmed-action HTML entry in the per-conversation LRU cache."""
+        normalized = self._normalize_confirmed_action_html(full_html)
+        if normalized is None:
+            return
+
+        lru = self._get_confirmed_action_html_lru(action_type)
+        if normalized in lru:
+            lru.move_to_end(normalized)
+        else:
+            lru[normalized] = None
+
+        while len(lru) > CONFIRMED_CLICK_HTML_CACHE_SIZE:
+            lru.popitem(last=False)
+
     def _extract_result_error(
         self, result_dict: Optional[Dict[str, Any]], default: str = "Unknown error"
     ) -> str:
diff --git a/server/tests/unit/test_agent_browser_executor.py b/server/tests/unit/test_agent_browser_executor.py
index ed9d3fb..2ea9c7d 100644
--- a/server/tests/unit/test_agent_browser_executor.py
+++ b/server/tests/unit/test_agent_browser_executor.py
@@ -4,7 +4,7 @@
 
 from server.agent.tools.browser_executor import BrowserExecutor
 from server.agent.tools.element_interaction_tool import ElementInteractionAction
-from server.models.commands import HoverElementCommand, SwipeElementCommand
+from server.models.commands import ClickElementCommand, HoverElementCommand, SwipeElementCommand
 
 
 class _FakeResponse:
@@ -144,6 +144,198 @@ def fake_execute(command):
     assert executor._get_pending_confirmation() is None
 
 
+def test_repeat_click_with_confirmed_html_executes_without_pending_confirmation(
+    monkeypatch,
+) -> None:
+    executor = BrowserExecutor()
+    executor.conversation_id = "conv-repeat-click"
+    executor._remember_confirmed_action_html(
+        "click",
+        '<a href="/gbr/articles/opinion-article2.html">Why the Fed Must Act Decisively</a>'
+    )
+
+    monkeypatch.setattr(
+        executor,
+        "_get_element_full_html",
+        lambda element_id, highlight_snapshot_id: (
+            '<a href="/gbr/articles/opinion-article2.html">Why the Fed Must Act Decisively</a>',
+            "data:image/png;base64,repeat",
+        ),
+    )
+
+    captured = {}
+
+    def fake_execute(command):
+        captured["command"] = command
+        return {
+            "success": True,
+            "data": {"screenshot": "data:image/png;base64,clicked-direct"},
+        }
+
+    monkeypatch.setattr(executor, "_execute_command_sync", fake_execute)
+
+    observation = executor._execute_action_sync(
+        ElementInteractionAction(
+            action="click",
+            element_id="15",
+            highlight_snapshot_id=195,
+            tab_id=456,
+            conversation_id="conv-repeat-click",
+        )
+    )
+
+    assert observation.success is True
+    assert observation.message == "Clicked element: 15"
+    assert observation.pending_confirmation is None
+    assert isinstance(captured["command"], ClickElementCommand)
+    assert captured["command"].element_id == "15"
+    assert captured["command"].highlight_snapshot_id == 195
+    assert captured["command"].tab_id == 456
+    assert executor._get_pending_confirmation() is None
+
+
+def test_repeat_click_does_not_shortcut_cache_miss_placeholder(monkeypatch) -> None:
+    executor = BrowserExecutor()
+    executor.conversation_id = "conv-repeat-placeholder"
+    executor._remember_confirmed_action_html(
+        "click", "<element not found in cache>"
+    )
+
+    monkeypatch.setattr(
+        executor,
+        "_get_element_full_html",
+        lambda element_id, highlight_snapshot_id: (
+            "<element not found in cache>",
+            "data:image/png;base64,pending",
+        ),
+    )
+    monkeypatch.setattr(
+        executor,
+        "_execute_command_sync",
+        lambda command: (_ for _ in ()).throw(AssertionError("should stay in 2PC")),
+    )
+
+    observation = executor._execute_action_sync(
+        ElementInteractionAction(
+            action="click",
+            element_id="15",
+            highlight_snapshot_id=195,
+            conversation_id="conv-repeat-placeholder",
+        )
+    )
+
+    assert observation.success is True
+    assert observation.message == "Click action pending confirmation for element: 15"
+    assert observation.pending_confirmation is not None
+    assert observation.pending_confirmation["full_html"] == "<element not found in cache>"
+
+
+def test_confirmed_action_html_lru_keeps_only_most_recent_ten_entries() -> None:
+    executor = BrowserExecutor()
+    executor.conversation_id = "conv-repeat-lru"
+
+    for i in range(11):
+        executor._remember_confirmed_action_html("click", f"<button>Action {i}</button>")
+
+    lru = executor._get_confirmed_action_html_lru("click")
+
+    assert len(lru) == 10
+    assert "<button>Action 0</button>" not in lru
+    assert "<button>Action 10</button>" in lru
+
+
+def test_repeat_keyboard_input_with_confirmed_html_executes_without_pending_confirmation(
+    monkeypatch,
+) -> None:
+    executor = BrowserExecutor()
+    executor.conversation_id = "conv-repeat-input"
+    executor._remember_confirmed_action_html(
+        "keyboard_input", '<input type="text" id="search-input" value="fed" />'
+    )
+
+    monkeypatch.setattr(
+        executor,
+        "_get_element_full_html",
+        lambda element_id, highlight_snapshot_id: (
+            '<input type="text" id="search-input" value="fed" />',
+            "data:image/png;base64,input-repeat",
+        ),
+    )
+
+    captured = {}
+
+    def fake_execute(command):
+        captured["command"] = command
+        return {
+            "success": True,
+            "data": {"screenshot": "data:image/png;base64,input-direct"},
+        }
+
+    monkeypatch.setattr(executor, "_execute_command_sync", fake_execute)
+
+    observation = executor._execute_action_sync(
+        ElementInteractionAction(
+            action="keyboard_input",
+            element_id="inp123",
+            highlight_snapshot_id=17,
+            text="hello",
+            tab_id=321,
+            conversation_id="conv-repeat-input",
+        )
+    )
+
+    assert observation.success is True
+    assert observation.message == "Input text to element: inp123"
+    assert observation.pending_confirmation is None
+    assert captured["command"].element_id == "inp123"
+    assert captured["command"].highlight_snapshot_id == 17
+    assert captured["command"].text == "hello"
+    assert captured["command"].tab_id == 321
+    assert executor._get_pending_confirmation() is None
+
+
+def test_confirmed_click_html_does_not_skip_keyboard_input_confirmation(
+    monkeypatch,
+) -> None:
+    executor = BrowserExecutor()
+    executor.conversation_id = "conv-input-action-isolation"
+    executor._remember_confirmed_action_html(
+        "click", '<input type="text" id="search-input" value="fed" />'
+    )
+
+    monkeypatch.setattr(
+        executor,
+        "_get_element_full_html",
+        lambda element_id, highlight_snapshot_id: (
+            '<input type="text" id="search-input" value="fed" />',
+            "data:image/png;base64,input-pending",
+        ),
+    )
+    monkeypatch.setattr(
+        executor,
+        "_execute_command_sync",
+        lambda command: (_ for _ in ()).throw(AssertionError("should stay in 2PC")),
+    )
+
+    observation = executor._execute_action_sync(
+        ElementInteractionAction(
+            action="keyboard_input",
+            element_id="inp123",
+            highlight_snapshot_id=17,
+            text="hello",
+            conversation_id="conv-input-action-isolation",
+        )
+    )
+
+    assert observation.success is True
+    assert (
+        observation.message
+        == "Keyboard input action pending confirmation for element: inp123"
+    )
+    assert observation.pending_confirmation is not None
+    assert observation.pending_confirmation["action_type"] == "keyboard_input"
+
+
 def test_confirm_keyboard_input_reports_nested_extension_error(monkeypatch) -> None:
     executor = BrowserExecutor()
     executor.conversation_id = "conv-input-error"
diff --git a/server/tests/unit/test_base_classes.py b/server/tests/unit/test_base_classes.py
index 0793763..32598e5 100644
--- a/server/tests/unit/test_base_classes.py
+++ b/server/tests/unit/test_base_classes.py
@@ -175,11 +175,11 @@ def test_pending_confirmation_includes_follow_up_command(self) -> None:
         text = _text_content(observation)
 
         assert "## Pending Confirmation" in text
-        assert "Inspect the screenshot first." in text
         assert '{"action": "confirm_click"}' in text
         assert '"highlight_snapshot_id"' not in text
         assert '"element_id"' not in text
         assert "**Element ID**: a1b2c3" in text
+        assert "**Action Type**: click" in text
 
     def test_pending_keyboard_confirmation_uses_matching_follow_up_command(self) -> None:
         observation = OpenBrowserObservation(
@@ -197,6 +197,7 @@ def test_pending_keyboard_confirmation_uses_matching_follow_up_command(self) ->
         assert '"highlight_snapshot_id"' not in text
         assert '"element_id"' not in text
         assert "**Element ID**: inp789" in text
+        assert "**Action Type**: keyboard_input" in text
 
     def test_pending_confirmation_with_screenshot_is_image_first_and_text_minimal(self) -> None:
         observation = OpenBrowserObservation(

From 6e80a552f59347d8a3623f9871330ad74e5f31ad Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Fri, 27 Mar 2026 12:36:35 +0800
Subject: [PATCH 7/7] Stabilize confirmation flow and test coverage

---
 eval/evaluate_browser_agent.py                | 14 ++-
 .../background-cleanup-regression.test.ts     |  8 +-
 .../element-actions-regression.test.ts        |  8 +-
 .../src/__tests__/highlight-detection.test.ts | 20 +++--
 .../__tests__/highlight-integration.test.ts   | 17 ++--
 .../__tests__/highlight-screenshot.test.ts    |  8 +-
 extension/src/background/index.ts             | 25 +++---
 .../src/commands/__tests__/element-id.test.ts | 22 +++--
 .../__tests__/single-highlight.test.ts        |  4 +-
 .../__tests__/tab-manager-cleanup.test.ts     | 10 +--
 extension/src/commands/element-actions.ts     | 27 ++++--
 extension/src/commands/element-cache.ts       | 87 +++++++++++++++----
 extension/src/commands/screenshot.ts          |  8 +-
 extension/src/utils/collision-detection.ts    | 25 +++---
 server/agent/api.py                           |  1 +
 server/agent/tools/base.py                    |  4 +-
 server/agent/tools/browser_executor.py        | 35 ++++----
 server/agent/tools/help_tool.py               |  8 +-
 server/core/session_manager.py                | 42 ++++++---
 .../integration/test_element_operations.py    |  4 +-
 .../tests/unit/test_agent_browser_executor.py | 35 +++++---
 server/tests/unit/test_base_classes.py        |  8 +-
 server/tests/unit/test_command_models.py      | 21 ++++-
 server/tests/unit/test_eval_client.py         |  4 +-
 server/tests/unit/test_prompt_contracts.py    | 70 +++++++++++----
 server/tests/unit/test_screenshot_behavior.py |  4 +-
 server/tests/unit/test_state.py               | 31 ++++++-
 .../tests/unit/test_tool_prompt_profiles.py   | 84 +++++++++++++-----
 28 files changed, 437 insertions(+), 197 deletions(-)

diff --git a/eval/evaluate_browser_agent.py b/eval/evaluate_browser_agent.py
index 7666549..d8686c8 100644
--- a/eval/evaluate_browser_agent.py
+++ b/eval/evaluate_browser_agent.py
@@ -546,6 +546,7 @@ def cleanup_managed_tabs(self, conversation_id: str) -> bool:
 
         return all_closed
 
+
 class EvalServerClient:
     """Client for evaluation server tracking API"""
 
@@ -609,7 +610,8 @@ def start_openbrowser(self) -> bool:
                 return True
 
             root_dir = EVAL_DIR.parent
-            logger.error(f"""
+            logger.error(
+                f"""
 ❌ OpenBrowser server is not running!
    Please start the OpenBrowser server manually with:
 
@@ -617,7 +619,8 @@ def start_openbrowser(self) -> bool:
    uv run local-chrome-server serve
 
    The server should start on port 8765 (REST API) and 8766 (WebSocket).
-""")
+"""
+            )
             return False
 
         except Exception as e:
@@ -634,7 +637,8 @@ def start_eval_server(self) -> bool:
 
             eval_dir = EVAL_DIR
             root_dir = EVAL_DIR.parent
-            logger.error(f"""
+            logger.error(
+                f"""
 ❌ Eval server is not running!
    Please start the eval server manually with:
 
@@ -646,7 +650,8 @@ def start_eval_server(self) -> bool:
    uv run python eval/server.py
 
    The server should start on port 16605.
-""")
+"""
+            )
             return False
 
         except Exception as e:
@@ -1397,6 +1402,7 @@ def _check_count_min_condition(
 
     def _event_matches_expected(self, event: Dict, expected: Dict) -> bool:
         """Check if a track event matches expected criteria"""
+
         def normalize_text(value: Any) -> str:
             return str(value or "").casefold()
 
diff --git a/extension/src/__tests__/background-cleanup-regression.test.ts b/extension/src/__tests__/background-cleanup-regression.test.ts
index 22fe2cb..7938fe4 100644
--- a/extension/src/__tests__/background-cleanup-regression.test.ts
+++ b/extension/src/__tests__/background-cleanup-regression.test.ts
@@ -19,12 +19,16 @@ describe('Background cleanup regressions', () => {
 
   test('tab close events are wired into browser-state cleanup', () => {
     expect(backgroundSource).toContain('tabManager.addTabClosedListener');
-    expect(backgroundSource).toContain('cleanupTabState(conversationId, tabId);');
+    expect(backgroundSource).toContain(
+      'cleanupTabState(conversationId, tabId);',
+    );
   });
 
   test('swipe screenshots reuse tab-view warmup capture options', () => {
     expect(backgroundSource).toContain("case 'swipe_element': {");
     expect(backgroundSource).toContain('          900,');
-    expect(backgroundSource).toContain('          TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS,');
+    expect(backgroundSource).toContain(
+      '          TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS,',
+    );
   });
 });
diff --git a/extension/src/__tests__/element-actions-regression.test.ts b/extension/src/__tests__/element-actions-regression.test.ts
index e1ec551..845b8ea 100644
--- a/extension/src/__tests__/element-actions-regression.test.ts
+++ b/extension/src/__tests__/element-actions-regression.test.ts
@@ -55,7 +55,9 @@ describe('Element action regressions', () => {
   });
 
   test('performElementSwipe includes gesture fallback and settle wait', () => {
-    expect(elementActionsSource).toContain('async function performGestureSwipe');
+    expect(elementActionsSource).toContain(
+      'async function performGestureSwipe',
+    );
     expect(elementActionsSource).toContain('await waitForSwipeToSettle(');
     expect(elementActionsSource).toContain("stepMethod = 'gesture';");
   });
@@ -71,6 +73,8 @@ describe('Element action regressions', () => {
     expect(elementActionsSource).toContain('currentApi.slideNext(0)');
     expect(elementActionsSource).toContain('currentApi.slidePrev(0)');
     expect(elementActionsSource).toContain('currentApi.scrollNext(true)');
-    expect(elementActionsSource).toContain('currentApi.slideTo(targetIndex, 0)');
+    expect(elementActionsSource).toContain(
+      'currentApi.slideTo(targetIndex, 0)',
+    );
   });
 });
diff --git a/extension/src/__tests__/highlight-detection.test.ts b/extension/src/__tests__/highlight-detection.test.ts
index eb0c6c6..cd080b8 100644
--- a/extension/src/__tests__/highlight-detection.test.ts
+++ b/extension/src/__tests__/highlight-detection.test.ts
@@ -29,7 +29,12 @@ function createElement(
 describe('highlight-detection helpers', () => {
   test('normalizeHighlightKeywords trims, lowercases, removes whitespace, and deduplicates', () => {
     expect(
-      normalizeHighlightKeywords([' Like ', 'like', ' REPLY ', " John's   reply "]),
+      normalizeHighlightKeywords([
+        ' Like ',
+        'like',
+        ' REPLY ',
+        " John's   reply ",
+      ]),
     ).toEqual(['like', 'reply', "john'sreply"]);
   });
 
@@ -75,7 +80,7 @@ describe('highlight-detection helpers', () => {
           selector: 'button.reply-target',
           text: "John's reply",
           searchText: "john ' s    reply",
-          html: '<button>John <span>\'s</span> reply</button>',
+          html: "<button>John <span>'s</span> reply</button>",
         }),
         createElement({
           id: 'reply456',
@@ -133,7 +138,9 @@ describe('highlight-detection helpers', () => {
     const script = buildHighlightDetectionScript({ elementType: 'inputable' });
 
     expect(script).toContain('return el.isContentEditable;');
-    expect(script).not.toContain("return el.getAttribute('contenteditable') === 'true';");
+    expect(script).not.toContain(
+      "return el.getAttribute('contenteditable') === 'true';",
+    );
   });
 
   test("buildHighlightDetectionScript keeps 'any' candidate selection across all element types", () => {
@@ -216,7 +223,10 @@ describe('highlight-detection helpers', () => {
   test('buildHighlightDetectionScript keeps semantic controls clickable in swipe regions', () => {
     const script = buildHighlightDetectionScript({ elementType: 'any' });
     const start = script.indexOf('function toInteractiveElement');
-    const end = script.indexOf('function countVisibleClickableCandidates', start);
+    const end = script.indexOf(
+      'function countVisibleClickableCandidates',
+      start,
+    );
     const toInteractiveElementSource = script.slice(start, end);
 
     expect(script).toContain('function isSemanticControlElement');
@@ -230,7 +240,7 @@ describe('highlight-detection helpers', () => {
       "interactionHints.includes('swipable')",
     );
     expect(toInteractiveElementSource).toContain(
-      "isScrollableCandidate(candidate.element)",
+      'isScrollableCandidate(candidate.element)',
     );
     expect(toInteractiveElementSource).toContain("? 'scrollable'");
   });
diff --git a/extension/src/__tests__/highlight-integration.test.ts b/extension/src/__tests__/highlight-integration.test.ts
index 2122ff2..2dfdb57 100644
--- a/extension/src/__tests__/highlight-integration.test.ts
+++ b/extension/src/__tests__/highlight-integration.test.ts
@@ -193,7 +193,6 @@ describe('Highlight Integration', () => {
         expect(elem.labelPosition).toBeDefined();
       }
     });
-
   });
 
   describe('Collision detection', () => {
@@ -370,12 +369,7 @@ describe('Highlight Integration', () => {
 
       const result = sortElementsByVisualOrder(elements);
 
-      expect(result.map((element) => element.id)).toEqual([
-        'a',
-        'b',
-        'd',
-        'c',
-      ]);
+      expect(result.map((element) => element.id)).toEqual(['a', 'b', 'd', 'c']);
     });
 
     test('should handle empty elements array', () => {
@@ -461,12 +455,13 @@ describe('Highlight Integration', () => {
         elements.push(createElement(`elem${i}`, 'clickable', x, y, 80, 30));
       }
 
-      const startTime = Date.now();
+      const startTime = performance.now();
       const result = selectCollisionFreePage(elements, 1);
-      const duration = Date.now() - startTime;
+      const duration = performance.now() - startTime;
 
-      // Should complete in reasonable time (< 100ms for 100 elements)
-      expect(duration).toBeLessThan(100);
+      // Keep this as a coarse smoke test rather than a microbenchmark.
+      // Bun's wall-clock timing varies noticeably under full-suite load.
+      expect(duration).toBeLessThan(1000);
 
       // Should return some elements
       expect(result.length).toBeGreaterThan(0);
diff --git a/extension/src/__tests__/highlight-screenshot.test.ts b/extension/src/__tests__/highlight-screenshot.test.ts
index f4c500d..0f95377 100644
--- a/extension/src/__tests__/highlight-screenshot.test.ts
+++ b/extension/src/__tests__/highlight-screenshot.test.ts
@@ -55,9 +55,13 @@ describe('Highlight Screenshot', () => {
     expect(HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS.maxOutputHeight).toBe(1080);
     expect(HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS.warmupBeforeCapture).toBe(true);
     expect(HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS.warmupMaxAttempts).toBe(2);
-    expect(HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS.settleBeforeCapture).toBeUndefined();
+    expect(
+      HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS.settleBeforeCapture,
+    ).toBeUndefined();
     expect(TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS.warmupBeforeCapture).toBe(true);
     expect(TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS.warmupMaxAttempts).toBe(3);
-    expect(TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS.settleBeforeCapture).toBeUndefined();
+    expect(
+      TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS.settleBeforeCapture,
+    ).toBeUndefined();
   });
 });
diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts
index 965ad65..ce63ef3 100644
--- a/extension/src/background/index.ts
+++ b/extension/src/background/index.ts
@@ -117,7 +117,9 @@ function buildStoredHighlightPages(options: {
       viewportWidth,
       viewportHeight,
     );
-    pages.push(assignSequentialElementIds(sortElementsByVisualOrder(pageElements)));
+    pages.push(
+      assignSequentialElementIds(sortElementsByVisualOrder(pageElements)),
+    );
   }
 
   return pages;
@@ -138,7 +140,7 @@ function buildSnapshotPageRefreshScript(options: {
   return `
     (() => {
       const expectedDocumentId = ${JSON.stringify(expectedDocumentId || '')};
-      const highlightSnapshotId = ${highlightSnapshotId ?? "null"};
+      const highlightSnapshotId = ${highlightSnapshotId ?? 'null'};
       const refreshTargets = ${JSON.stringify(refreshTargets)};
 
       function normalizeIdentityWhitespace(value, maxLength = 240) {
@@ -1883,10 +1885,7 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
         const highlightSnapshotId = command.highlight_snapshot_id;
         const requestedKeywords = normalizeHighlightKeywords(keywords);
 
-        if (
-          highlightSnapshotId !== undefined &&
-          highlightSnapshotId !== null
-        ) {
+        if (highlightSnapshotId !== undefined && highlightSnapshotId !== null) {
           const baseSnapshot = elementCache.getSnapshotPage(
             conversationId,
             activeTabId,
@@ -1900,7 +1899,9 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
             };
           }
 
-          const cachedKeywords = normalizeHighlightKeywords(baseSnapshot.keywords);
+          const cachedKeywords = normalizeHighlightKeywords(
+            baseSnapshot.keywords,
+          );
           if (baseSnapshot.elementType !== elementType) {
             return {
               success: false,
@@ -2122,9 +2123,7 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
             1;
           const viewportWidth = screenshotResult.metadata?.viewportWidth || 0;
           const viewportHeight = screenshotResult.metadata?.viewportHeight || 0;
-          console.log(
-            `📐 [HighlightElements] Image scale: ${imageScale}`,
-          );
+          console.log(`📐 [HighlightElements] Image scale: ${imageScale}`);
           console.log(
             `📐 [HighlightElements] Viewport: ${viewportWidth}x${viewportHeight} CSS pixels`,
           );
@@ -2889,9 +2888,9 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
               JSON.stringify(freshBbox),
             );
           } else if (
-            bboxResult.success
-            && bboxResult.result?.value
-            && bboxResult.result.value.ok === false
+            bboxResult.success &&
+            bboxResult.result?.value &&
+            bboxResult.result.value.ok === false
           ) {
             return {
               success: false,
diff --git a/extension/src/commands/__tests__/element-id.test.ts b/extension/src/commands/__tests__/element-id.test.ts
index 6c95cd8..632fca1 100644
--- a/extension/src/commands/__tests__/element-id.test.ts
+++ b/extension/src/commands/__tests__/element-id.test.ts
@@ -25,7 +25,11 @@ describe('element-id', () => {
     ]);
 
     expect(result.map((element) => element.id)).toEqual(['1', '2', '3']);
-    expect(result.map((element) => element.selector)).toEqual(['#a', '#b', '#c']);
+    expect(result.map((element) => element.selector)).toEqual([
+      '#a',
+      '#b',
+      '#c',
+    ]);
   });
 
   test('does not mutate the caller-owned element objects', () => {
@@ -49,10 +53,7 @@ describe('element-cache highlight snapshots', () => {
       documentId: 'doc-1',
       elementType: 'any',
       totalElements: 2,
-      pages: [
-        [createElement('1', '#page-1')],
-        [createElement('1', '#page-2')],
-      ],
+      pages: [[createElement('1', '#page-1')], [createElement('1', '#page-2')]],
       page: 1,
     });
 
@@ -62,7 +63,12 @@ describe('element-cache highlight snapshots', () => {
       '#page-1',
     ]);
 
-    const lookup = elementCache.getElementById('conv-1', 101, snapshot.snapshotId, '1');
+    const lookup = elementCache.getElementById(
+      'conv-1',
+      101,
+      snapshot.snapshotId,
+      '1',
+    );
     expect(lookup?.element.selector).toBe('#page-1');
     expect(lookup?.documentId).toBe('doc-1');
   });
@@ -102,8 +108,8 @@ describe('element-cache highlight snapshots', () => {
         .selector,
     ).toBe('#first-page');
     expect(
-      elementCache.getElementById('conv-2', 101, page2!.snapshotId, '1')?.element
-        .selector,
+      elementCache.getElementById('conv-2', 101, page2!.snapshotId, '1')
+        ?.element.selector,
     ).toBe('#second-page');
   });
 });
diff --git a/extension/src/commands/__tests__/single-highlight.test.ts b/extension/src/commands/__tests__/single-highlight.test.ts
index 1268896..5e9aa11 100644
--- a/extension/src/commands/__tests__/single-highlight.test.ts
+++ b/extension/src/commands/__tests__/single-highlight.test.ts
@@ -3,9 +3,7 @@ import { describe, expect, test } from 'bun:test';
 import type { InteractiveElement } from '../../types';
 import { calculateConfirmationPreviewLayout } from '../single-highlight';
 
-function createElement(
-  bbox: InteractiveElement['bbox'],
-): InteractiveElement {
+function createElement(bbox: InteractiveElement['bbox']): InteractiveElement {
   return {
     id: 'abc123',
     type: 'clickable',
diff --git a/extension/src/commands/__tests__/tab-manager-cleanup.test.ts b/extension/src/commands/__tests__/tab-manager-cleanup.test.ts
index a7d706c..69ce1b9 100644
--- a/extension/src/commands/__tests__/tab-manager-cleanup.test.ts
+++ b/extension/src/commands/__tests__/tab-manager-cleanup.test.ts
@@ -123,9 +123,8 @@ describe('TabManager cleanup behavior', () => {
     const onTabClosed = mock(() => {});
     manager.addTabClosedListener(onTabClosed);
 
-    const [tabRemovedListener] = (globalThis as any).__tabRemovedListeners as Array<
-      (tabId: number) => void
-    >;
+    const [tabRemovedListener] = (globalThis as any)
+      .__tabRemovedListeners as Array<(tabId: number) => void>;
     tabRemovedListener(11);
 
     expect(onTabClosed).toHaveBeenCalledWith('conv-last-tab', 11);
@@ -133,8 +132,9 @@ describe('TabManager cleanup behavior', () => {
   });
 
   test('cleanup sweep removes sessions whose tracked tabs are already gone', async () => {
-    const chromeTabsQuery = (globalThis as any).chrome.tabs
-      .query as ReturnType<typeof mock>;
+    const chromeTabsQuery = (globalThis as any).chrome.tabs.query as ReturnType<
+      typeof mock
+    >;
     chromeTabsQuery.mockResolvedValue([{ id: 99 }]);
 
     const manager = new TabManager();
diff --git a/extension/src/commands/element-actions.ts b/extension/src/commands/element-actions.ts
index 2871107..fae90a0 100644
--- a/extension/src/commands/element-actions.ts
+++ b/extension/src/commands/element-actions.ts
@@ -435,7 +435,9 @@ export async function performElementClick(
   // STEP 2: Build JavaScript to click with full event sequence
   // ============================================================
   // Escape quotes in selector for safe injection
-  const escapedSelector = escapeForDoubleQuotedJavaScriptString(element.selector);
+  const escapedSelector = escapeForDoubleQuotedJavaScriptString(
+    element.selector,
+  );
   const escapedDocumentId = escapeForDoubleQuotedJavaScriptString(
     cachedElement.documentId,
   );
@@ -726,7 +728,9 @@ export async function performElementHover(
   // ============================================================
   // STEP 2: Build JavaScript to dispatch hover events
   // ============================================================
-  const escapedSelector = escapeForDoubleQuotedJavaScriptString(element.selector);
+  const escapedSelector = escapeForDoubleQuotedJavaScriptString(
+    element.selector,
+  );
   const escapedDocumentId = escapeForDoubleQuotedJavaScriptString(
     cachedElement.documentId,
   );
@@ -1001,7 +1005,8 @@ export async function performElementScroll(
         success: false,
         elementId,
         scrolled: false,
-        error: 'highlight_snapshot_id is required when scrolling a highlighted element.',
+        error:
+          'highlight_snapshot_id is required when scrolling a highlighted element.',
       };
     }
 
@@ -1025,7 +1030,9 @@ export async function performElementScroll(
     console.log(
       `✅ [ElementScroll] Found element: selector="${element.selector}"`,
     );
-    const escapedSelector = escapeForDoubleQuotedJavaScriptString(element.selector);
+    const escapedSelector = escapeForDoubleQuotedJavaScriptString(
+      element.selector,
+    );
     const escapedDocumentId = escapeForDoubleQuotedJavaScriptString(
       cachedElement.documentId,
     );
@@ -1361,7 +1368,9 @@ export async function performElementSwipe(
     `✅ [ElementSwipe] Found element: selector="${element.selector}"`,
   );
 
-  const escapedSelector = escapeForDoubleQuotedJavaScriptString(element.selector);
+  const escapedSelector = escapeForDoubleQuotedJavaScriptString(
+    element.selector,
+  );
   const escapedDocumentId = escapeForDoubleQuotedJavaScriptString(
     cachedElement.documentId,
   );
@@ -2538,7 +2547,9 @@ export async function performKeyboardInput(
   // STEP 2: Build JavaScript to input text
   // ============================================================
   // Escape quotes and backslashes in selector and text for safe injection
-  const escapedSelector = escapeForDoubleQuotedJavaScriptString(element.selector);
+  const escapedSelector = escapeForDoubleQuotedJavaScriptString(
+    element.selector,
+  );
   const escapedDocumentId = escapeForDoubleQuotedJavaScriptString(
     cachedElement.documentId,
   );
@@ -2842,7 +2853,9 @@ export async function performElementSelect(
   // STEP 2: Build JavaScript to select option(s)
   // ============================================================
   // Escape quotes and backslashes in selector for safe injection
-  const escapedSelector = escapeForDoubleQuotedJavaScriptString(element.selector);
+  const escapedSelector = escapeForDoubleQuotedJavaScriptString(
+    element.selector,
+  );
   const escapedDocumentId = escapeForDoubleQuotedJavaScriptString(
     cachedElement.documentId,
   );
diff --git a/extension/src/commands/element-cache.ts b/extension/src/commands/element-cache.ts
index f16787f..a257817 100644
--- a/extension/src/commands/element-cache.ts
+++ b/extension/src/commands/element-cache.ts
@@ -122,7 +122,9 @@ class ElementCacheImpl {
     for (const [snapshotKey, snapshot] of this.snapshotViews.entries()) {
       if (this.isExpired(snapshot.createdAt)) {
         this.snapshotViews.delete(snapshotKey);
-        console.log(`⏰ [ElementCache] Snapshot expired for key ${snapshotKey}`);
+        console.log(
+          `⏰ [ElementCache] Snapshot expired for key ${snapshotKey}`,
+        );
         continue;
       }
 
@@ -140,7 +142,10 @@ class ElementCacheImpl {
         continue;
       }
 
-      if (!activeInventoryKeys.has(inventoryKey) && this.isExpired(inventory.createdAt)) {
+      if (
+        !activeInventoryKeys.has(inventoryKey) &&
+        this.isExpired(inventory.createdAt)
+      ) {
         inventoryKeysToDelete.push(inventoryKey);
       }
     }
@@ -195,8 +200,16 @@ class ElementCacheImpl {
     const inventoryId = this.nextInventoryId++;
     const snapshotId = this.nextSnapshotId++;
     const now = Date.now();
-    const inventoryKey = this.buildInventoryKey(conversationId, tabId, inventoryId);
-    const snapshotKey = this.buildSnapshotKey(conversationId, tabId, snapshotId);
+    const inventoryKey = this.buildInventoryKey(
+      conversationId,
+      tabId,
+      inventoryId,
+    );
+    const snapshotKey = this.buildSnapshotKey(
+      conversationId,
+      tabId,
+      snapshotId,
+    );
 
     this.inventories.set(inventoryKey, {
       tabId,
@@ -223,7 +236,11 @@ class ElementCacheImpl {
 
     this.pruneInventoriesForTab(conversationId, tabId);
 
-    const snapshotPage = this.getSnapshotPage(conversationId, tabId, snapshotId);
+    const snapshotPage = this.getSnapshotPage(
+      conversationId,
+      tabId,
+      snapshotId,
+    );
     if (!snapshotPage) {
       throw new Error(
         `Failed to retrieve newly stored highlight snapshot ${snapshotId}`,
@@ -244,13 +261,21 @@ class ElementCacheImpl {
   ): HighlightSnapshotPage | undefined {
     this.cleanupExpired();
 
-    const baseSnapshot = this.getSnapshotView(conversationId, tabId, baseSnapshotId);
+    const baseSnapshot = this.getSnapshotView(
+      conversationId,
+      tabId,
+      baseSnapshotId,
+    );
     if (!baseSnapshot) {
       return undefined;
     }
 
     const snapshotId = this.nextSnapshotId++;
-    const snapshotKey = this.buildSnapshotKey(conversationId, tabId, snapshotId);
+    const snapshotKey = this.buildSnapshotKey(
+      conversationId,
+      tabId,
+      snapshotId,
+    );
     const now = Date.now();
 
     this.snapshotViews.set(snapshotKey, {
@@ -260,12 +285,20 @@ class ElementCacheImpl {
       page,
     });
 
-    const inventory = this.getInventory(conversationId, tabId, baseSnapshot.inventoryId);
+    const inventory = this.getInventory(
+      conversationId,
+      tabId,
+      baseSnapshot.inventoryId,
+    );
     if (inventory) {
       this.touchInventory(inventory);
     }
 
-    const snapshotPage = this.getSnapshotPage(conversationId, tabId, snapshotId);
+    const snapshotPage = this.getSnapshotPage(
+      conversationId,
+      tabId,
+      snapshotId,
+    );
     if (snapshotPage) {
       console.log(
         `📄 [ElementCache] Forked snapshot ${snapshotId} from base ${baseSnapshotId} for conversation ${conversationId}, tab ${tabId}, page ${page}`,
@@ -286,7 +319,11 @@ class ElementCacheImpl {
       return undefined;
     }
 
-    const inventory = this.getInventory(conversationId, tabId, snapshot.inventoryId);
+    const inventory = this.getInventory(
+      conversationId,
+      tabId,
+      snapshot.inventoryId,
+    );
     if (!inventory) {
       return undefined;
     }
@@ -318,7 +355,11 @@ class ElementCacheImpl {
     snapshotId: number,
     elementId: string,
   ): CachedElementLookup | undefined {
-    const snapshotPage = this.getSnapshotPage(conversationId, tabId, snapshotId);
+    const snapshotPage = this.getSnapshotPage(
+      conversationId,
+      tabId,
+      snapshotId,
+    );
     if (!snapshotPage) {
       return undefined;
     }
@@ -352,7 +393,11 @@ class ElementCacheImpl {
       return undefined;
     }
 
-    const snapshotKey = this.buildSnapshotKey(conversationId, tabId, snapshotId);
+    const snapshotKey = this.buildSnapshotKey(
+      conversationId,
+      tabId,
+      snapshotId,
+    );
     const snapshot = this.snapshotViews.get(snapshotKey);
     if (!snapshot) {
       return undefined;
@@ -360,7 +405,9 @@ class ElementCacheImpl {
 
     if (snapshot.tabId !== tabId || this.isExpired(snapshot.createdAt)) {
       this.snapshotViews.delete(snapshotKey);
-      console.log(`⏰ [ElementCache] Snapshot expired or mismatched for key ${snapshotKey}`);
+      console.log(
+        `⏰ [ElementCache] Snapshot expired or mismatched for key ${snapshotKey}`,
+      );
       return undefined;
     }
 
@@ -376,7 +423,11 @@ class ElementCacheImpl {
       return undefined;
     }
 
-    const inventoryKey = this.buildInventoryKey(conversationId, tabId, inventoryId);
+    const inventoryKey = this.buildInventoryKey(
+      conversationId,
+      tabId,
+      inventoryId,
+    );
     const inventory = this.inventories.get(inventoryKey);
     if (!inventory) {
       return undefined;
@@ -400,11 +451,11 @@ class ElementCacheImpl {
         ? `${conversationId}:${tabId}:snapshot:`
         : `${conversationId}:`;
 
-    const inventoryKeysToDelete = Array.from(this.inventories.keys()).filter((key) =>
-      key.startsWith(inventoryPrefix),
+    const inventoryKeysToDelete = Array.from(this.inventories.keys()).filter(
+      (key) => key.startsWith(inventoryPrefix),
     );
-    const snapshotKeysToDelete = Array.from(this.snapshotViews.keys()).filter((key) =>
-      key.startsWith(snapshotPrefix),
+    const snapshotKeysToDelete = Array.from(this.snapshotViews.keys()).filter(
+      (key) => key.startsWith(snapshotPrefix),
     );
 
     for (const key of inventoryKeysToDelete) {
diff --git a/extension/src/commands/screenshot.ts b/extension/src/commands/screenshot.ts
index e66d066..fa3005e 100644
--- a/extension/src/commands/screenshot.ts
+++ b/extension/src/commands/screenshot.ts
@@ -702,12 +702,16 @@ async function inspectImageDimensions(dataUrl: string): Promise<{
   height: number;
 }> {
   if (!dataUrl.startsWith('data:')) {
-    throw new Error('[Screenshot] Invalid data URL while inspecting dimensions');
+    throw new Error(
+      '[Screenshot] Invalid data URL while inspecting dimensions',
+    );
   }
 
   const commaIndex = dataUrl.indexOf(',');
   if (commaIndex === -1) {
-    throw new Error('[Screenshot] Screenshot data URL missing payload separator');
+    throw new Error(
+      '[Screenshot] Screenshot data URL missing payload separator',
+    );
   }
 
   const header = dataUrl.slice(0, commaIndex);
diff --git a/extension/src/utils/collision-detection.ts b/extension/src/utils/collision-detection.ts
index 609fd41..2af05ae 100644
--- a/extension/src/utils/collision-detection.ts
+++ b/extension/src/utils/collision-detection.ts
@@ -287,12 +287,10 @@ function chooseNextCandidate(
 ): (PlacementEvaluation & { candidate: RemainingCandidate }) | null {
   const currentLabelText = String(selected.length + 1);
   let minFeasiblePositions = Number.POSITIVE_INFINITY;
-  let constrainedCandidate:
-    | {
-        candidate: RemainingCandidate;
-        feasiblePositions: LabelPosition[];
-      }
-    | null = null;
+  let constrainedCandidate: {
+    candidate: RemainingCandidate;
+    feasiblePositions: LabelPosition[];
+  } | null = null;
 
   for (const candidate of remaining) {
     const feasiblePositions = getFeasiblePositions(
@@ -342,7 +340,8 @@ function chooseLeastBlockingPlacement(
 ): PlacementEvaluation {
   const currentLabelText = String(selected.length + 1);
   const futureCandidates = remaining.filter(
-    (remainingCandidate) => remainingCandidate.sourceIndex !== candidate.sourceIndex,
+    (remainingCandidate) =>
+      remainingCandidate.sourceIndex !== candidate.sourceIndex,
   );
   let bestPlacement: PlacementEvaluation | null = null;
 
@@ -392,11 +391,13 @@ function chooseLeastBlockingPlacement(
     }
   }
 
-  return bestPlacement ?? {
-    position: POSITION_PRIORITY[0],
-    blockedCandidateCount: Number.POSITIVE_INFINITY,
-    totalFutureOptions: Number.NEGATIVE_INFINITY,
-  };
+  return (
+    bestPlacement ?? {
+      position: POSITION_PRIORITY[0],
+      blockedCandidateCount: Number.POSITIVE_INFINITY,
+      totalFutureOptions: Number.NEGATIVE_INFINITY,
+    }
+  );
 }
 
 function getFeasiblePositions(
diff --git a/server/agent/api.py b/server/agent/api.py
index 33c7d92..681fb1b 100644
--- a/server/agent/api.py
+++ b/server/agent/api.py
@@ -437,6 +437,7 @@ def initialize_agent():
         from .tools.highlight_tool import HighlightTool
         from .tools.element_interaction_tool import ElementInteractionTool
         from .tools.dialog_tool import DialogTool
+
         logger.info(
             "4 focused OpenBrowser tools registered: tab, highlight, element_interaction, dialog"
         )
diff --git a/server/agent/tools/base.py b/server/agent/tools/base.py
index 62615ba..2d945af 100644
--- a/server/agent/tools/base.py
+++ b/server/agent/tools/base.py
@@ -113,7 +113,9 @@ def _pending_confirmation_llm_content(
         action_type = str(pending.get("action_type", "unknown"))
         element_id = str(pending.get("element_id", "unknown"))
         highlight_snapshot_id = str(
-            pending.get("highlight_snapshot_id", self.highlight_snapshot_id or "unknown")
+            pending.get(
+                "highlight_snapshot_id", self.highlight_snapshot_id or "unknown"
+            )
         )
         confirm_cmd = f'{{"action": "confirm_{action_type}"}}'
 
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index 65f1b8e..4c9fa87 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -101,9 +101,9 @@ def __init__(self):
         # Pending confirmations per conversation for 2PC actions.
         self.pending_confirmations: Dict[str, Dict[str, Any]] = {}
         # Recently confirmed element targets keyed by action_type then exact element HTML.
-        self.confirmed_action_html_lru: Dict[
-            str, Dict[str, OrderedDict[str, None]]
-        ] = {}
+        self.confirmed_action_html_lru: Dict[str, Dict[str, OrderedDict[str, None]]] = (
+            {}
+        )
 
     def __call__(
         self, action: OpenBrowserAction, conversation
@@ -394,9 +394,7 @@ def _execute_element_interaction_action(
                 conversation_id=self.conversation_id,
                 tab_id=action.tab_id,
             )
-            result_dict = self._execute_element_command(
-                command, "hover element"
-            )
+            result_dict = self._execute_element_command(command, "hover element")
             return self._build_observation_from_result(
                 result_dict,
                 f"Hovered element: {action.element_id}",
@@ -418,9 +416,7 @@ def _execute_element_interaction_action(
                     conversation_id=self.conversation_id,
                     tab_id=action.tab_id,
                 )
-                result_dict = self._execute_element_command(
-                    command, "scroll element"
-                )
+                result_dict = self._execute_element_command(command, "scroll element")
                 return self._build_observation_from_result(
                     result_dict,
                     f"Scrolled element: {action.element_id}",
@@ -473,7 +469,9 @@ def _execute_element_interaction_action(
             if not action.text:
                 raise ValueError("keyboard_input requires text parameter")
             if action.highlight_snapshot_id is None:
-                raise ValueError("keyboard_input requires highlight_snapshot_id parameter")
+                raise ValueError(
+                    "keyboard_input requires highlight_snapshot_id parameter"
+                )
             full_html, screenshot = self._get_element_full_html(
                 action.element_id, action.highlight_snapshot_id
             )
@@ -529,9 +527,7 @@ def _execute_element_interaction_action(
                 conversation_id=self.conversation_id,
                 tab_id=action.tab_id,
             )
-            result_dict = self._execute_element_command(
-                command, "select option"
-            )
+            result_dict = self._execute_element_command(command, "select option")
             return self._build_observation_from_result(
                 result_dict,
                 f"Selected option in element: {action.element_id}",
@@ -685,21 +681,22 @@ def _get_pending_confirmation(self) -> Optional[Dict[str, Any]]:
         """Get pending confirmation for current conversation"""
         return self.pending_confirmations.get(self.conversation_id)
 
-    def _normalize_confirmed_action_html(self, full_html: Optional[str]) -> Optional[str]:
+    def _normalize_confirmed_action_html(
+        self, full_html: Optional[str]
+    ) -> Optional[str]:
         """Normalize cached element HTML used for repeat-click shortcut matching."""
         if not isinstance(full_html, str):
             return None
 
         normalized = full_html.strip()
-        if (
-            not normalized
-            or normalized == ELEMENT_HTML_CACHE_MISS_PLACEHOLDER
-        ):
+        if not normalized or normalized == ELEMENT_HTML_CACHE_MISS_PLACEHOLDER:
             return None
 
         return normalized
 
-    def _get_confirmed_action_html_lru(self, action_type: str) -> OrderedDict[str, None]:
+    def _get_confirmed_action_html_lru(
+        self, action_type: str
+    ) -> OrderedDict[str, None]:
         """Get or create the confirmed-action LRU cache for current conversation."""
         conversation_lru = self.confirmed_action_html_lru.setdefault(
             self.conversation_id, {}
diff --git a/server/agent/tools/help_tool.py b/server/agent/tools/help_tool.py
index 1a87881..a9b451d 100644
--- a/server/agent/tools/help_tool.py
+++ b/server/agent/tools/help_tool.py
@@ -57,9 +57,7 @@ def to_llm_content(self) -> Sequence[TextContent]:
         return [TextContent(text="Pending user's decision")]
 
 
-class PleaseHelpMeExecutor(
-    ToolExecutor[PleaseHelpMeAction, PleaseHelpMeObservation]
-):
+class PleaseHelpMeExecutor(ToolExecutor[PleaseHelpMeAction, PleaseHelpMeObservation]):
     """Executor that pauses the current turn and surfaces a user-help request."""
 
     def __call__(
@@ -76,9 +74,7 @@ def __call__(
         )
 
 
-class PleaseHelpMeTool(
-    ToolDefinition[PleaseHelpMeAction, PleaseHelpMeObservation]
-):
+class PleaseHelpMeTool(ToolDefinition[PleaseHelpMeAction, PleaseHelpMeObservation]):
     """Tool for requesting human intervention."""
 
     name = PLEASE_HELP_ME_TOOL_NAME
diff --git a/server/core/session_manager.py b/server/core/session_manager.py
index c02d146..1e0f3bd 100644
--- a/server/core/session_manager.py
+++ b/server/core/session_manager.py
@@ -122,7 +122,8 @@ def _init_database(self):
             cursor = conn.cursor()
 
             # Create sessions table
-            cursor.execute("""
+            cursor.execute(
+                """
                 CREATE TABLE IF NOT EXISTS sessions (
                     conversation_id TEXT PRIMARY KEY,
                     status TEXT NOT NULL DEFAULT 'idle',
@@ -135,13 +136,15 @@ def _init_database(self):
                     tags TEXT DEFAULT '[]',
                     metadata TEXT DEFAULT '{}'
                 )
-            """)
+            """
+            )
 
             # Run migrations to add missing columns
             self._migrate_database(cursor)
 
             # Create user_messages table
-            cursor.execute("""
+            cursor.execute(
+                """
                 CREATE TABLE IF NOT EXISTS user_messages (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
                     conversation_id TEXT NOT NULL,
@@ -151,10 +154,12 @@ def _init_database(self):
                     FOREIGN KEY (conversation_id) REFERENCES sessions(conversation_id),
                     UNIQUE(conversation_id, message_index)
                 )
-            """)
+            """
+            )
 
             # Create session_events table for SSE event history
-            cursor.execute("""
+            cursor.execute(
+                """
                 CREATE TABLE IF NOT EXISTS session_events (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
                     conversation_id TEXT NOT NULL,
@@ -164,23 +169,30 @@ def _init_database(self):
                     created_at TEXT NOT NULL,
                     FOREIGN KEY (conversation_id) REFERENCES sessions(conversation_id)
                 )
-            """)
+            """
+            )
 
             # Create indexes for faster queries
-            cursor.execute("""
+            cursor.execute(
+                """
                 CREATE INDEX IF NOT EXISTS idx_user_messages_conversation 
                 ON user_messages(conversation_id)
-            """)
+            """
+            )
 
-            cursor.execute("""
+            cursor.execute(
+                """
                 CREATE INDEX IF NOT EXISTS idx_session_events_conversation 
                 ON session_events(conversation_id)
-            """)
+            """
+            )
 
-            cursor.execute("""
+            cursor.execute(
+                """
                 CREATE INDEX IF NOT EXISTS idx_session_events_index 
                 ON session_events(conversation_id, event_index)
-            """)
+            """
+            )
 
             conn.commit()
             conn.close()
@@ -198,9 +210,11 @@ def _migrate_database(self, cursor):
 
             if "first_user_message" not in columns:
                 logger.info("Adding missing column: first_user_message")
-                cursor.execute("""
+                cursor.execute(
+                    """
                     ALTER TABLE sessions ADD COLUMN first_user_message TEXT
-                """)
+                """
+                )
                 logger.info("Migration completed: first_user_message column added")
 
         except Exception as e:
diff --git a/server/tests/integration/test_element_operations.py b/server/tests/integration/test_element_operations.py
index 6b62aff..9b20d89 100644
--- a/server/tests/integration/test_element_operations.py
+++ b/server/tests/integration/test_element_operations.py
@@ -108,8 +108,8 @@ def test_returns_page_local_numeric_ids(
                 assert re.match(
                     r"^\d+$", element_id
                 ), f"Element ID should be numeric: {element_id}"
-                assert (
-                    element_id == str(index)
+                assert element_id == str(
+                    index
                 ), f"Element ID should follow response order: expected {index}, got {element_id}"
 
 
diff --git a/server/tests/unit/test_agent_browser_executor.py b/server/tests/unit/test_agent_browser_executor.py
index 2ea9c7d..eaa07e4 100644
--- a/server/tests/unit/test_agent_browser_executor.py
+++ b/server/tests/unit/test_agent_browser_executor.py
@@ -4,7 +4,11 @@
 
 from server.agent.tools.browser_executor import BrowserExecutor
 from server.agent.tools.element_interaction_tool import ElementInteractionAction
-from server.models.commands import ClickElementCommand, HoverElementCommand, SwipeElementCommand
+from server.models.commands import (
+    ClickElementCommand,
+    HoverElementCommand,
+    SwipeElementCommand,
+)
 
 
 class _FakeResponse:
@@ -103,7 +107,10 @@ def test_keyboard_input_sets_pending_confirmation(monkeypatch) -> None:
     )
 
     assert observation.success is True
-    assert observation.message == "Keyboard input action pending confirmation for element: inp123"
+    assert (
+        observation.message
+        == "Keyboard input action pending confirmation for element: inp123"
+    )
     assert observation.pending_confirmation is not None
     assert observation.pending_confirmation["action_type"] == "keyboard_input"
     assert observation.pending_confirmation["element_id"] == "inp123"
@@ -125,7 +132,10 @@ def test_confirm_click_uses_pending_confirmation_state(monkeypatch) -> None:
 
     def fake_execute(command):
         captured["command"] = command
-        return {"success": True, "data": {"screenshot": "data:image/png;base64,clicked"}}
+        return {
+            "success": True,
+            "data": {"screenshot": "data:image/png;base64,clicked"},
+        }
 
     monkeypatch.setattr(executor, "_execute_command_sync", fake_execute)
 
@@ -151,7 +161,7 @@ def test_repeat_click_with_confirmed_html_executes_without_pending_confirmation(
     executor.conversation_id = "conv-repeat-click"
     executor._remember_confirmed_action_html(
         "click",
-        '<a href="/gbr/articles/opinion-article2.html">Why the Fed Must Act Decisively</a>'
+        '<a href="/gbr/articles/opinion-article2.html">Why the Fed Must Act Decisively</a>',
     )
 
     monkeypatch.setattr(
@@ -197,9 +207,7 @@ def fake_execute(command):
 def test_repeat_click_does_not_shortcut_cache_miss_placeholder(monkeypatch) -> None:
     executor = BrowserExecutor()
     executor.conversation_id = "conv-repeat-placeholder"
-    executor._remember_confirmed_action_html(
-        "click", "<element not found in cache>"
-    )
+    executor._remember_confirmed_action_html("click", "<element not found in cache>")
 
     monkeypatch.setattr(
         executor,
@@ -227,7 +235,9 @@ def test_repeat_click_does_not_shortcut_cache_miss_placeholder(monkeypatch) -> N
     assert observation.success is True
     assert observation.message == "Click action pending confirmation for element: 15"
     assert observation.pending_confirmation is not None
-    assert observation.pending_confirmation["full_html"] == "<element not found in cache>"
+    assert (
+        observation.pending_confirmation["full_html"] == "<element not found in cache>"
+    )
 
 
 def test_confirmed_action_html_lru_keeps_only_most_recent_ten_entries() -> None:
@@ -235,7 +245,9 @@ def test_confirmed_action_html_lru_keeps_only_most_recent_ten_entries() -> None:
     executor.conversation_id = "conv-repeat-lru"
 
     for i in range(11):
-        executor._remember_confirmed_action_html("click", f"<button>Action {i}</button>")
+        executor._remember_confirmed_action_html(
+            "click", f"<button>Action {i}</button>"
+        )
 
     lru = executor._get_confirmed_action_html_lru("click")
 
@@ -371,10 +383,7 @@ def fake_execute(command):
     )
 
     assert observation.success is False
-    assert (
-        observation.error
-        == "Failed to input text: Input element is detached"
-    )
+    assert observation.error == "Failed to input text: Input element is detached"
     assert captured["command"].element_id == "inp123"
     assert captured["command"].highlight_snapshot_id == 17
     assert captured["command"].tab_id == 321
diff --git a/server/tests/unit/test_base_classes.py b/server/tests/unit/test_base_classes.py
index 32598e5..cd67bd2 100644
--- a/server/tests/unit/test_base_classes.py
+++ b/server/tests/unit/test_base_classes.py
@@ -181,7 +181,9 @@ def test_pending_confirmation_includes_follow_up_command(self) -> None:
         assert "**Element ID**: a1b2c3" in text
         assert "**Action Type**: click" in text
 
-    def test_pending_keyboard_confirmation_uses_matching_follow_up_command(self) -> None:
+    def test_pending_keyboard_confirmation_uses_matching_follow_up_command(
+        self,
+    ) -> None:
         observation = OpenBrowserObservation(
             success=True,
             pending_confirmation={
@@ -199,7 +201,9 @@ def test_pending_keyboard_confirmation_uses_matching_follow_up_command(self) ->
         assert "**Element ID**: inp789" in text
         assert "**Action Type**: keyboard_input" in text
 
-    def test_pending_confirmation_with_screenshot_is_image_first_and_text_minimal(self) -> None:
+    def test_pending_confirmation_with_screenshot_is_image_first_and_text_minimal(
+        self,
+    ) -> None:
         observation = OpenBrowserObservation(
             success=True,
             screenshot_data_url="data:image/png;base64,confirm123",
diff --git a/server/tests/unit/test_command_models.py b/server/tests/unit/test_command_models.py
index 03a321e..140e3a3 100644
--- a/server/tests/unit/test_command_models.py
+++ b/server/tests/unit/test_command_models.py
@@ -64,15 +64,21 @@ def test_scroll_rejects_out_of_range_amounts(self, amount: float) -> None:
     def test_keyboard_input_allows_empty_text_for_clear_style_interactions(
         self,
     ) -> None:
-        command = KeyboardInputCommand(element_id="field123", text="")
+        command = KeyboardInputCommand(
+            element_id="field123",
+            highlight_snapshot_id=101,
+            text="",
+        )
 
         assert command.text == ""
+        assert command.highlight_snapshot_id == 101
 
     def test_swipe_defaults_match_carousel_workflow(self) -> None:
-        command = SwipeElementCommand(element_id="carousel1")
+        command = SwipeElementCommand(element_id="carousel1", highlight_snapshot_id=202)
 
         assert command.direction == "next"
         assert command.swipe_count == 1
+        assert command.highlight_snapshot_id == 202
 
     def test_swipe_direction_description_uses_content_semantics(self) -> None:
         description = SwipeElementCommand.model_fields["direction"].description
@@ -84,7 +90,11 @@ def test_swipe_direction_description_uses_content_semantics(self) -> None:
 
     @pytest.mark.parametrize("count", [1, 5])
     def test_swipe_accepts_documented_count_boundaries(self, count: int) -> None:
-        command = SwipeElementCommand(element_id="carousel1", swipe_count=count)
+        command = SwipeElementCommand(
+            element_id="carousel1",
+            highlight_snapshot_id=202,
+            swipe_count=count,
+        )
 
         assert command.swipe_count == count
 
@@ -107,6 +117,7 @@ class TestParseCommandContracts:
                 {
                     "type": "click_element",
                     "element_id": "abc123",
+                    "highlight_snapshot_id": 11,
                     "conversation_id": "conv-1",
                     "browser_id": "browser-1",
                 },
@@ -116,6 +127,7 @@ class TestParseCommandContracts:
                 {
                     "type": "keyboard_input",
                     "element_id": "field123",
+                    "highlight_snapshot_id": 12,
                     "text": "hello",
                     "tab_id": 7,
                     "conversation_id": "conv-2",
@@ -136,6 +148,7 @@ class TestParseCommandContracts:
                 {
                     "type": "swipe_element",
                     "element_id": "carousel1",
+                    "highlight_snapshot_id": 13,
                     "direction": "prev",
                     "swipe_count": 2,
                     "conversation_id": "conv-4",
@@ -153,6 +166,8 @@ def test_parse_command_preserves_routing_metadata(
         assert isinstance(command, expected_type)
         assert command.conversation_id == payload["conversation_id"]
         assert command.browser_id == payload["browser_id"]
+        if "highlight_snapshot_id" in payload:
+            assert command.highlight_snapshot_id == payload["highlight_snapshot_id"]
 
     def test_parse_command_rejects_missing_type(self) -> None:
         with pytest.raises(ValueError, match="must have 'type' field"):
diff --git a/server/tests/unit/test_eval_client.py b/server/tests/unit/test_eval_client.py
index c597d90..dcca19a 100644
--- a/server/tests/unit/test_eval_client.py
+++ b/server/tests/unit/test_eval_client.py
@@ -252,9 +252,7 @@ def test_run_test_cleans_managed_tabs_before_delete(tmp_path) -> None:
     evaluator.openbrowser.create_conversation.return_value = "conv-123"
     evaluator.openbrowser.send_message.return_value = MessageRunResult(events=[])
     evaluator.openbrowser.cleanup_managed_tabs.side_effect = (
-        lambda conversation_id: teardown_calls.append(
-            f"cleanup:{conversation_id}"
-        )
+        lambda conversation_id: teardown_calls.append(f"cleanup:{conversation_id}")
         or False
     )
     evaluator.openbrowser.delete_conversation.side_effect = (
diff --git a/server/tests/unit/test_prompt_contracts.py b/server/tests/unit/test_prompt_contracts.py
index e26a17d..1a8a825 100644
--- a/server/tests/unit/test_prompt_contracts.py
+++ b/server/tests/unit/test_prompt_contracts.py
@@ -72,37 +72,61 @@ def test_highlight_prompt_keeps_icon_targets_on_any_pagination(self) -> None:
         assert "icon-only controls" in description
         assert "Stay on the same `element_type` across pages" in description
         assert "actual button may simply be on the next page" in description
-        assert "Keep generic controls, buttons, links, dense toolbars, and icon-only targets inside `any`" in description
-        assert '`clickable`' not in description
+        assert (
+            "Keep generic controls, buttons, links, dense toolbars, and icon-only targets inside `any`"
+            in description
+        )
+        assert "`clickable`" not in description
 
-    def test_highlight_prompt_requires_exact_text_keywords_and_pagination_before_guessing(self) -> None:
+    def test_highlight_prompt_requires_exact_text_keywords_and_pagination_before_guessing(
+        self,
+    ) -> None:
         description = get_highlight_tool_description()
 
-        assert "Treat pages as reliable collision-free slices of the same candidate set" in description
+        assert (
+            "Treat pages as reliable collision-free slices of the same candidate set"
+            in description
+        )
         assert "Do not jump from a first-page miss to `keywords`" in description
-        assert "Use keywords only for exact literal text characters you can already see on the target itself in the current screenshot" in description
+        assert (
+            "Use keywords only for exact literal text characters you can already see on the target itself in the current screenshot"
+            in description
+        )
         assert '`{"keywords": ["52"]}`' in description
         assert '`["star"]`, `["favorite"]`, or `["bookmark"]`' in description
         assert "DO NOT use synonym bundles like" in description
         assert "Examples of broad search" not in description
         assert "Phase 2: Broad Search" not in description
 
-    def test_highlight_prompt_requires_rehighlight_after_significant_page_change(self) -> None:
+    def test_highlight_prompt_requires_rehighlight_after_significant_page_change(
+        self,
+    ) -> None:
         description = get_highlight_tool_description()
 
         assert "After any significant page-state change" in description
-        assert 'call `highlight` with `element_type: "any"` again before choosing the next element' in description
-        assert "Do not jump straight to `keywords` or another narrower type on that changed page" in description
+        assert (
+            'call `highlight` with `element_type: "any"` again before choosing the next element'
+            in description
+        )
+        assert (
+            "Do not jump straight to `keywords` or another narrower type on that changed page"
+            in description
+        )
 
     def test_highlight_prompt_omits_clickable_mode_from_agent_guidance(self) -> None:
         description = get_highlight_tool_description()
 
-        assert '`clickable`' not in description
+        assert "`clickable`" not in description
 
-    def test_highlight_prompt_requires_click_before_keyboard_input_for_inputable_targets(self) -> None:
+    def test_highlight_prompt_requires_click_before_keyboard_input_for_inputable_targets(
+        self,
+    ) -> None:
         description = get_highlight_tool_description()
 
-        assert "always `click` it first and complete that confirmation before `keyboard_input`" in description
+        assert (
+            "always `click` it first and complete that confirmation before `keyboard_input`"
+            in description
+        )
 
     def test_small_model_highlight_prompt_omits_keywords_guidance(self) -> None:
         with patch.object(
@@ -139,11 +163,19 @@ def test_tab_prompt_points_agents_to_tab_view_for_clean_screenshots(self) -> Non
         assert "tab view" in description
         assert "clean screenshot" in description.lower()
 
-    def test_element_interaction_prompt_requires_click_before_keyboard_input(self) -> None:
+    def test_element_interaction_prompt_requires_click_before_keyboard_input(
+        self,
+    ) -> None:
         description = get_element_interaction_tool_description()
 
-        assert "Always `click` the target first and complete that confirmation before `keyboard_input`." in description
-        assert "only after you already clicked the same input target and completed that click confirmation" in description
+        assert (
+            "Always `click` the target first and complete that confirmation before `keyboard_input`."
+            in description
+        )
+        assert (
+            "only after you already clicked the same input target and completed that click confirmation"
+            in description
+        )
 
     def test_element_interaction_prompt_explains_swipe_semantics(self) -> None:
         description = get_element_interaction_tool_description()
@@ -157,8 +189,14 @@ def test_element_interaction_confirm_examples_use_pending_state_only(self) -> No
 
         assert '{ "action": "confirm_click" }' in description
         assert '{ "action": "confirm_keyboard_input" }' in description
-        assert '{ "action": "confirm_click", "highlight_snapshot_id": 17, "element_id": "3", "tab_id": 123 }' not in description
-        assert '{ "action": "confirm_keyboard_input", "highlight_snapshot_id": 17, "element_id": "1", "tab_id": 123 }' not in description
+        assert (
+            '{ "action": "confirm_click", "highlight_snapshot_id": 17, "element_id": "3", "tab_id": 123 }'
+            not in description
+        )
+        assert (
+            '{ "action": "confirm_keyboard_input", "highlight_snapshot_id": 17, "element_id": "1", "tab_id": 123 }'
+            not in description
+        )
 
     def test_element_interaction_action_schema_explains_swipe_semantics(self) -> None:
         description = ElementInteractionAction.model_fields["direction"].description
diff --git a/server/tests/unit/test_screenshot_behavior.py b/server/tests/unit/test_screenshot_behavior.py
index 001011a..a575bd2 100644
--- a/server/tests/unit/test_screenshot_behavior.py
+++ b/server/tests/unit/test_screenshot_behavior.py
@@ -149,7 +149,9 @@ async def mock_send_cmd(cmd):
     async def test_click_element_returns_screenshot(self, processor):
         """click_element command should return screenshot (from extension)."""
         command = ClickElementCommand(
-            element_id="1", conversation_id="test-conv-5"
+            element_id="1",
+            highlight_snapshot_id=55,
+            conversation_id="test-conv-5",
         )
 
         mock_response = CommandResponse(
diff --git a/server/tests/unit/test_state.py b/server/tests/unit/test_state.py
index a49882f..9ee0400 100644
--- a/server/tests/unit/test_state.py
+++ b/server/tests/unit/test_state.py
@@ -25,11 +25,13 @@ def test_creation_with_required_fields(self) -> None:
         """Test creating PendingConfirmation with only required fields."""
         confirmation = PendingConfirmation(
             element_id="abc123",
+            highlight_snapshot_id=101,
             action_type="click",
             full_html="<button>Click me</button>",
         )
 
         assert confirmation.element_id == "abc123"
+        assert confirmation.highlight_snapshot_id == 101
         assert confirmation.action_type == "click"
         assert confirmation.full_html == "<button>Click me</button>"
         assert confirmation.extra_data == {}
@@ -39,6 +41,7 @@ def test_creation_with_all_fields(self) -> None:
         """Test creating PendingConfirmation with all fields."""
         confirmation = PendingConfirmation(
             element_id="xyz789",
+            highlight_snapshot_id=202,
             action_type="keyboard_input",
             full_html='<input type="text" />',
             extra_data={"text": "hello world"},
@@ -46,6 +49,7 @@ def test_creation_with_all_fields(self) -> None:
         )
 
         assert confirmation.element_id == "xyz789"
+        assert confirmation.highlight_snapshot_id == 202
         assert confirmation.action_type == "keyboard_input"
         assert confirmation.full_html == '<input type="text" />'
         assert confirmation.extra_data == {"text": "hello world"}
@@ -53,9 +57,17 @@ def test_creation_with_all_fields(self) -> None:
 
     def test_extra_data_default_is_mutable(self) -> None:
         """Test that extra_data defaults to a new empty dict each time."""
-        c1 = PendingConfirmation(element_id="a", action_type="click", full_html="")
+        c1 = PendingConfirmation(
+            element_id="a",
+            highlight_snapshot_id=1,
+            action_type="click",
+            full_html="",
+        )
         c2 = PendingConfirmation(
-            element_id="b", action_type="keyboard_input", full_html=""
+            element_id="b",
+            highlight_snapshot_id=2,
+            action_type="keyboard_input",
+            full_html="",
         )
 
         c1.extra_data["key"] = "value"
@@ -76,6 +88,7 @@ def test_set_pending(self) -> None:
         state.set_pending(
             conversation_id="conv-123",
             element_id="elem-456",
+            highlight_snapshot_id=101,
             action_type="click",
             full_html="<button>Submit</button>",
         )
@@ -83,6 +96,7 @@ def test_set_pending(self) -> None:
         assert "conv-123" in state.pending_confirmations
         pending = state.pending_confirmations["conv-123"]
         assert pending.element_id == "elem-456"
+        assert pending.highlight_snapshot_id == 101
         assert pending.action_type == "click"
         assert pending.full_html == "<button>Submit</button>"
 
@@ -92,6 +106,7 @@ def test_set_pending_with_optional_fields(self) -> None:
         state.set_pending(
             conversation_id="conv-789",
             element_id="elem-abc",
+            highlight_snapshot_id=202,
             action_type="keyboard_input",
             full_html="<input />",
             extra_data={"text": "hello"},
@@ -108,6 +123,7 @@ def test_get_pending_existing(self) -> None:
         state.set_pending(
             conversation_id="conv-1",
             element_id="elem-1",
+            highlight_snapshot_id=303,
             action_type="keyboard_input",
             full_html="<input />",
         )
@@ -115,6 +131,7 @@ def test_get_pending_existing(self) -> None:
         pending = state.get_pending("conv-1")
         assert pending is not None
         assert pending.element_id == "elem-1"
+        assert pending.highlight_snapshot_id == 303
         assert pending.action_type == "keyboard_input"
 
     def test_get_pending_nonexistent(self) -> None:
@@ -129,6 +146,7 @@ def test_clear_pending_existing(self) -> None:
         state.set_pending(
             conversation_id="conv-to-clear",
             element_id="elem-1",
+            highlight_snapshot_id=404,
             action_type="click",
             full_html="<button/>",
         )
@@ -152,6 +170,7 @@ def test_multiple_conversations_isolation(self) -> None:
         state.set_pending(
             conversation_id="conv-1",
             element_id="elem-1",
+            highlight_snapshot_id=501,
             action_type="click",
             full_html="<button>A</button>",
         )
@@ -160,6 +179,7 @@ def test_multiple_conversations_isolation(self) -> None:
         state.set_pending(
             conversation_id="conv-2",
             element_id="elem-2",
+            highlight_snapshot_id=502,
             action_type="keyboard_input",
             full_html="<input>B</input>",
         )
@@ -167,7 +187,9 @@ def test_multiple_conversations_isolation(self) -> None:
         # Verify isolation
         assert len(state.pending_confirmations) == 2
         assert state.get_pending("conv-1").element_id == "elem-1"
+        assert state.get_pending("conv-1").highlight_snapshot_id == 501
         assert state.get_pending("conv-2").element_id == "elem-2"
+        assert state.get_pending("conv-2").highlight_snapshot_id == 502
 
         # Clear one doesn't affect the other
         state.clear_pending("conv-1")
@@ -181,6 +203,7 @@ def test_overwrite_pending(self) -> None:
         state.set_pending(
             conversation_id="conv-1",
             element_id="elem-1",
+            highlight_snapshot_id=601,
             action_type="click",
             full_html="<button>First</button>",
         )
@@ -188,12 +211,14 @@ def test_overwrite_pending(self) -> None:
         state.set_pending(
             conversation_id="conv-1",
             element_id="elem-2",
+            highlight_snapshot_id=602,
             action_type="keyboard_input",
             full_html="<input>Second</input>",
         )
 
         pending = state.get_pending("conv-1")
         assert pending.element_id == "elem-2"
+        assert pending.highlight_snapshot_id == 602
         assert pending.action_type == "keyboard_input"
         assert pending.full_html == "<input>Second</input>"
 
@@ -206,10 +231,12 @@ def test_all_action_types(self) -> None:
             state.set_pending(
                 conversation_id=f"conv-{i}",
                 element_id=f"elem-{i}",
+                highlight_snapshot_id=700 + i,
                 action_type=action_type,
                 full_html=f"<{action_type}>",
             )
 
         for i, action_type in enumerate(action_types):
             pending = state.get_pending(f"conv-{i}")
+            assert pending.highlight_snapshot_id == 700 + i
             assert pending.action_type == action_type
diff --git a/server/tests/unit/test_tool_prompt_profiles.py b/server/tests/unit/test_tool_prompt_profiles.py
index ba90e0e..8d562ab 100644
--- a/server/tests/unit/test_tool_prompt_profiles.py
+++ b/server/tests/unit/test_tool_prompt_profiles.py
@@ -55,22 +55,37 @@ def test_small_model_highlight_prompt_stays_compact_and_actionable() -> None:
         description = get_highlight_tool_description()
 
     assert "## Core Rules" in description
-    assert 'default first pass for each new page state' in description
-    assert 'extension-derived page insight across element types' in description
+    assert "default first pass for each new page state" in description
+    assert "extension-derived page insight across element types" in description
     assert "Treat highlight pagination as reliable" in description
     assert "After any significant page-state change" in description
-    assert "Do not jump away from `element_type: \"any\"` on a newly changed page" in description
+    assert (
+        'Do not jump away from `element_type: "any"` on a newly changed page'
+        in description
+    )
     assert "icon-only toolbar or header control" in description
-    assert "continue `any` pagination and inspect the next pages instead of switching modes" in description
+    assert (
+        "continue `any` pagination and inspect the next pages instead of switching modes"
+        in description
+    )
     assert "you may narrow to `inputable`" in description
-    assert "always `click` it first and complete that confirmation before `keyboard_input`" in description
+    assert (
+        "always `click` it first and complete that confirmation before `keyboard_input`"
+        in description
+    )
     assert "After typing, continue discovery with `any`" in description
-    assert "When a search results page loads, call `highlight` with `element_type: \"any\"`" in description
+    assert (
+        'When a search results page loads, call `highlight` with `element_type: "any"`'
+        in description
+    )
     assert "use `tab back`" in description
-    assert "Do not use guessed labels such as \"settings\", \"gear\", \"bell\", \"next\", \"prev\", or \"close\"" in description
+    assert (
+        'Do not use guessed labels such as "settings", "gear", "bell", "next", "prev", or "close"'
+        in description
+    )
     assert "icon button next to a count or badge" in description
     assert "`keywords`" not in description
-    assert '`clickable`' not in description
+    assert "`clickable`" not in description
     assert "Phase 1: Precise Search" not in description
     assert "Collision-Aware Pagination" not in description
 
@@ -105,27 +120,45 @@ def test_large_model_highlight_prompt_keeps_detailed_pagination_guidance_without
         description = get_highlight_tool_description()
 
     assert "Collision-Aware Pagination" in description
-    assert '## Any-First Discovery Rule' in description
-    assert 'default first pass for each new page state' in description
-    assert 'extension-derived page insight across element types' in description
-    assert "After any significant page-state change, restart with `highlight` on `element_type: \"any\"`" in description
-    assert "Do not jump away from `element_type: \"any\"` on that changed page" in description
+    assert "## Any-First Discovery Rule" in description
+    assert "default first pass for each new page state" in description
+    assert "extension-derived page insight across element types" in description
+    assert (
+        'After any significant page-state change, restart with `highlight` on `element_type: "any"`'
+        in description
+    )
+    assert (
+        'Do not jump away from `element_type: "any"` on that changed page'
+        in description
+    )
     assert "Exact-Text Search Only" in description
     assert "icon-only controls" in description
     assert 'Prefer `element_type: "any"` as the default first pass' in description
-    assert "always `click` it first and complete that confirmation before `keyboard_input`" in description
-    assert "Treat pages as reliable collision-free slices of the same candidate set" in description
+    assert (
+        "always `click` it first and complete that confirmation before `keyboard_input`"
+        in description
+    )
+    assert (
+        "Treat pages as reliable collision-free slices of the same candidate set"
+        in description
+    )
     assert "Do not jump from a first-page miss to `keywords`" in description
-    assert "prefer more `any` pages over broad keyword search or a narrower generic-control mode" in description
+    assert (
+        "prefer more `any` pages over broad keyword search or a narrower generic-control mode"
+        in description
+    )
     assert (
         'DO NOT search for unlabeled toolbar icons or ambiguous controls with guessed words like "settings", "gear", "bell", "chat", "next", "prev", or "close"'
         in description
     )
-    assert "Use keywords only for exact literal text characters you can already see on the target itself in the current screenshot" in description
+    assert (
+        "Use keywords only for exact literal text characters you can already see on the target itself in the current screenshot"
+        in description
+    )
     assert '`{"keywords": ["52"]}`' in description
     assert '`["star"]`, `["favorite"]`, or `["bookmark"]`' in description
     assert "the actual button may simply be on the next page" in description
-    assert '`clickable`' not in description
+    assert "`clickable`" not in description
     assert "Phase 2: Broad Search" not in description
     assert "Examples of broad search" not in description
 
@@ -142,7 +175,10 @@ def test_small_model_element_interaction_requires_click_before_keyboard_input()
     ):
         description = get_element_interaction_tool_description()
 
-    assert "If the target is `inputable`, always `click` it first and complete that confirmation before `keyboard_input`." in description
+    assert (
+        "If the target is `inputable`, always `click` it first and complete that confirmation before `keyboard_input`."
+        in description
+    )
     assert "always `click` first, then use `keyboard_input`" in description
     assert '`direction: "next"` means show the next picture' in description
     assert '`direction: "prev"` means show the previous picture' in description
@@ -161,8 +197,14 @@ def test_large_model_element_interaction_requires_click_before_keyboard_input()
     ):
         description = get_element_interaction_tool_description()
 
-    assert "Always `click` the target first and complete that confirmation before `keyboard_input`." in description
-    assert "only after you already clicked the same input target and completed that click confirmation" in description
+    assert (
+        "Always `click` the target first and complete that confirmation before `keyboard_input`."
+        in description
+    )
+    assert (
+        "only after you already clicked the same input target and completed that click confirmation"
+        in description
+    )
     assert '`direction: "next"` means show the next picture' in description
     assert '`direction: "prev"` means show the previous picture' in description
     assert "not finger or gesture directions" in description