diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8beb36d..07dc554 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: hooks: - id: black name: black - entry: uv run black + entry: uv run --extra dev black language: system types_or: [python, pyi] require_serial: true diff --git a/AGENTS.md b/AGENTS.md index ad00f55..93244e4 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -204,6 +204,9 @@ Elements are paginated to ensure **no visual overlap** in each screenshot: - `highlight_elements` now uses a **snapshot-first** readiness check instead of page-side polling loops. - Reason: OpenBrowser intentionally keeps automated tabs in the browser background, and Chrome may heavily throttle hidden-tab timers. A page-side `setTimeout` stability loop can therefore take far longer than its nominal budget and become the main cause of highlight timeouts. +- In practice, the main cause of unstable first-highlight screenshots is often **missing warmup**, not a bad readiness classifier. A background tab may answer lightweight `Runtime.evaluate` probes while still sitting in a partially painted / partially decoded state. +- A screenshot-style warmup is therefore the default precondition for `highlight_elements`. It helps force hidden-tab paint/compositor/image-decode work before interactive-element detection runs. +- If `highlight_elements` keeps returning `not_ready` but `tab view` immediately makes the next highlight succeed, treat that as a warmup issue first. - The extension samples viewport readiness signals once per attempt: document readiness, viewport text/media density, pending images, and loading placeholders such as skeleton/shimmer/spinner indicators. - Readiness is graded as `ready`, `provisionally_ready`, or `not_ready`. - If readiness is `not_ready`, the extension performs only a couple of short **background-side** retries before proceeding or returning the latest result. @@ -213,7 +216,7 @@ Elements are paginated to ensure **no visual overlap** in each screenshot: ``` # Highlight mixed elements first (default) highlight_elements() → Page 1 of any interactive elements -highlight_elements(page=2) → Page 2 of the same any inventory +highlight_elements(page=2) → Page 2 of the current page state's any results highlight_elements(element_type="any", page=1) → Explicit any-first discovery # Highlight other types (one at a time) @@ -312,6 +315,8 @@ cd extension && npm run build OpenBrowser has explicit screenshot control for maximum flexibility: +- Screenshots also serve as a practical page warmup mechanism for background tabs. They can unblock page paint and media decode work that passive DOM/readiness inspection does not reliably trigger on its own. + ### Commands That Return Screenshots | Command | Auto-Screenshot | Notes | diff --git a/eval/bluebook/js/bluebook.js b/eval/bluebook/js/bluebook.js index 981c00c..dec9406 100644 --- a/eval/bluebook/js/bluebook.js +++ b/eval/bluebook/js/bluebook.js @@ -612,6 +612,12 @@ window.tracker = new AgentTracker('bluebook.life', 'hard'); return notes; } + function stabilizeDefaultFeedOrder(notes) { + keepNoteAwayFromTop(notes, 'note-openclaw-config', 18); + keepNoteAwayFromTop(notes, 'note-arigato-ai', 24); + return notes; + } + function getCurrentNote() { return state.notes.find((note) => note.id === state.currentNoteId) || null; } @@ -1059,7 +1065,7 @@ window.tracker = new AgentTracker('bluebook.life', 'hard'); state.notes[swapIndex] = temp; } - keepNoteAwayFromTop(state.notes, 'note-openclaw-config', 18); + stabilizeDefaultFeedOrder(state.notes); } function handleFeedReload() { @@ -1467,7 +1473,7 @@ window.tracker = new AgentTracker('bluebook.life', 'hard'); function initialize() { state.notes = buildNotes(); - keepNoteAwayFromTop(state.notes, 'note-openclaw-config', 18); + stabilizeDefaultFeedOrder(state.notes); state.query = getSearchQueryFromUrl(); cacheDom(); diff --git a/eval/dataset/cloudstack.yaml b/eval/dataset/cloudstack.yaml index fcae0ee..c35ce7e 100644 --- a/eval/dataset/cloudstack.yaml +++ b/eval/dataset/cloudstack.yaml @@ -21,10 +21,14 @@ criteria: event_type: click element_id: "das-agent-toggle" page: "/cloudstack/das.html" - alternative: - event_type: click - element_id: "open-chat-btn" - page: "/cloudstack/das.html" + alternatives: + - event_type: click + element_id: "open-chat-btn" + page: "/cloudstack/das.html" + - event_type: click + element_text: "DAS Agent" + parent_text_contains: "AI" + page: "/cloudstack/das.html" - type: greet_das_agent description: "Send a greeting message to DAS agent" points: 1 @@ -40,4 +44,4 @@ criteria: event_type: click element_id: "send-btn" page: "/cloudstack/das.html" - optional: true \ No newline at end of file + optional: true diff --git a/eval/dataset/cloudstack_interactive.yaml b/eval/dataset/cloudstack_interactive.yaml index cc5883f..e49d06d 100644 --- a/eval/dataset/cloudstack_interactive.yaml +++ b/eval/dataset/cloudstack_interactive.yaml @@ -24,10 +24,14 @@ criteria: event_type: click element_id: "das-agent-toggle" page: "/cloudstack/das.html" - alternative: - event_type: click - element_id: "open-chat-btn" - page: "/cloudstack/das.html" + alternatives: + - event_type: click + element_id: "open-chat-btn" + page: "/cloudstack/das.html" + - event_type: click + element_text: "DAS Agent" + parent_text_contains: "AI" + page: "/cloudstack/das.html" # New: Send initial greeting - type: greet_das_agent @@ -114,4 +118,4 @@ criteria: event_type: count_min condition: "chat_interactions" count: 3 - page: "/cloudstack/das.html" \ No newline at end of file + page: "/cloudstack/das.html" diff --git a/eval/evaluate_browser_agent.py b/eval/evaluate_browser_agent.py index d8686c8..d23d99c 100644 --- a/eval/evaluate_browser_agent.py +++ b/eval/evaluate_browser_agent.py @@ -610,8 +610,7 @@ def start_openbrowser(self) -> bool: return True root_dir = EVAL_DIR.parent - logger.error( - f""" + logger.error(f""" ❌ OpenBrowser server is not running! Please start the OpenBrowser server manually with: @@ -619,8 +618,7 @@ def start_openbrowser(self) -> bool: uv run local-chrome-server serve The server should start on port 8765 (REST API) and 8766 (WebSocket). -""" - ) +""") return False except Exception as e: @@ -637,8 +635,7 @@ def start_eval_server(self) -> bool: eval_dir = EVAL_DIR root_dir = EVAL_DIR.parent - logger.error( - f""" + logger.error(f""" ❌ Eval server is not running! Please start the eval server manually with: @@ -650,8 +647,7 @@ def start_eval_server(self) -> bool: uv run python eval/server.py The server should start on port 16605. -""" - ) +""") return False except Exception as e: @@ -1304,6 +1300,7 @@ def _evaluate_criteria( expected = criterion.get("expected") points = criterion.get("points", 1) alternative = criterion.get("alternative") + alternatives = criterion.get("alternatives", []) optional = criterion.get("optional", False) # For optional criteria, we give the points automatically (treat as satisfied) @@ -1314,9 +1311,15 @@ def _evaluate_criteria( ) continue - if self._check_criterion(expected, track_events, sse_events) or ( - alternative - and self._check_criterion(alternative, track_events, sse_events) + candidate_expectations = [expected] + if alternative: + candidate_expectations.append(alternative) + if alternatives: + candidate_expectations.extend(alternatives) + + if any( + candidate and self._check_criterion(candidate, track_events, sse_events) + for candidate in candidate_expectations ): score += points logger.debug( diff --git a/eval/evaluation_report.json b/eval/evaluation_report.json index c59c71c..3f53998 100644 --- a/eval/evaluation_report.json +++ b/eval/evaluation_report.json @@ -1,340 +1,340 @@ { "evaluation": { - "timestamp": "2026-03-27 00:45:37", - "unix_timestamp": 1774543537.602308, + "timestamp": "2026-03-28 21:09:34", + "unix_timestamp": 1774703374.492846, "summary": { "total_tests": 22, "passed_tests": 21, "pass_rate": 95.45, "models_tested": [ - "dashscope/qwen3.5-plus", - "dashscope/qwen3.5-flash" + "dashscope/qwen3.5-flash", + "dashscope/qwen3.5-plus" ] }, "model_performance": { + "dashscope/qwen3.5-flash": { + "pass_rate": 90.91, + "task_score": 59.0, + "task_max_score": 62.5, + "efficiency_score": 7.0914, + "usage_score": 8.6369, + "composite_score": 0.8314, + "avg_duration": 165.81, + "avg_cost": 0.2005, + "passed_count": 10, + "total_tests": 11 + }, "dashscope/qwen3.5-plus": { "pass_rate": 100.0, "task_score": 62.5, "task_max_score": 62.5, - "efficiency_score": 7.9068, - "usage_score": 4.7439, - "composite_score": 0.83, - "avg_duration": 135.49, - "avg_cost": 0.588661, + "efficiency_score": 7.638, + "usage_score": 4.4691, + "composite_score": 0.8201, + "avg_duration": 144.49, + "avg_cost": 0.620445, "passed_count": 11, "total_tests": 11 - }, - "dashscope/qwen3.5-flash": { - "pass_rate": 90.91, - "task_score": 56.0, - "task_max_score": 62.5, - "efficiency_score": 7.8906, - "usage_score": 9.2849, - "composite_score": 0.8577, - "avg_duration": 133.22, - "avg_cost": 0.152816, - "passed_count": 10, - "total_tests": 11 } }, "test_results": { "bluebook_simple": { "name": "BlueBook Search And Like Test", "results_by_model": { - "dashscope/qwen3.5-plus": { + "dashscope/qwen3.5-flash": { "passed": true, "task_score": 6.0, "task_max_score": 6.0, - "efficiency_score": 0.6587, - "usage_score": 0.2884, - "composite_score": 0.7894, - "total_score": 6.95, - "duration": 102.4, - "cost": 0.42695 + "efficiency_score": 0.656, + "usage_score": 0.7975, + "composite_score": 0.8907, + "total_score": 7.45, + "duration": 103.21, + "cost": 0.121497 }, - "dashscope/qwen3.5-flash": { + "dashscope/qwen3.5-plus": { "passed": true, "task_score": 6.0, "task_max_score": 6.0, - "efficiency_score": 0.6927, - "usage_score": 0.8331, - "composite_score": 0.9052, - "total_score": 7.53, - "duration": 92.2, - "cost": 0.100139 + "efficiency_score": 0.6513, + "usage_score": 0.2662, + "composite_score": 0.7835, + "total_score": 6.92, + "duration": 104.6, + "cost": 0.440265 } } }, "finviz_simple": { "name": "Finviz Simple Screener Test", "results_by_model": { - "dashscope/qwen3.5-plus": { + "dashscope/qwen3.5-flash": { "passed": true, "task_score": 3, "task_max_score": 3, - "efficiency_score": 0.7931, - "usage_score": 0.7763, - "composite_score": 0.9139, - "total_score": 4.57, - "duration": 62.06, - "cost": 0.178958 + "efficiency_score": 0.817, + "usage_score": 0.9464, + "composite_score": 0.9527, + "total_score": 4.76, + "duration": 54.91, + "cost": 0.042865 }, - "dashscope/qwen3.5-flash": { + "dashscope/qwen3.5-plus": { "passed": true, "task_score": 3, "task_max_score": 3, - "efficiency_score": 0.733, - "usage_score": 0.8868, - "composite_score": 0.9239, - "total_score": 4.62, - "duration": 80.11, - "cost": 0.090585 + "efficiency_score": 0.7766, + "usage_score": 0.777, + "composite_score": 0.9107, + "total_score": 4.55, + "duration": 67.03, + "cost": 0.178378 } } }, "cloudstack_interactive": { "name": "CloudStack DAS Interactive Test", "results_by_model": { - "dashscope/qwen3.5-plus": { + "dashscope/qwen3.5-flash": { "passed": true, - "task_score": 9.0, + "task_score": 7.5, "task_max_score": 9.0, - "efficiency_score": 0.5473, - "usage_score": 0.2619, - "composite_score": 0.7618, - "total_score": 9.81, - "duration": 316.88, - "cost": 1.47621 + "efficiency_score": 0.7165, + "usage_score": 0.873, + "composite_score": 0.9179, + "total_score": 9.09, + "duration": 198.43, + "cost": 0.254081 }, - "dashscope/qwen3.5-flash": { - "passed": false, - "task_score": 2.5, + "dashscope/qwen3.5-plus": { + "passed": true, + "task_score": 9.0, "task_max_score": 9.0, - "efficiency_score": 0.7064, - "usage_score": 0.8895, - "composite_score": 0.3192, - "total_score": 4.1, - "duration": 205.55, - "cost": 0.221029 + "efficiency_score": 0.6635, + "usage_score": 0.3651, + "composite_score": 0.8057, + "total_score": 10.03, + "duration": 235.55, + "cost": 1.269709 } } }, "gbr": { "name": "GBR Search Test", "results_by_model": { - "dashscope/qwen3.5-plus": { - "passed": true, - "task_score": 2.5, + "dashscope/qwen3.5-flash": { + "passed": false, + "task_score": 0.5, "task_max_score": 2.5, - "efficiency_score": 0.7329, - "usage_score": 0.5884, - "composite_score": 0.8643, - "total_score": 3.82, - "duration": 106.82, - "cost": 0.329246 + "efficiency_score": 0.0806, + "usage_score": 0.4183, + "composite_score": 0.0998, + "total_score": 1.0, + "duration": 367.76, + "cost": 0.465362 }, - "dashscope/qwen3.5-flash": { + "dashscope/qwen3.5-plus": { "passed": true, "task_score": 2.5, "task_max_score": 2.5, - "efficiency_score": 0.7885, - "usage_score": 0.8918, - "composite_score": 0.9361, - "total_score": 4.18, - "duration": 84.61, - "cost": 0.08655 + "efficiency_score": 0.7707, + "usage_score": 0.5913, + "composite_score": 0.8724, + "total_score": 3.86, + "duration": 91.71, + "cost": 0.326986 } } }, "techforum_reply": { "name": "TechForum Comment Reply Test", "results_by_model": { - "dashscope/qwen3.5-plus": { + "dashscope/qwen3.5-flash": { "passed": true, "task_score": 9.5, "task_max_score": 9.5, - "efficiency_score": 0.676, - "usage_score": 0.1585, - "composite_score": 0.7669, - "total_score": 10.33, - "duration": 162.0, - "cost": 0.841473 + "efficiency_score": 0.2831, + "usage_score": 0.5356, + "composite_score": 0.7637, + "total_score": 10.32, + "duration": 358.43, + "cost": 0.464415 }, - "dashscope/qwen3.5-flash": { + "dashscope/qwen3.5-plus": { "passed": true, "task_score": 9.5, "task_max_score": 9.5, - "efficiency_score": 0.6157, - "usage_score": 0.7408, - "composite_score": 0.8713, - "total_score": 10.86, - "duration": 192.13, - "cost": 0.259165 + "efficiency_score": 0.6498, + "usage_score": 0.0922, + "composite_score": 0.7484, + "total_score": 10.24, + "duration": 175.1, + "cost": 0.907785 } } }, "bluebook_complex": { "name": "BlueBook Multi-Image Reply Test", "results_by_model": { - "dashscope/qwen3.5-plus": { + "dashscope/qwen3.5-flash": { "passed": true, "task_score": 12.0, "task_max_score": 12.0, - "efficiency_score": 0.6307, - "usage_score": 0.3382, - "composite_score": 0.7938, - "total_score": 12.97, - "duration": 184.64, - "cost": 0.79412 + "efficiency_score": 0.7024, + "usage_score": 0.8476, + "composite_score": 0.91, + "total_score": 13.55, + "duration": 148.82, + "cost": 0.182839 }, - "dashscope/qwen3.5-flash": { + "dashscope/qwen3.5-plus": { "passed": true, "task_score": 12.0, "task_max_score": 12.0, - "efficiency_score": 0.7021, - "usage_score": 0.8414, - "composite_score": 0.9087, - "total_score": 13.54, - "duration": 148.94, - "cost": 0.190283 + "efficiency_score": 0.6839, + "usage_score": 0.3953, + "composite_score": 0.8158, + "total_score": 13.08, + "duration": 158.07, + "cost": 0.725694 } } }, "techforum": { "name": "TechForum Upvote Test", "results_by_model": { - "dashscope/qwen3.5-plus": { + "dashscope/qwen3.5-flash": { "passed": true, "task_score": 2, "task_max_score": 2, - "efficiency_score": 0.8652, - "usage_score": 0.7683, - "composite_score": 0.9267, - "total_score": 3.63, - "duration": 40.44, - "cost": 0.115867 + "efficiency_score": 0.9001, + "usage_score": 0.9546, + "composite_score": 0.9709, + "total_score": 3.85, + "duration": 29.97, + "cost": 0.022705 }, - "dashscope/qwen3.5-flash": { + "dashscope/qwen3.5-plus": { "passed": true, "task_score": 2, "task_max_score": 2, - "efficiency_score": 0.8999, - "usage_score": 0.946, - "composite_score": 0.9692, - "total_score": 3.85, - "duration": 30.03, - "cost": 0.027012 + "efficiency_score": 0.8533, + "usage_score": 0.7708, + "composite_score": 0.9248, + "total_score": 3.62, + "duration": 44.02, + "cost": 0.114576 } } }, "finviz_complex": { "name": "Finviz Multi-Filter Screener Test", "results_by_model": { - "dashscope/qwen3.5-plus": { + "dashscope/qwen3.5-flash": { "passed": true, "task_score": 5.0, "task_max_score": 5.0, - "efficiency_score": 0.6882, - "usage_score": 0.4682, - "composite_score": 0.8313, - "total_score": 6.16, - "duration": 124.71, - "cost": 0.531767 + "efficiency_score": 0.7334, + "usage_score": 0.8726, + "composite_score": 0.9212, + "total_score": 6.61, + "duration": 106.62, + "cost": 0.127404 }, - "dashscope/qwen3.5-flash": { + "dashscope/qwen3.5-plus": { "passed": true, "task_score": 5.0, "task_max_score": 5.0, - "efficiency_score": 0.535, - "usage_score": 0.7822, - "composite_score": 0.8634, - "total_score": 6.32, - "duration": 186.0, - "cost": 0.217815 + "efficiency_score": 0.5376, + "usage_score": 0.2214, + "composite_score": 0.7518, + "total_score": 5.76, + "duration": 184.97, + "cost": 0.778617 } } }, "cloudstack": { "name": "CloudStack DAS Agent Test", "results_by_model": { - "dashscope/qwen3.5-plus": { + "dashscope/qwen3.5-flash": { "passed": true, "task_score": 3.5, "task_max_score": 3.5, - "efficiency_score": 0.7916, - "usage_score": 0.621, - "composite_score": 0.8825, - "total_score": 4.91, - "duration": 104.2, - "cost": 0.454825 + "efficiency_score": 0.7705, + "usage_score": 0.8949, + "composite_score": 0.9331, + "total_score": 5.17, + "duration": 114.74, + "cost": 0.126168 }, - "dashscope/qwen3.5-flash": { + "dashscope/qwen3.5-plus": { "passed": true, "task_score": 3.5, "task_max_score": 3.5, - "efficiency_score": 0.7624, - "usage_score": 0.8874, - "composite_score": 0.9299, - "total_score": 5.15, - "duration": 118.82, - "cost": 0.135177 + "efficiency_score": 0.592, + "usage_score": 0.5048, + "composite_score": 0.8194, + "total_score": 4.6, + "duration": 203.98, + "cost": 0.594295 } } }, "dataflow": { "name": "DataFlow Visual Challenge Test", "results_by_model": { - "dashscope/qwen3.5-plus": { + "dashscope/qwen3.5-flash": { "passed": true, "task_score": 3, "task_max_score": 3, - "efficiency_score": 0.792, - "usage_score": 0, - "composite_score": 0.7584, - "total_score": 3.79, - "duration": 124.81, - "cost": 0.537748 + "efficiency_score": 0.7678, + "usage_score": 0.6428, + "composite_score": 0.8821, + "total_score": 4.41, + "duration": 139.31, + "cost": 0.178579 }, - "dashscope/qwen3.5-flash": { + "dashscope/qwen3.5-plus": { "passed": true, "task_score": 3, "task_max_score": 3, - "efficiency_score": 0.781, - "usage_score": 0.7322, - "composite_score": 0.9026, - "total_score": 4.51, - "duration": 131.41, - "cost": 0.133914 + "efficiency_score": 0.7283, + "usage_score": 0, + "composite_score": 0.7457, + "total_score": 3.73, + "duration": 163.02, + "cost": 0.716118 } } }, "gbr_detailed": { "name": "GBR Detailed Search & Read Test", "results_by_model": { - "dashscope/qwen3.5-plus": { + "dashscope/qwen3.5-flash": { "passed": true, "task_score": 7.0, "task_max_score": 7.0, - "efficiency_score": 0.731, - "usage_score": 0.4746, - "composite_score": 0.8411, - "total_score": 8.21, - "duration": 161.37, - "cost": 0.78811 + "efficiency_score": 0.6639, + "usage_score": 0.8536, + "composite_score": 0.9035, + "total_score": 8.52, + "duration": 201.65, + "cost": 0.219589 }, - "dashscope/qwen3.5-flash": { + "dashscope/qwen3.5-plus": { "passed": true, "task_score": 7.0, "task_max_score": 7.0, - "efficiency_score": 0.674, - "usage_score": 0.8538, - "composite_score": 0.9056, - "total_score": 8.53, - "duration": 195.59, - "cost": 0.219313 + "efficiency_score": 0.7311, + "usage_score": 0.485, + "composite_score": 0.8432, + "total_score": 8.22, + "duration": 161.35, + "cost": 0.772474 } } } } } -} \ No newline at end of file +} diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts index ce63ef3..992a006 100644 --- a/extension/src/background/index.ts +++ b/extension/src/background/index.ts @@ -18,8 +18,6 @@ import { tabManager } from '../commands/tab-manager'; import { javascript } from '../commands/javascript'; import { debuggerSessionManager } from '../commands/debugger-manager'; import { dialogManager } from '../commands/dialog'; -import { extractGroundedElements } from '../commands/grounded-elements'; -import { handleGetAccessibilityTree } from '../commands/accessibility'; import { clearScreenshotCache } from '../commands/computer'; import { drawHighlights } from '../commands/visual-highlight'; @@ -29,7 +27,6 @@ import { assignSequentialElementIds } from '../commands/element-id'; import { buildHighlightDetectionScript, filterHighlightElementsByKeywords, - normalizeHighlightKeywords, } from '../commands/highlight-detection'; import { performElementClick, @@ -62,6 +59,7 @@ import { type HighlightPageState, } from '../utils/layout-stability'; import { + HIGHLIGHT_PRECONDITION_CAPTURE_OPTIONS, HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS, TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS, } from '../utils/highlight-screenshot'; @@ -86,6 +84,33 @@ async function compressScreenshotResult( return (compressedResult as T | null | undefined) ?? screenshotResult; } +async function runHighlightPreconditionWarmup(options: { + tabId: number; + conversationId: string; + elementType: string; + page: number; +}): Promise { + const { tabId, conversationId, elementType, page } = options; + const warmupStart = Date.now(); + console.log( + `🔥 [HighlightElements] Starting screenshot warmup precondition for elementType=${elementType}, page=${page}`, + ); + + await captureScreenshot( + tabId, + conversationId, + true, + 90, + false, + 350, + HIGHLIGHT_PRECONDITION_CAPTURE_OPTIONS, + ); + + console.log( + `🔥 [HighlightElements] Screenshot warmup precondition completed in ${Date.now() - warmupStart}ms`, + ); +} + function buildStoredHighlightPages(options: { filteredElements: InteractiveElement[]; totalPages: number; @@ -125,369 +150,6 @@ function buildStoredHighlightPages(options: { return pages; } -function buildSnapshotPageRefreshScript(options: { - elements: InteractiveElement[]; - expectedDocumentId?: string; - highlightSnapshotId?: number; -}): string { - const { elements, expectedDocumentId, highlightSnapshotId } = options; - const refreshTargets = elements.map((element) => ({ - id: element.id, - selector: element.selector, - fingerprint: element.fingerprint || '', - })); - - return ` - (() => { - const expectedDocumentId = ${JSON.stringify(expectedDocumentId || '')}; - const highlightSnapshotId = ${highlightSnapshotId ?? 'null'}; - const refreshTargets = ${JSON.stringify(refreshTargets)}; - - function normalizeIdentityWhitespace(value, maxLength = 240) { - const normalized = String(value ?? '') - .replace(/\\s+/g, ' ') - .trim(); - return normalized.slice(0, maxLength).toLowerCase(); - } - - function getIdentityAttributeTokens(el, attributeNames) { - const tokens = []; - for (const attributeName of attributeNames) { - const value = el.getAttribute(attributeName); - if (!value) { - continue; - } - - const normalized = normalizeIdentityWhitespace(value, 80); - if (normalized) { - tokens.push(normalized); - } - } - return tokens; - } - - function getIdentityClassTokens(el) { - return Array.from(el.classList) - .filter( - (token) => - token.length > 1 && - token.length <= 40 && - /^[a-z0-9_-]+$/i.test(token), - ) - .slice(0, 4) - .map((token) => token.toLowerCase()); - } - - function getElementTextForIdentity(el) { - if (el instanceof HTMLInputElement) { - const inputType = (el.type || '').toLowerCase(); - if ( - inputType === 'button' || - inputType === 'submit' || - inputType === 'reset' - ) { - return normalizeIdentityWhitespace(el.value, 120); - } - } - - return normalizeIdentityWhitespace(el.textContent || '', 160); - } - - function getCurrentDocumentId() { - return \`\${Math.trunc(performance.timeOrigin)}|\${location.href}\`; - } - - function getElementFingerprint(el) { - const tokens = [ - el.tagName.toLowerCase(), - ...getIdentityAttributeTokens(el, [ - 'role', - 'type', - 'name', - 'id', - 'aria-label', - 'title', - 'placeholder', - 'data-testid', - 'data-test-id', - ]), - ...getIdentityClassTokens(el), - ]; - - const text = getElementTextForIdentity(el); - if (text) { - tokens.push(text); - } - - return normalizeIdentityWhitespace(tokens.join(' | '), 240); - } - - function splitFingerprintTokens(value) { - return Array.from( - new Set( - String(value ?? '') - .toLowerCase() - .split(/[^a-z0-9]+/i) - .filter((token) => token.length > 1), - ), - ); - } - - function fingerprintsLookCompatible(expected, current) { - if (!expected || !current) { - return true; - } - if (expected === current) { - return true; - } - - const expectedTokens = splitFingerprintTokens(expected); - const currentTokens = new Set(splitFingerprintTokens(current)); - if (expectedTokens.length === 0) { - return true; - } - - let overlap = 0; - for (const token of expectedTokens) { - if (currentTokens.has(token)) { - overlap += 1; - } - } - - return overlap >= Math.max(2, Math.min(4, Math.ceil(expectedTokens.length * 0.5))); - } - - const currentDocumentId = getCurrentDocumentId(); - if (expectedDocumentId && currentDocumentId !== expectedDocumentId) { - return { - ok: false, - stale: true, - error: - \`Highlight snapshot \${highlightSnapshotId} is stale because the document changed. Call highlight_elements() again.\`, - }; - } - - const refreshed = []; - for (const target of refreshTargets) { - let el = null; - try { - el = document.querySelector(target.selector); - } catch (error) { - return { - ok: false, - stale: true, - error: - 'Highlight snapshot is stale because a cached selector is no longer valid. Call highlight_elements() again.', - }; - } - - if (!el) { - return { - ok: false, - stale: true, - error: - 'Highlight snapshot is stale because a highlighted element disappeared. Call highlight_elements() again.', - }; - } - - const currentFingerprint = getElementFingerprint(el); - if ( - !fingerprintsLookCompatible(target.fingerprint, currentFingerprint) - ) { - return { - ok: false, - stale: true, - error: - 'Highlight snapshot is stale because highlighted element identities changed. Call highlight_elements() again.', - }; - } - - const rect = el.getBoundingClientRect(); - refreshed.push({ - id: target.id, - bbox: { - x: rect.x, - y: rect.y, - width: rect.width, - height: rect.height, - }, - }); - } - - return { - ok: true, - refreshed, - }; - })(); - `; -} - -async function renderHighlightSnapshotPage(options: { - tabId: number; - conversationId: string; - elements: InteractiveElement[]; - totalElements: number; - totalPages: number; - page: number; - highlightSnapshotId: number; - expectedDocumentId?: string; - pageState: HighlightPageState | 'snapshot_reused'; - readinessReasons: string[]; -}): Promise { - const { - tabId, - conversationId, - elements, - totalElements, - totalPages, - page, - highlightSnapshotId, - expectedDocumentId, - pageState, - readinessReasons, - } = options; - - let renderElements = elements; - const refreshResult = await javascript.executeJavaScript( - tabId, - conversationId, - buildSnapshotPageRefreshScript({ - elements, - expectedDocumentId, - highlightSnapshotId, - }), - true, - false, - 2500, - ); - const refreshPayload = refreshResult.result?.value; - - if (refreshResult.success && refreshPayload?.ok) { - const refreshedById = new Map< - string, - { - bbox: { - x: number; - y: number; - width: number; - height: number; - }; - } - >( - Array.isArray(refreshPayload.refreshed) - ? refreshPayload.refreshed - .filter( - ( - refreshedElement: unknown, - ): refreshedElement is { - id: string; - bbox: { - x: number; - y: number; - width: number; - height: number; - }; - } => - typeof refreshedElement === 'object' && - refreshedElement !== null && - 'id' in refreshedElement && - 'bbox' in refreshedElement, - ) - .map((refreshedElement) => [ - refreshedElement.id, - { bbox: refreshedElement.bbox }, - ]) - : [], - ); - - renderElements = elements.map((element) => ({ - ...element, - bbox: refreshedById.get(element.id)?.bbox || element.bbox, - })); - } else if ( - refreshResult.success && - refreshPayload && - refreshPayload.ok === false - ) { - return { - success: false, - error: - refreshPayload.error || - `Highlight snapshot ${highlightSnapshotId} is stale. Call highlight_elements() again.`, - timestamp: Date.now(), - }; - } else if (!refreshResult.success) { - console.warn( - `⚠️ [HighlightElements] Failed to refresh cached snapshot ${highlightSnapshotId} before rendering page ${page}: ${refreshResult.error || 'unknown error'}`, - ); - } - - const screenshotResult = await captureScreenshot( - tabId, - conversationId, - true, - 90, - false, - 0, - HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS, - ); - - if (!screenshotResult?.success || !screenshotResult?.imageData) { - return { - success: false, - error: `Failed to capture screenshot: ${screenshotResult?.success === false ? 'Screenshot command failed' : 'No image data returned'}`, - timestamp: Date.now(), - }; - } - - const imageScale = - screenshotResult.metadata?.imageScale || - screenshotResult.metadata?.devicePixelRatio || - 1; - const viewportWidth = screenshotResult.metadata?.viewportWidth || 0; - const viewportHeight = screenshotResult.metadata?.viewportHeight || 0; - - const highlightedScreenshot = await drawHighlights( - screenshotResult.imageData, - renderElements, - { - scale: imageScale, - viewportWidth, - viewportHeight, - }, - ); - const compressedScreenshot = await compressIfNeeded( - highlightedScreenshot, - getCompressionThreshold(), - ); - - return { - success: true, - data: { - highlight_snapshot_id: highlightSnapshotId, - elements: renderElements, - totalElements, - totalPages, - page, - pageState, - readinessReasons, - screenshot: compressedScreenshot, - ...(screenshotResult?.dialog_auto_accepted - ? { - dialog_auto_accepted: screenshotResult.dialog_auto_accepted, - } - : {}), - ...(screenshotResult?.dialog_auto_accepted_list - ? { - dialog_auto_accepted_list: - screenshotResult.dialog_auto_accepted_list, - } - : {}), - }, - timestamp: Date.now(), - }; -} - function buildHighlightConsistencyScript( elements: InteractiveElement[], ): string { @@ -1882,88 +1544,18 @@ async function handleCommand(command: Command): Promise { const keywords = command.keywords; const elementType = command.element_type || 'any'; const page = command.page || 1; - const highlightSnapshotId = command.highlight_snapshot_id; - const requestedKeywords = normalizeHighlightKeywords(keywords); - - if (highlightSnapshotId !== undefined && highlightSnapshotId !== null) { - const baseSnapshot = elementCache.getSnapshotPage( - conversationId, - activeTabId, - highlightSnapshotId, - ); - if (!baseSnapshot) { - return { - success: false, - error: `Highlight snapshot ${highlightSnapshotId} was not found or expired. Call highlight_elements() again.`, - timestamp: Date.now(), - }; - } - - const cachedKeywords = normalizeHighlightKeywords( - baseSnapshot.keywords, - ); - if (baseSnapshot.elementType !== elementType) { - return { - success: false, - error: `Highlight snapshot ${highlightSnapshotId} was created for element_type="${baseSnapshot.elementType}", but the current request asked for "${elementType}". Start a new highlight from page 1 instead.`, - timestamp: Date.now(), - }; - } - - if ( - cachedKeywords.length !== requestedKeywords.length || - cachedKeywords.some( - (keyword, index) => keyword !== requestedKeywords[index], - ) - ) { - return { - success: false, - error: `Highlight snapshot ${highlightSnapshotId} was created with different keywords. Start a new highlight from page 1 instead.`, - timestamp: Date.now(), - }; - } - - const continuedSnapshot = elementCache.forkSnapshotPage( - conversationId, - activeTabId, - highlightSnapshotId, - page, - ); - if (!continuedSnapshot) { - return { - success: false, - error: `Failed to continue from highlight snapshot ${highlightSnapshotId}. Call highlight_elements() again.`, - timestamp: Date.now(), - }; - } - - return await renderHighlightSnapshotPage({ - tabId: activeTabId, - conversationId, - elements: continuedSnapshot.elements, - totalElements: continuedSnapshot.totalElements, - totalPages: continuedSnapshot.totalPages, - page: continuedSnapshot.page, - highlightSnapshotId: continuedSnapshot.snapshotId, - expectedDocumentId: continuedSnapshot.documentId, - pageState: 'snapshot_reused', - readinessReasons: [], - }); - } - - if (page > 1) { - return { - success: false, - error: - 'page > 1 requires highlight_snapshot_id so pagination stays on the same frozen highlight inventory. Call highlight_elements() page 1 first.', - timestamp: Date.now(), - }; - } const detectionScript = buildHighlightDetectionScript({ elementType, }); + await runHighlightPreconditionWarmup({ + tabId: activeTabId, + conversationId, + elementType, + page, + }); + const maxHighlightAttempts = 3; const highlightDetectionTimeoutMs = 18000; let previousConsistency: HighlightConsistencyResult | null = null; diff --git a/extension/src/commands/__tests__/element-id.test.ts b/extension/src/commands/__tests__/element-id.test.ts index 632fca1..12ad0cb 100644 --- a/extension/src/commands/__tests__/element-id.test.ts +++ b/extension/src/commands/__tests__/element-id.test.ts @@ -72,44 +72,4 @@ describe('element-cache highlight snapshots', () => { expect(lookup?.element.selector).toBe('#page-1'); expect(lookup?.documentId).toBe('doc-1'); }); - - test('forks a new snapshot page from the same frozen inventory', () => { - elementCache.clearAll(); - - const page1 = elementCache.storeSnapshot({ - conversationId: 'conv-2', - tabId: 101, - documentId: 'doc-2', - elementType: 'any', - totalElements: 2, - pages: [ - [createElement('1', '#first-page')], - [createElement('1', '#second-page')], - ], - page: 1, - }); - - const page2 = elementCache.forkSnapshotPage( - 'conv-2', - 101, - page1.snapshotId, - 2, - ); - - expect(page2).toBeDefined(); - expect(page2?.snapshotId).not.toBe(page1.snapshotId); - expect(page2?.page).toBe(2); - expect(page2?.elements.map((element) => element.selector)).toEqual([ - '#second-page', - ]); - - expect( - elementCache.getElementById('conv-2', 101, page1.snapshotId, '1')?.element - .selector, - ).toBe('#first-page'); - expect( - elementCache.getElementById('conv-2', 101, page2!.snapshotId, '1') - ?.element.selector, - ).toBe('#second-page'); - }); }); diff --git a/extension/src/commands/element-cache.ts b/extension/src/commands/element-cache.ts index a257817..61babb5 100644 --- a/extension/src/commands/element-cache.ts +++ b/extension/src/commands/element-cache.ts @@ -2,7 +2,8 @@ * Highlight snapshot cache manager. * * Two cache layers are maintained: - * 1. Frozen highlight inventories used for stable pagination across pages. + * 1. Per-call highlight inventories used to serve requested pages and keep + * page-local element IDs stable within one highlight response. * 2. Page-scoped highlight snapshots returned to callers and used for * element interactions together with page-local element IDs. */ @@ -253,60 +254,6 @@ class ElementCacheImpl { return snapshotPage; } - forkSnapshotPage( - conversationId: string, - tabId: number, - baseSnapshotId: number, - page: number, - ): HighlightSnapshotPage | undefined { - this.cleanupExpired(); - - const baseSnapshot = this.getSnapshotView( - conversationId, - tabId, - baseSnapshotId, - ); - if (!baseSnapshot) { - return undefined; - } - - const snapshotId = this.nextSnapshotId++; - const snapshotKey = this.buildSnapshotKey( - conversationId, - tabId, - snapshotId, - ); - const now = Date.now(); - - this.snapshotViews.set(snapshotKey, { - tabId, - inventoryId: baseSnapshot.inventoryId, - createdAt: now, - page, - }); - - const inventory = this.getInventory( - conversationId, - tabId, - baseSnapshot.inventoryId, - ); - if (inventory) { - this.touchInventory(inventory); - } - - const snapshotPage = this.getSnapshotPage( - conversationId, - tabId, - snapshotId, - ); - if (snapshotPage) { - console.log( - `📄 [ElementCache] Forked snapshot ${snapshotId} from base ${baseSnapshotId} for conversation ${conversationId}, tab ${tabId}, page ${page}`, - ); - } - return snapshotPage; - } - getSnapshotPage( conversationId: string, tabId: number, diff --git a/extension/src/types.ts b/extension/src/types.ts index 909ba7a..0685fd4 100644 --- a/extension/src/types.ts +++ b/extension/src/types.ts @@ -126,7 +126,6 @@ export interface HighlightElementsCommand extends BaseCommand { element_type?: ElementType; // Single element type for stable pagination page?: number; // 1-indexed page number for collision-aware pagination keywords?: string[]; // Keywords list to filter elements by detected semantic text (no pagination needed when provided) - highlight_snapshot_id?: number; // Previous highlight snapshot ID to continue the same frozen inventory across pages } export interface ClickElementCommand extends BaseCommand { diff --git a/extension/src/utils/highlight-screenshot.ts b/extension/src/utils/highlight-screenshot.ts index bb2d7fa..5c43102 100644 --- a/extension/src/utils/highlight-screenshot.ts +++ b/extension/src/utils/highlight-screenshot.ts @@ -22,6 +22,13 @@ export const HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS: ScreenshotCaptureOptions = { warmupMaxAttempts: 2, }; +export const HIGHLIGHT_PRECONDITION_CAPTURE_OPTIONS: ScreenshotCaptureOptions = + { + ...DEFAULT_SCREENSHOT_CAPTURE_OPTIONS, + warmupBeforeCapture: true, + warmupMaxAttempts: 3, + }; + export const TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS: ScreenshotCaptureOptions = { ...DEFAULT_SCREENSHOT_CAPTURE_OPTIONS, warmupBeforeCapture: true, diff --git a/pyproject.toml b/pyproject.toml index b955a1b..0b3e1d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,8 @@ dependencies = [ "pillow>=10.0.0", "numpy>=1.24.0", "requests>=2.31.0", - "openhands-sdk @ git+https://github.com/softpudding/agent-sdk.git@56c315b34a8d3f0b3d1c89cba57115dd859edbb0#subdirectory=openhands-sdk", - "openhands-tools @ git+https://github.com/softpudding/agent-sdk.git@56c315b34a8d3f0b3d1c89cba57115dd859edbb0#subdirectory=openhands-tools", + "openhands-sdk @ git+https://github.com/softpudding/agent-sdk.git@df47da7429a04cc2a5681e701331d85fcb798f1e#subdirectory=openhands-sdk", + "openhands-tools @ git+https://github.com/softpudding/agent-sdk.git@df47da7429a04cc2a5681e701331d85fcb798f1e#subdirectory=openhands-tools", "litellm @ git+https://github.com/softpudding/litellm.git@bfba5e3889829067baeab3b12d38008360913771", ] diff --git a/server/agent/prompts/big_model/element_interaction_tool.j2 b/server/agent/prompts/big_model/element_interaction_tool.j2 index 92ecf66..d307b86 100644 --- a/server/agent/prompts/big_model/element_interaction_tool.j2 +++ b/server/agent/prompts/big_model/element_interaction_tool.j2 @@ -45,7 +45,7 @@ Direct-execution actions **Snapshot rule**: Every initiating or direct element-targeted action must include both `highlight_snapshot_id` and `element_id`. `element_id` is page-local within one highlight snapshot response and is not valid by itself. `confirm_click` and `confirm_keyboard_input` operate on the current pending confirmation and do not need those fields repeated. -**Discovery rule**: If you do not yet have the correct `element_id`, continue discovery with highlight pagination instead of replacing the missing target with guessed keywords. On the same unchanged page state, your default next step is another `highlight` call with `element_type: "any"`, the next page, and the previous `highlight_snapshot_id`. After any significant page-state change caused by your last action, restart discovery with `highlight` on `element_type: "any"` page 1 without reusing the old snapshot, because it exposes extension-derived page insight you cannot infer reliably from intent alone. Do not jump straight to `keywords` or another narrower type on that changed page. Use `keywords` only when you already see the target's exact literal text on the target itself in the current screenshot and can copy it verbatim. If a control itself shows an icon plus `52`, the literal keyword is `52`, not guessed icon words like `star`, `favorite`, or `bookmark`. +**Discovery rule**: If you do not yet have the correct `element_id`, continue discovery with highlight pagination instead of replacing the missing target with guessed keywords. On the same unchanged page state, your default next step is another `highlight` call with `element_type: "any"` and the next page number. Each new `highlight` response returns a fresh `highlight_snapshot_id`, so use the newest returned pair for any later element action. After any significant page-state change caused by your last action, restart discovery with `highlight` on `element_type: "any"` page 1, because it exposes extension-derived page insight you cannot infer reliably from intent alone. Do not jump straight to `keywords` or another narrower type on that changed page. Use `keywords` only when you already see the target's exact literal text on the target itself in the current screenshot and can copy it verbatim. If a control itself shows an icon plus `52`, the literal keyword is `52`, not guessed icon words like `star`, `favorite`, or `bookmark`. ## Commands diff --git a/server/agent/prompts/big_model/highlight_tool.j2 b/server/agent/prompts/big_model/highlight_tool.j2 index 5ae0fcd..ec35eaa 100644 --- a/server/agent/prompts/big_model/highlight_tool.j2 +++ b/server/agent/prompts/big_model/highlight_tool.j2 @@ -36,7 +36,7 @@ When you use the highlight tool, elements are marked with **BLUE boxes**: Each `highlight` response returns a numeric `highlight_snapshot_id` plus page-local numeric `element_id` labels such as `1`, `2`, `3`. `element_id` values are only meaningful together with the matching `highlight_snapshot_id`. Each new `highlight` response creates a fresh `highlight_snapshot_id`. -To continue pagination on the same frozen inventory, pass the previous `highlight_snapshot_id` into the next `highlight` call. +Highlight pagination is based on the current page state, so use `page=2,3...` directly. The confirmation-only `highlight_single_element` flow does not create a new `highlight_snapshot_id`. ## Collision-Aware Pagination @@ -60,14 +60,13 @@ Capture a screenshot with numbered visual markers on interactive elements of ONE { "element_type": "hoverable" } // Hoverable elements { "element_type": "selectable" } // Select dropdowns { "element_type": "any" } // All interactive elements combined -{ "page": 2, "highlight_snapshot_id": 17 } // Next page of the same frozen any inventory +{ "page": 2 } // Page 2 of the current page state's any results { "keywords": ["Continue with Email"] } // Exact observed readable text only ``` **Parameters**: - `element_type`: Single type to highlight - "any" (default), "scrollable", "inputable", "hoverable", or "selectable" - `page`: Page number for pagination (1-indexed, default 1). Ignored when `keywords` is provided. -- `highlight_snapshot_id`: Previous highlight snapshot ID. Required when `page > 1` so pagination continues on the same frozen inventory instead of rescanning the page. - `keywords`: Exact literal text already visible on the target itself in the current screenshot that you can copy verbatim. Use this only to accelerate a known text match, not to discover guessed controls. When provided, only matching elements are returned without pagination. Example: ["Continue with Email", "View comments"] **When to Use Pagination**: @@ -76,10 +75,10 @@ Capture a screenshot with numbered visual markers on interactive elements of ONE - Use `any` first because it captures extension-derived structure and cross-type context that a narrower pass can hide - If the element you want to interact with is NOT visible on the current page, increment `page` to see more elements - If the page state is unchanged and you still have not found the target, your default next step is the next `any` page -- Treat pages as reliable collision-free slices of the same candidate set, not random samples +- Treat pages as reliable collision-free slices of the current page state's candidate set, not random samples - Continue to the next page until you find the most appropriate element for your task or exhaust the relevant pages - Stay on the same `element_type` across pages to browse through all elements of that category -- When paginating, reuse the previous `highlight_snapshot_id` so each next page comes from the same frozen inventory +- Each pagination call returns a fresh `highlight_snapshot_id`; use the newest one for any later element action - Do not jump from a first-page miss to `keywords` - Exhaust relevant highlight pages before switching strategies unless the page state changed or you already see the target's exact literal text on the target itself in the screenshot - Use `any` to inventory the changed page first, then use type-specific pagination only if you have a narrower reason on the same unchanged page state diff --git a/server/agent/prompts/small_model/element_interaction_tool.j2 b/server/agent/prompts/small_model/element_interaction_tool.j2 index 18f68cf..5c62aa2 100644 --- a/server/agent/prompts/small_model/element_interaction_tool.j2 +++ b/server/agent/prompts/small_model/element_interaction_tool.j2 @@ -15,11 +15,11 @@ Interact with one highlighted element at a time. Never skip from intent directly to action without a matching `highlight_snapshot_id` + `element_id` pair from highlight. -If the right `element_id` is missing, return to highlight and continue pagination instead of guessing a control name or target. On the same unchanged page state, your first reaction should be to continue `element_type: "any"` pagination with the previous `highlight_snapshot_id`. +If the right `element_id` is missing, return to highlight and continue pagination instead of guessing a control name or target. On the same unchanged page state, your first reaction should be to continue `element_type: "any"` pagination with the next `page` value. After any significant page-state change caused by your last action, such as navigation, tab switch, modal expansion, or opening a detail view, the next discovery step must be `highlight` with `element_type: "any"` because it exposes extension-derived page insight you cannot infer reliably from intent alone. -If the page did not significantly change, stay on the current `any` inventory and continue pagination until the target appears. Narrow to `inputable`, `scrollable`, `hoverable`, or `selectable` only when the task directly targets that affordance and the current `any` inventory was not enough. +If the page did not significantly change, stay on the current `any` discovery path and continue pagination until the target appears. Narrow to `inputable`, `scrollable`, `hoverable`, or `selectable` only when the task directly targets that affordance and the current `any` results were not enough. ## Commands diff --git a/server/agent/prompts/small_model/highlight_tool.j2 b/server/agent/prompts/small_model/highlight_tool.j2 index 650a7a4..e8f4e55 100644 --- a/server/agent/prompts/small_model/highlight_tool.j2 +++ b/server/agent/prompts/small_model/highlight_tool.j2 @@ -7,7 +7,7 @@ Find elements before interacting. 1. `element_type: "any"` is the default first pass for each new page state because it exposes extension-derived page insight across element types, neighboring controls, and other actionable structure that you cannot reliably infer from intent alone. 2. After any significant page-state change, immediately call `highlight` with `element_type: "any"` before choosing the next element. Significant changes include navigation to a different page or tab, opening a modal or expanded detail view, or any major layout change. 3. Do not jump away from `element_type: "any"` on a newly changed page before rebuilding the mixed-type inventory. -4. Treat highlight pagination as reliable. If the target is not on the current page, keep the same `element_type`, reuse the previous `highlight_snapshot_id`, and increment `page`. +4. Treat highlight pagination as reliable. If the target is not on the current page, keep the same `element_type` and increment `page`. 5. If the page state is unchanged and you cannot find the element you want to act on, your first reaction should be to continue `element_type: "any"` to the next page. 6. Buttons, links, icon-only controls, and dense toolbars should still be discovered through `any` pagination. 7. Stay inside pagination and supported `element_type` changes; do not replace a miss with guessed text, labels, or icon semantics. @@ -21,7 +21,7 @@ Find elements before interacting. { "element_type": "inputable" } { "element_type": "scrollable" } { "element_type": "selectable" } -{ "page": 2, "highlight_snapshot_id": 17 } +{ "page": 2 } ``` ## Selection Strategy @@ -31,8 +31,8 @@ Find elements before interacting. - Treat returned `element_id` values as page-local numeric labels such as `1`, `2`, `3` - Every highlight response also returns a `highlight_snapshot_id` - Use `element_id` together with `highlight_snapshot_id` -- To continue pagination on the same unchanged page state, reuse the previous `highlight_snapshot_id` -- A new `highlight` response returns a new `highlight_snapshot_id` +- Pagination is based on the current page state. Use `page=2,3...` directly; do not pass an old `highlight_snapshot_id` back into `highlight` +- A new `highlight` response always returns a new `highlight_snapshot_id` - After any significant page-state change, restart discovery with `highlight` and `element_type: "any"` before selecting the next target - Do not narrow a newly changed page just because you expect a button, submit control, close control, or toolbar icon - If the target is not on the first page, continue `page=2,3...` in the same highlight mode before changing strategy diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py index 4c9fa87..2df996e 100644 --- a/server/agent/tools/browser_executor.py +++ b/server/agent/tools/browser_executor.py @@ -278,13 +278,11 @@ def _execute_highlight_action( element_type = action.element_type or "any" page = action.page or 1 keywords = getattr(action, "keywords", None) - highlight_snapshot_id = action.highlight_snapshot_id command = HighlightElementsCommand( element_type=element_type, page=page, keywords=keywords, - highlight_snapshot_id=highlight_snapshot_id, conversation_id=self.conversation_id, ) result_dict = self._execute_command_sync(command) diff --git a/server/agent/tools/highlight_tool.py b/server/agent/tools/highlight_tool.py index daa9c95..a22a323 100644 --- a/server/agent/tools/highlight_tool.py +++ b/server/agent/tools/highlight_tool.py @@ -41,10 +41,6 @@ class BaseHighlightAction(OpenBrowserAction): ge=1, description="Page number for pagination (1-indexed).", ) - highlight_snapshot_id: Optional[int] = Field( - default=None, - description="Highlight snapshot ID from the previous highlight response. Required when page > 1 so pagination continues on the same frozen inventory.", - ) class HighlightAction(BaseHighlightAction): diff --git a/server/agent/user_help.py b/server/agent/user_help.py index 7e3c29d..bb3435f 100644 --- a/server/agent/user_help.py +++ b/server/agent/user_help.py @@ -6,7 +6,6 @@ from openhands.sdk.event import ActionEvent, MessageEvent from openhands.sdk.event.base import Event - PLEASE_HELP_ME_TOOL_NAME = "please_help_me" diff --git a/server/core/session_manager.py b/server/core/session_manager.py index 1e0f3bd..c02d146 100644 --- a/server/core/session_manager.py +++ b/server/core/session_manager.py @@ -122,8 +122,7 @@ def _init_database(self): cursor = conn.cursor() # Create sessions table - cursor.execute( - """ + cursor.execute(""" CREATE TABLE IF NOT EXISTS sessions ( conversation_id TEXT PRIMARY KEY, status TEXT NOT NULL DEFAULT 'idle', @@ -136,15 +135,13 @@ def _init_database(self): tags TEXT DEFAULT '[]', metadata TEXT DEFAULT '{}' ) - """ - ) + """) # Run migrations to add missing columns self._migrate_database(cursor) # Create user_messages table - cursor.execute( - """ + cursor.execute(""" CREATE TABLE IF NOT EXISTS user_messages ( id INTEGER PRIMARY KEY AUTOINCREMENT, conversation_id TEXT NOT NULL, @@ -154,12 +151,10 @@ def _init_database(self): FOREIGN KEY (conversation_id) REFERENCES sessions(conversation_id), UNIQUE(conversation_id, message_index) ) - """ - ) + """) # Create session_events table for SSE event history - cursor.execute( - """ + cursor.execute(""" CREATE TABLE IF NOT EXISTS session_events ( id INTEGER PRIMARY KEY AUTOINCREMENT, conversation_id TEXT NOT NULL, @@ -169,30 +164,23 @@ def _init_database(self): created_at TEXT NOT NULL, FOREIGN KEY (conversation_id) REFERENCES sessions(conversation_id) ) - """ - ) + """) # Create indexes for faster queries - cursor.execute( - """ + cursor.execute(""" CREATE INDEX IF NOT EXISTS idx_user_messages_conversation ON user_messages(conversation_id) - """ - ) + """) - cursor.execute( - """ + cursor.execute(""" CREATE INDEX IF NOT EXISTS idx_session_events_conversation ON session_events(conversation_id) - """ - ) + """) - cursor.execute( - """ + cursor.execute(""" CREATE INDEX IF NOT EXISTS idx_session_events_index ON session_events(conversation_id, event_index) - """ - ) + """) conn.commit() conn.close() @@ -210,11 +198,9 @@ def _migrate_database(self, cursor): if "first_user_message" not in columns: logger.info("Adding missing column: first_user_message") - cursor.execute( - """ + cursor.execute(""" ALTER TABLE sessions ADD COLUMN first_user_message TEXT - """ - ) + """) logger.info("Migration completed: first_user_message column added") except Exception as e: diff --git a/server/models/commands.py b/server/models/commands.py index 7682163..bc5a83e 100644 --- a/server/models/commands.py +++ b/server/models/commands.py @@ -268,18 +268,6 @@ class HighlightElementsCommand(BaseCommand): default=None, description="Exact observed text or stable tokens to filter by detected semantic text (visible text, labels, roles, and stable element tokens). Use only for wording already seen in the screenshot or returned HTML. When provided, only matching elements are returned (no pagination). Example: ['Continue with Email', 'View comments']", ) - highlight_snapshot_id: Optional[int] = Field( - default=None, - description="Highlight snapshot ID from a previous highlight_elements response. Required when page > 1 so pagination continues on the same frozen inventory.", - ) - - @model_validator(mode="after") - def validate_snapshot_pagination(self) -> "HighlightElementsCommand": - if (self.page or 1) > 1 and self.highlight_snapshot_id is None: - raise ValueError( - "highlight_snapshot_id is required when page > 1 so pagination stays on the same frozen highlight inventory" - ) - return self class ClickElementCommand(BaseCommand): diff --git a/server/tests/unit/test_command_models.py b/server/tests/unit/test_command_models.py index 140e3a3..42d18ef 100644 --- a/server/tests/unit/test_command_models.py +++ b/server/tests/unit/test_command_models.py @@ -41,6 +41,19 @@ def test_highlight_rejects_invalid_page_numbers(self, page: int) -> None: with pytest.raises(ValidationError): HighlightElementsCommand(page=page) + def test_highlight_allows_pagination_without_snapshot_id(self) -> None: + command = HighlightElementsCommand(page=2) + + assert command.page == 2 + + def test_highlight_ignores_snapshot_id_input_for_backward_compatibility( + self, + ) -> None: + command = HighlightElementsCommand(page=2, highlight_snapshot_id=101) + + assert command.page == 2 + assert "highlight_snapshot_id" not in command.model_dump() + class TestVisualInteractionContracts: def test_scroll_supports_page_level_scrolling_without_element_id(self) -> None: diff --git a/server/tests/unit/test_prompt_contracts.py b/server/tests/unit/test_prompt_contracts.py index 1a8a825..2dfafae 100644 --- a/server/tests/unit/test_prompt_contracts.py +++ b/server/tests/unit/test_prompt_contracts.py @@ -64,6 +64,7 @@ def test_highlight_prompt_matches_default_any_workflow(self) -> None: assert '"any" (default)' in description assert "default first pass for each new page state" in description assert "extension-derived page insight across element types" in description + assert '{ "page": 2, "highlight_snapshot_id": 17 }' not in description assert '"clickable" (default without keywords)' not in description def test_highlight_prompt_keeps_icon_targets_on_any_pagination(self) -> None: @@ -84,7 +85,7 @@ def test_highlight_prompt_requires_exact_text_keywords_and_pagination_before_gue description = get_highlight_tool_description() assert ( - "Treat pages as reliable collision-free slices of the same candidate set" + "Treat pages as reliable collision-free slices of the current page state's candidate set" in description ) assert "Do not jump from a first-page miss to `keywords`" in description @@ -113,6 +114,15 @@ def test_highlight_prompt_requires_rehighlight_after_significant_page_change( in description ) + def test_highlight_prompt_uses_page_number_pagination_without_snapshot_reuse( + self, + ) -> None: + description = get_highlight_tool_description() + + assert '{ "page": 2 }' in description + assert "reuse the previous `highlight_snapshot_id`" not in description + assert "same frozen inventory" not in description + def test_highlight_prompt_omits_clickable_mode_from_agent_guidance(self) -> None: description = get_highlight_tool_description() diff --git a/server/tests/unit/test_tool_prompt_profiles.py b/server/tests/unit/test_tool_prompt_profiles.py index 8d562ab..f53f28a 100644 --- a/server/tests/unit/test_tool_prompt_profiles.py +++ b/server/tests/unit/test_tool_prompt_profiles.py @@ -139,7 +139,7 @@ def test_large_model_highlight_prompt_keeps_detailed_pagination_guidance_without in description ) assert ( - "Treat pages as reliable collision-free slices of the same candidate set" + "Treat pages as reliable collision-free slices of the current page state's candidate set" in description ) assert "Do not jump from a first-page miss to `keywords`" in description diff --git a/uv.lock b/uv.lock index 88ad2be..9b9545d 100644 --- a/uv.lock +++ b/uv.lock @@ -1675,8 +1675,8 @@ requires-dist = [ { name = "litellm", git = "https://github.com/softpudding/litellm.git?rev=bfba5e3889829067baeab3b12d38008360913771" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.7.0" }, { name = "numpy", specifier = ">=1.24.0" }, - { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=56c315b34a8d3f0b3d1c89cba57115dd859edbb0" }, - { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=56c315b34a8d3f0b3d1c89cba57115dd859edbb0" }, + { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=df47da7429a04cc2a5681e701331d85fcb798f1e" }, + { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=df47da7429a04cc2a5681e701331d85fcb798f1e" }, { name = "pillow", specifier = ">=10.0.0" }, { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.0" }, { name = "pydantic", specifier = ">=2.5.0" }, @@ -2221,7 +2221,7 @@ wheels = [ [[package]] name = "openhands-sdk" version = "1.12.0" -source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=56c315b34a8d3f0b3d1c89cba57115dd859edbb0#56c315b34a8d3f0b3d1c89cba57115dd859edbb0" } +source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=df47da7429a04cc2a5681e701331d85fcb798f1e#df47da7429a04cc2a5681e701331d85fcb798f1e" } dependencies = [ { name = "agent-client-protocol" }, { name = "deprecation" }, @@ -2241,7 +2241,7 @@ dependencies = [ [[package]] name = "openhands-tools" version = "1.12.0" -source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=56c315b34a8d3f0b3d1c89cba57115dd859edbb0#56c315b34a8d3f0b3d1c89cba57115dd859edbb0" } +source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=df47da7429a04cc2a5681e701331d85fcb798f1e#df47da7429a04cc2a5681e701331d85fcb798f1e" } dependencies = [ { name = "bashlex" }, { name = "binaryornot" },