From 4f1d997a009289a92d485e68a0c964d42e7f86cf Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Mon, 30 Mar 2026 19:33:37 +0800 Subject: [PATCH] Refine highlight pagination and enforce explicit confirmations --- AGENTS.md | 3 +- bug_report_highlight_any_scrollable.md | 66 --- extension/src/background/index.ts | 75 ++- extension/src/commands/visual-highlight.ts | 437 +++++++++--------- extension/src/utils/collision-detection.ts | 29 +- server/agent/tools/base.py | 15 - server/agent/tools/browser_executor.py | 112 ----- .../tests/unit/test_agent_browser_executor.py | 159 ++++--- server/tests/unit/test_base_classes.py | 16 +- 9 files changed, 373 insertions(+), 539 deletions(-) delete mode 100644 bug_report_highlight_any_scrollable.md diff --git a/AGENTS.md b/AGENTS.md index 0feb22c..d9f94f2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -169,8 +169,7 @@ OpenBrowser uses Jinja2 templates for agent prompts, enabling dynamic content in - Model profile is resolved from session metadata and exposed to prompt rendering as `model_profile` / `small_model`; see `server/agent/manager.py` and `server/agent/tools/prompt_context.py` - Tool prompt variants are split by model profile under `server/agent/prompts/small_model/` and `server/agent/prompts/big_model/` - Small-model browser guidance intentionally avoids `keywords` fallback and leans harder on same-mode highlight pagination when dense UI may be split across collision-aware pages -- Observation rendering also differs by model profile: large models keep clickable highlights compact (`... and N clickable elements`), while small models include clickable element HTML in the LLM-visible observation text for extra semantic grounding -- The small-model clickable-observation branch is implemented in `server/agent/tools/base.py`; the per-conversation `small_model` flag is attached in `server/agent/tools/browser_executor.py` +- Observation rendering now includes clickable element HTML for all model profiles; prompt differences still vary by model profile ### Keyword Discipline - Highlight pagination remains the default discovery flow for controls and dense UI diff --git a/bug_report_highlight_any_scrollable.md b/bug_report_highlight_any_scrollable.md deleted file mode 100644 index 4c6dcc5..0000000 --- a/bug_report_highlight_any_scrollable.md +++ /dev/null @@ -1,66 +0,0 @@ -## Bug Report: `highlight_elements` with `type='any'` Never Returns Scrollable Elements - -**Date:** 2026-03-22 -**Component:** `extension/src/commands/highlight-detection.injected.js` -**Severity:** Medium -**Type:** Logic Error / Dead Code - ---- - -### Summary - -When calling `highlight_elements(element_type='any')`, scrollable elements are effectively **never returned** due to short-circuit logic in `resolveElementCandidate()`. The scrollable check (lines 948-957) is dead code for `type='any'`. - ---- - -### Root Cause - -In `resolveElementCandidate(el, 'any')` (lines 837-973): - -```javascript -const clickableCandidate = resolveClickableCandidate(el); // Traverses UP DOM tree - -if (clickableCandidate) { - return { type: 'clickable', ... }; // ← EARLY RETURN -} - -// Lines 924-969 below are NEVER reached when clickableCandidate exists: -if (isInputableCandidate(el)) // dead code -if (isSelectableCandidate(el)) // dead code -if (isScrollableCandidate(el)) // dead code ← SCROLLABLE NEVER RETURNED -if (isHoverableCandidate(el)) // dead code -``` - -The function short-circuits on the first clickable ancestor found (by traversing UP the DOM tree), making all subsequent type checks dead code for `type='any'`. - ---- - -### Expected Behavior - -Per collision-aware pagination design, `type='any'` should return elements across **all types** (clickable, inputable, selectable, scrollable, hoverable), prioritized by `HIGHLIGHT_TYPE_PRIORITY`. - ---- - -### Actual Behavior - -`type='any'` effectively becomes `type='clickable'`. Elements are only returned if: -- They themselves are clickable, OR -- Any **ancestor** has clickable characteristics (pointer cursor + text content) - -This causes: -- Scrollable divs to be misclassified as "clickable" (their ancestor) -- Hoverable elements to be skipped entirely -- False positives where a non-interactive ancestor shadows the actual target element - ---- - -### Affected Code Paths - -- `resolveElementCandidate()` — lines 912-922 (early return on `clickableCandidate`) -- `resolveClickableCandidate()` — lines 632-689 (DOM tree traversal with `isTightClickableWrapper` checks) - ---- - -### Suggested Fix - -For `type='any'`, collect candidates across ALL types and use `compareCandidates()` with `HIGHLIGHT_TYPE_PRIORITY` for selection — matching the behavior of the specific-type paths (lines 840-910) rather than short-circuiting. diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts index c725b93..ca73359 100644 --- a/extension/src/background/index.ts +++ b/extension/src/background/index.ts @@ -45,8 +45,7 @@ import { } from '../commands/label-constants'; import { getOrCreateUUID } from '../uuid/uuidGenerator'; import { - selectCollisionFreePage, - calculateTotalPages, + paginateCollisionFreeElements, sortElementsByVisualOrder, } from '../utils/collision-detection'; import { @@ -161,35 +160,25 @@ async function runRawScreenshotPrime(options: { function buildStoredHighlightPages(options: { filteredElements: InteractiveElement[]; - totalPages: number; viewportWidth: number; viewportHeight: number; keywordMode: boolean; }): InteractiveElement[][] { - const { - filteredElements, - totalPages, - viewportWidth, - viewportHeight, - keywordMode, - } = options; + const { filteredElements, viewportWidth, viewportHeight, keywordMode } = + options; if (keywordMode) { return [sortElementsByVisualOrder(filteredElements)]; } - const pages: InteractiveElement[][] = []; - for (let page = 1; page <= totalPages; page++) { - const pageElements = selectCollisionFreePage( - filteredElements, - page, - viewportWidth, - viewportHeight, - ); - pages.push(sortElementsByVisualOrder(pageElements)); - } - - return pages; + const collisionFreePages = paginateCollisionFreeElements( + filteredElements, + viewportWidth, + viewportHeight, + ); + return collisionFreePages.map((collisionFreePage) => + sortElementsByVisualOrder(collisionFreePage), + ); } function buildHighlightConsistencyScript( @@ -389,38 +378,39 @@ async function captureHighlightedPageState( `⏱️ [HighlightTrace] background keyword-filter ${Date.now() - keywordFilterStart}ms (keywords=${keywordList.length}, kept=${filteredElements.length}/${allElements.length})`, ); + let storedPages: InteractiveElement[][]; let paginatedElements: InteractiveElement[]; let totalPages: number; let currentPage = page; if (keywordList.length > 0) { - paginatedElements = filteredElements; - totalPages = 1; + storedPages = buildStoredHighlightPages({ + filteredElements, + viewportWidth: detectedViewportWidth, + viewportHeight: detectedViewportHeight, + keywordMode: true, + }); + paginatedElements = storedPages[0] ?? []; + totalPages = storedPages.length || 1; currentPage = 1; console.log( `🔍 [${logLabel}] Keywords [${keywordList.join(', ')}] matched ${paginatedElements.length} elements (no pagination)`, ); } else { - const paginationSelectionStart = Date.now(); - paginatedElements = selectCollisionFreePage( + const paginationBuildStart = Date.now(); + storedPages = buildStoredHighlightPages({ filteredElements, - page, - detectedViewportWidth, - detectedViewportHeight, - ); - const paginationSelectionMs = Date.now() - paginationSelectionStart; - const totalPagesStart = Date.now(); - totalPages = calculateTotalPages( - filteredElements, - detectedViewportWidth, - detectedViewportHeight, - ); - const totalPagesMs = Date.now() - totalPagesStart; + viewportWidth: detectedViewportWidth, + viewportHeight: detectedViewportHeight, + keywordMode: false, + }); + paginatedElements = storedPages[Math.max(0, page - 1)] ?? []; + totalPages = storedPages.length; console.log( `📄 [${logLabel}] Page ${page}/${totalPages}, showing ${paginatedElements.length} of ${filteredElements.length} elements`, ); console.log( - `⏱️ [HighlightTrace] background pagination select=${paginationSelectionMs}ms totalPages=${totalPagesMs}ms (page=${page}, viewport=${detectedViewportWidth}x${detectedViewportHeight})`, + `⏱️ [HighlightTrace] background pagination build-pages=${Date.now() - paginationBuildStart}ms (page=${page}, viewport=${detectedViewportWidth}x${detectedViewportHeight})`, ); } @@ -515,13 +505,6 @@ async function captureHighlightedPageState( ); } - const storedPages = buildStoredHighlightPages({ - filteredElements, - totalPages, - viewportWidth: detectedViewportWidth, - viewportHeight: detectedViewportHeight, - keywordMode: keywordList.length > 0, - }); const displayOrderedElements = storedPages[currentPage - 1] ?? []; const cacheStoreStart = Date.now(); diff --git a/extension/src/commands/visual-highlight.ts b/extension/src/commands/visual-highlight.ts index eba80fd..e368d3f 100644 --- a/extension/src/commands/visual-highlight.ts +++ b/extension/src/commands/visual-highlight.ts @@ -36,10 +36,8 @@ const LABEL_BG_COLORS: Record = { any: 'rgba(0, 204, 204, 0.7)', }; -/** - * Maximum number of elements to draw at once to prevent performance issues - */ -const MAX_ELEMENTS = 50; +const DRAW_YIELD_EVERY = 12; +const MAX_ENCODED_IMAGE_SIZE = 10 * 1024 * 1024; // 10MB limit for the final data URL /** * Base sizes in CSS pixels (will be multiplied by scale for device pixels) @@ -47,13 +45,23 @@ const MAX_ELEMENTS = 50; const BASE_BOX_PADDING = 1; // Box padding at scale=1 (minimal to avoid covering content) const BASE_LINE_WIDTH = 1.5; // Box border width at scale=1 (thinner to reduce overlap) +interface PreparedBoundingBox { + element: InteractiveElement; + boxX: number; + boxY: number; + boxWidth: number; + boxHeight: number; + color: string; + bgColor: string; +} + /** * Draw highlights (bounding boxes with labels) on a screenshot * * @param screenshotDataUrl - Base64 data URL of the screenshot * @param elements - Array of interactive elements to highlight * @param options - Highlight options (limit, offset, elementTypes filter, scale) - * @returns Promise resolving to base64 PNG data URL with highlights drawn + * @returns Promise resolving to a base64 data URL with highlights drawn */ export async function drawHighlights( screenshotDataUrl: string, @@ -179,11 +187,6 @@ export async function drawHighlights( `📐 [VisualHighlight] Using scale: ${scale.toFixed(3)} (provided: ${providedScale}, calculated: ${actualScale})`, ); - // Create OffscreenCanvas with same dimensions as screenshot - console.log( - `🖼️ [VisualHighlight] Screenshot dimensions: ${imageBitmap.width}x${imageBitmap.height}`, - ); - // Create OffscreenCanvas with same dimensions as screenshot const canvas = new OffscreenCanvas(imageBitmap.width, imageBitmap.height); const ctx = canvas.getContext('2d'); @@ -200,41 +203,22 @@ export async function drawHighlights( // Close the bitmap to free memory imageBitmap.close(); - // Note: Single-type design - caller already filtered by element_type - // All elements passed here are of the same type console.log( `🎨 [VisualHighlight] Drawing ${elements.length} elements (type: ${elements[0]?.type || 'none'}, scale=${scale})`, ); - const elementsToDraw = elements; - - // Draw each element's bounding box and label - // Scale coordinates from CSS pixels to device pixels - for (const element of elementsToDraw) { - drawBoundingBox(ctx, element, scale); - } - - const resultBlob = await canvas.convertToBlob({ type: 'image/png' }); - - // Convert blob to data URL - const dataUrl = await new Promise((resolve, reject) => { - const reader = new FileReader(); - reader.onloadend = () => resolve(reader.result as string); - reader.onerror = () => - reject( - new Error('[VisualHighlight] Failed to convert result to data URL'), - ); - reader.readAsDataURL(resultBlob); - }); - - // Compress if too large (>10MB) - const MAX_SIZE = 10 * 1024 * 1024; // 10MB limit - const compressedDataUrl = await compressDataUrl(dataUrl, MAX_SIZE); + const preparedBoxes = await prepareBoundingBoxes(elements, scale); + await drawBoundingBoxes(ctx, preparedBoxes, scale); + await drawLabels(ctx, preparedBoxes, scale); + const encodedResult = await encodeCanvasWithCompression( + canvas, + MAX_ENCODED_IMAGE_SIZE, + ); console.log( - `✅ [VisualHighlight] Highlights drawn successfully, size: ${compressedDataUrl.length} bytes`, + `✅ [VisualHighlight] Highlights drawn successfully, size: ${encodedResult.byteLength} bytes`, ); - return compressedDataUrl; + return encodedResult.dataUrl; } catch (error) { const errorMsg = `[VisualHighlight] Error drawing highlights: ${error instanceof Error ? error.message : error}`; console.error(`❌ ${errorMsg}`); @@ -242,230 +226,237 @@ export async function drawHighlights( } } -/** - * Compress a data URL if it exceeds the maximum size - * Uses JPEG encoding with quality reduction - * - * @param dataUrl - Original data URL (PNG or JPEG) - * @param maxSize - Maximum allowed size in bytes - * @returns Compressed data URL (JPEG if compression was needed) - */ -async function compressDataUrl( - dataUrl: string, - maxSize: number, -): Promise { - if (dataUrl.length <= maxSize) { - console.log( - `🖼️ [Compress] Image size ${dataUrl.length} bytes is within limit ${maxSize} bytes`, - ); - return dataUrl; - } +async function prepareBoundingBoxes( + elements: InteractiveElement[], + scale: number, +): Promise { + const preparedBoxes: PreparedBoundingBox[] = []; - console.log( - `⚠️ [Compress] Image size ${dataUrl.length} bytes exceeds limit ${maxSize} bytes, compressing...`, - ); + for (let index = 0; index < elements.length; index++) { + preparedBoxes.push(buildPreparedBoundingBox(elements[index], scale)); - // Check if OffscreenCanvas is available - if ( - typeof OffscreenCanvas === 'undefined' || - typeof createImageBitmap === 'undefined' - ) { - console.warn( - '[Compress] OffscreenCanvas not available, returning original image', - ); - return dataUrl; + if ((index + 1) % DRAW_YIELD_EVERY === 0) { + await yieldToMainThread(); + } } - try { - // Convert data URL to Blob - const response = await fetch(dataUrl); - const blob = await response.blob(); + return preparedBoxes; +} - // Create ImageBitmap - const imageBitmap = await createImageBitmap(blob); - const { width, height } = imageBitmap; +function buildPreparedBoundingBox( + element: InteractiveElement, + scale: number, +): PreparedBoundingBox { + const { x, y, width, height } = element.bbox; + const boxPadding = Math.round(BASE_BOX_PADDING * scale); + const boxX = Math.round(x * scale) - boxPadding; + const boxY = Math.round(y * scale) - boxPadding; + const boxWidth = Math.round(width * scale) + boxPadding * 2; + const boxHeight = Math.round(height * scale) + boxPadding * 2; - console.log(`🖼️ [Compress] Original dimensions: ${width}x${height}`); + return { + element, + boxX, + boxY, + boxWidth, + boxHeight, + color: COLORS[element.type] || '#CCCCCC', + bgColor: LABEL_BG_COLORS[element.type] || 'rgba(200, 200, 200, 0.7)', + }; +} - // Try different JPEG qualities - const qualities = [0.8, 0.7, 0.6, 0.5, 0.4]; +async function drawBoundingBoxes( + ctx: OffscreenCanvasRenderingContext2D, + preparedBoxes: PreparedBoundingBox[], + scale: number, +): Promise { + if (preparedBoxes.length === 0) { + return; + } - for (const quality of qualities) { - const canvas = new OffscreenCanvas(width, height); - const ctx = canvas.getContext('2d'); + const groupedBoxes = new Map(); + for (const preparedBox of preparedBoxes) { + const boxes = groupedBoxes.get(preparedBox.color) ?? []; + boxes.push(preparedBox); + groupedBoxes.set(preparedBox.color, boxes); + } - if (!ctx) { - throw new Error('[Compress] Failed to get 2d context'); - } + ctx.lineWidth = BASE_LINE_WIDTH * scale; - // Fill with white background (for JPEG) - ctx.fillStyle = '#FFFFFF'; - ctx.fillRect(0, 0, width, height); - ctx.drawImage(imageBitmap, 0, 0); - - const compressedBlob = await canvas.convertToBlob({ - type: 'image/jpeg', - quality: quality, - }); - - // Convert to data URL - const compressedDataUrl = await new Promise((resolve, reject) => { - const reader = new FileReader(); - reader.onloadend = () => resolve(reader.result as string); - reader.onerror = () => - reject(new Error('[Compress] Failed to read compressed blob')); - reader.readAsDataURL(compressedBlob); - }); + for (const [color, boxes] of groupedBoxes) { + ctx.strokeStyle = color; + ctx.beginPath(); - console.log( - `🖼️ [Compress] JPEG quality=${quality}: ${compressedDataUrl.length} bytes`, - ); + for (let index = 0; index < boxes.length; index++) { + const box = boxes[index]; + ctx.rect(box.boxX, box.boxY, box.boxWidth, box.boxHeight); - if (compressedDataUrl.length <= maxSize) { - console.log( - `✅ [Compress] Compressed to ${compressedDataUrl.length} bytes with quality=${quality}`, - ); - return compressedDataUrl; + if ((index + 1) % DRAW_YIELD_EVERY === 0 && index + 1 < boxes.length) { + ctx.stroke(); + ctx.beginPath(); + await yieldToMainThread(); } } - // If still too large, try reducing dimensions - console.log( - `⚠️ [Compress] Quality reduction not enough, trying to reduce dimensions...`, + ctx.stroke(); + } +} + +async function drawLabels( + ctx: OffscreenCanvasRenderingContext2D, + preparedBoxes: PreparedBoundingBox[], + scale: number, +): Promise { + if (preparedBoxes.length === 0) { + return; + } + + ctx.textBaseline = 'top'; + + for (let index = 0; index < preparedBoxes.length; index++) { + const preparedBox = preparedBoxes[index]; + drawLabel( + ctx, + preparedBox.element.id, + preparedBox.boxX, + preparedBox.boxY, + preparedBox.boxWidth, + preparedBox.boxHeight, + preparedBox.bgColor, + scale, + preparedBox.element.labelPosition || 'above', ); - const scales = [0.75, 0.5, 0.4]; - for (const scale of scales) { - const newWidth = Math.round(width * scale); - const newHeight = Math.round(height * scale); + if ((index + 1) % DRAW_YIELD_EVERY === 0) { + await yieldToMainThread(); + } + } +} - const canvas = new OffscreenCanvas(newWidth, newHeight); - const ctx = canvas.getContext('2d'); +async function encodeCanvasWithCompression( + canvas: OffscreenCanvas, + maxSize: number, +): Promise<{ dataUrl: string; byteLength: number }> { + const pngBlob = await canvas.convertToBlob({ type: 'image/png' }); + const pngEstimatedSize = estimateDataUrlLength(pngBlob); - if (!ctx) { - throw new Error('[Compress] Failed to get 2d context'); - } + if (pngEstimatedSize <= maxSize) { + console.log( + `🖼️ [Compress] Image size ${pngEstimatedSize} bytes is within limit ${maxSize} bytes`, + ); + return buildEncodedResult(pngBlob); + } - ctx.fillStyle = '#FFFFFF'; - ctx.fillRect(0, 0, newWidth, newHeight); - ctx.drawImage(imageBitmap, 0, 0, newWidth, newHeight); + console.log( + `⚠️ [Compress] Image size ${pngEstimatedSize} bytes exceeds limit ${maxSize} bytes, compressing...`, + ); - const compressedBlob = await canvas.convertToBlob({ - type: 'image/jpeg', - quality: 0.6, - }); + const qualities = [0.8, 0.7, 0.6, 0.5, 0.4]; + for (const quality of qualities) { + const compressedBlob = await canvas.convertToBlob({ + type: 'image/jpeg', + quality, + }); + const estimatedSize = estimateDataUrlLength(compressedBlob); - const compressedDataUrl = await new Promise((resolve, reject) => { - const reader = new FileReader(); - reader.onloadend = () => resolve(reader.result as string); - reader.onerror = () => - reject(new Error('[Compress] Failed to read compressed blob')); - reader.readAsDataURL(compressedBlob); - }); + console.log( + `🖼️ [Compress] JPEG quality=${quality}: ${estimatedSize} bytes`, + ); + if (estimatedSize <= maxSize) { console.log( - `🖼️ [Compress] Scale=${scale} (${newWidth}x${newHeight}): ${compressedDataUrl.length} bytes`, + `✅ [Compress] Compressed to ${estimatedSize} bytes with quality=${quality}`, ); - - if (compressedDataUrl.length <= maxSize) { - console.log( - `✅ [Compress] Compressed to ${compressedDataUrl.length} bytes with scale=${scale}`, - ); - return compressedDataUrl; - } + return buildEncodedResult(compressedBlob); } + } - // Return the smallest version even if still over limit - console.warn( - `⚠️ [Compress] Could not compress below ${maxSize} bytes, returning smallest version`, - ); + console.log( + '⚠️ [Compress] Quality reduction not enough, trying to reduce dimensions...', + ); + + const scales = [0.75, 0.5, 0.4]; + for (const scale of scales) { + const scaledCanvas = createScaledCanvas(canvas, scale); + const compressedBlob = await scaledCanvas.convertToBlob({ + type: 'image/jpeg', + quality: 0.6, + }); + const estimatedSize = estimateDataUrlLength(compressedBlob); - // Return 50% scale JPEG with 0.4 quality as final fallback - const finalCanvas = new OffscreenCanvas( - Math.round(width * 0.5), - Math.round(height * 0.5), + console.log( + `🖼️ [Compress] Scale=${scale} (${scaledCanvas.width}x${scaledCanvas.height}): ${estimatedSize} bytes`, ); - const finalCtx = finalCanvas.getContext('2d'); - if (finalCtx) { - finalCtx.fillStyle = '#FFFFFF'; - finalCtx.fillRect(0, 0, finalCanvas.width, finalCanvas.height); - finalCtx.drawImage( - imageBitmap, - 0, - 0, - finalCanvas.width, - finalCanvas.height, + + if (estimatedSize <= maxSize) { + console.log( + `✅ [Compress] Compressed to ${estimatedSize} bytes with scale=${scale}`, ); - const finalBlob = await finalCanvas.convertToBlob({ - type: 'image/jpeg', - quality: 0.4, - }); - return new Promise((resolve, reject) => { - const reader = new FileReader(); - reader.onloadend = () => resolve(reader.result as string); - reader.onerror = () => - reject(new Error('[Compress] Failed to read final blob')); - reader.readAsDataURL(finalBlob); - }); + return buildEncodedResult(compressedBlob); } - - return dataUrl; - } catch (error) { - console.error( - `❌ [Compress] Compression failed: ${error instanceof Error ? error.message : error}`, - ); - return dataUrl; } + + console.warn( + `⚠️ [Compress] Could not compress below ${maxSize} bytes, returning smallest version`, + ); + + const fallbackCanvas = createScaledCanvas(canvas, 0.5); + const fallbackBlob = await fallbackCanvas.convertToBlob({ + type: 'image/jpeg', + quality: 0.4, + }); + return buildEncodedResult(fallbackBlob); } -/** - * Draw a bounding box with label for an interactive element - * - * @param ctx - Canvas 2D rendering context - * @param element - Interactive element to draw - * @param scale - Scale factor to convert CSS pixels to device pixels (default: 1) - */ -function drawBoundingBox( - ctx: OffscreenCanvasRenderingContext2D, - element: InteractiveElement, - scale: number = 1, -): void { - const color = COLORS[element.type] || '#CCCCCC'; - const { x, y, width, height } = element.bbox; +async function buildEncodedResult( + blob: Blob, +): Promise<{ dataUrl: string; byteLength: number }> { + const dataUrl = await blobToDataUrl(blob); + return { + dataUrl, + byteLength: dataUrl.length, + }; +} - // Calculate device-pixel values from base CSS sizes - const boxPadding = Math.round(BASE_BOX_PADDING * scale); - const lineWidth = BASE_LINE_WIDTH * scale; +function estimateDataUrlLength(blob: Blob): number { + const headerLength = `data:${blob.type || 'application/octet-stream'};base64,` + .length; + return headerLength + Math.ceil(blob.size / 3) * 4; +} - // Apply scale to convert CSS pixels to device pixels, then apply padding - // x, y are viewport-relative from getBoundingClientRect() - const boxX = Math.round(x * scale) - boxPadding; - const boxY = Math.round(y * scale) - boxPadding; - const boxWidth = Math.round(width * scale) + boxPadding * 2; - const boxHeight = Math.round(height * scale) + boxPadding * 2; +function createScaledCanvas( + sourceCanvas: OffscreenCanvas, + scale: number, +): OffscreenCanvas { + const scaledWidth = Math.max(1, Math.round(sourceCanvas.width * scale)); + const scaledHeight = Math.max(1, Math.round(sourceCanvas.height * scale)); + const scaledCanvas = new OffscreenCanvas(scaledWidth, scaledHeight); + const scaledCtx = scaledCanvas.getContext('2d'); + + if (!scaledCtx) { + throw new Error('[Compress] Failed to get 2d context'); + } - console.log( - `[VisualHighlight] Drawing bbox for ${element.id}: CSS(${x}, ${y}, ${width}, ${height}) → Device(${boxX}, ${boxY}, ${boxWidth}, ${boxHeight}) scale=${scale}`, - ); + scaledCtx.fillStyle = '#FFFFFF'; + scaledCtx.fillRect(0, 0, scaledWidth, scaledHeight); + scaledCtx.drawImage(sourceCanvas, 0, 0, scaledWidth, scaledHeight); + return scaledCanvas; +} - // Draw bounding box - ctx.strokeStyle = color; - ctx.lineWidth = lineWidth; - ctx.strokeRect(boxX, boxY, boxWidth, boxHeight); +async function blobToDataUrl(blob: Blob): Promise { + return new Promise((resolve, reject) => { + const reader = new FileReader(); + reader.onloadend = () => resolve(reader.result as string); + reader.onerror = () => + reject( + new Error('[VisualHighlight] Failed to convert result to data URL'), + ); + reader.readAsDataURL(blob); + }); +} - // Draw element ID label with transparent background - const bgColor = LABEL_BG_COLORS[element.type] || 'rgba(200, 200, 200, 0.7)'; - drawLabel( - ctx, - element.id, - boxX, - boxY, - boxWidth, - boxHeight, - bgColor, - scale, - element.labelPosition || 'above', - ); +async function yieldToMainThread(): Promise { + await new Promise((resolve) => setTimeout(resolve, 0)); } /** diff --git a/extension/src/utils/collision-detection.ts b/extension/src/utils/collision-detection.ts index 31c69ca..054abc2 100644 --- a/extension/src/utils/collision-detection.ts +++ b/extension/src/utils/collision-detection.ts @@ -206,6 +206,31 @@ export function isLabelWithinViewport( ); } +/** + * Build collision-free highlight pages using a constraint-aware greedy + * algorithm. It places the most-constrained remaining element first, then + * chooses the label position that blocks the fewest later elements. + * + * @param elements - All elements sorted by priority + * @param page - 1-indexed page number + * @param viewportWidth - Optional viewport width for boundary checks + * @param viewportHeight - Optional viewport height for boundary checks + * @returns All collision-free pages up to maxPages when provided + */ +export function paginateCollisionFreeElements( + elements: InteractiveElement[], + viewportWidth?: number, + viewportHeight?: number, + maxPages?: number, +): InteractiveElement[][] { + return buildCollisionFreePages( + elements, + viewportWidth, + viewportHeight, + maxPages, + ); +} + /** * Select a collision-free page of elements using a constraint-aware greedy * algorithm. It places the most-constrained remaining element first, then @@ -227,7 +252,7 @@ export function selectCollisionFreePage( return []; } - const pages = buildCollisionFreePages( + const pages = paginateCollisionFreeElements( elements, viewportWidth, viewportHeight, @@ -245,7 +270,7 @@ export function calculateTotalPages( viewportWidth?: number, viewportHeight?: number, ): number { - return buildCollisionFreePages(elements, viewportWidth, viewportHeight) + return paginateCollisionFreeElements(elements, viewportWidth, viewportHeight) .length; } diff --git a/server/agent/tools/base.py b/server/agent/tools/base.py index 66af7bc..37a330e 100644 --- a/server/agent/tools/base.py +++ b/server/agent/tools/base.py @@ -383,14 +383,9 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]: text_parts.append("") # Format: id: for each element element_descriptions = [] - clickable_count = 0 - include_clickable_html = bool(self.small_model) for el in self.highlighted_elements: el_id = el.get("id", "unknown") el_type = el.get("type") - if el_type == "clickable" and not include_clickable_html: - clickable_count += 1 - continue raw_hints = ( el.get("interactionHints") or el.get("interaction_hints") or [] ) @@ -415,16 +410,6 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]: else: tag = el.get("tagName", "").upper() element_descriptions.append(f"{display_id} ({tag})") - if clickable_count: - clickable_label = ( - f"{clickable_count} clickable element" - if clickable_count == 1 - else f"{clickable_count} clickable elements" - ) - if element_descriptions: - element_descriptions.append(f"... and {clickable_label}") - else: - element_descriptions.append(clickable_label) text_parts.append("\n".join(element_descriptions)) text_parts.append("") diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py index 778871d..41be4bb 100644 --- a/server/agent/tools/browser_executor.py +++ b/server/agent/tools/browser_executor.py @@ -12,7 +12,6 @@ """ import asyncio -from collections import OrderedDict import logging import threading from typing import Any, Dict, Optional, Union @@ -52,26 +51,7 @@ logger = logging.getLogger(__name__) -CONFIRMED_ACTION_ID_CACHE_SIZE = 10 ELEMENT_HTML_CACHE_MISS_PLACEHOLDER = "" -VISUAL_SAFE_ELEMENT_ID_LENGTH = 3 -VISUAL_SAFE_ELEMENT_ID_CHAR_MAP = { - "0": "O", - "o": "O", - "O": "O", - "i": "1", - "I": "1", - "l": "1", - "L": "1", - "z": "2", - "Z": "2", - "s": "5", - "S": "5", - "g": "6", - "G": "6", - "b": "8", - "B": "8", -} # Global registry for shared BrowserExecutor instances per conversation # Key: conversation_id (str), Value: BrowserExecutor instance @@ -121,8 +101,6 @@ def __init__(self): self.conversation_id = None # Pending confirmations per conversation for 2PC actions. self.pending_confirmations: Dict[str, Dict[str, Any]] = {} - # Recently confirmed element targets keyed by action_type then element_id. - self.confirmed_action_id_lru: Dict[str, Dict[str, OrderedDict[str, None]]] = {} def _uses_small_model(self) -> bool: """Whether the active conversation uses the small-model profile.""" @@ -380,18 +358,6 @@ def _execute_element_interaction_action( if action_type == "click": if not action.element_id: raise ValueError("click requires element_id parameter") - if self._has_confirmed_action_element_id("click", action.element_id): - command = ClickElementCommand( - element_id=action.element_id, - conversation_id=self.conversation_id, - tab_id=action.tab_id, - ) - result_dict = self._execute_element_command(command, "click element") - return self._build_observation_from_result( - result_dict, - f"Clicked element: {action.element_id}", - element_id=action.element_id, - ) element_preview = self._get_element_full_html(action.element_id, "click") full_html = element_preview[0] screenshot = element_preview[1] @@ -504,21 +470,6 @@ def _execute_element_interaction_action( raise ValueError("keyboard_input requires element_id parameter") if not action.text: raise ValueError("keyboard_input requires text parameter") - if self._has_confirmed_action_element_id( - "keyboard_input", action.element_id - ): - command = KeyboardInputCommand( - element_id=action.element_id, - text=action.text, - conversation_id=self.conversation_id, - tab_id=action.tab_id, - ) - result_dict = self._execute_element_command(command, "input text") - return self._build_observation_from_result( - result_dict, - f"Input text to element: {action.element_id}", - element_id=action.element_id, - ) element_preview = self._get_element_full_html( action.element_id, "keyboard_input" ) @@ -605,7 +556,6 @@ def _execute_element_interaction_action( if not result_dict or not result_dict.get("success"): ext_error = self._extract_result_error(result_dict) raise RuntimeError(f"Failed to click element: {ext_error}") - self._remember_confirmed_action_element_id("click", pending_element_id) message = f"Confirmed and clicked element: {pending_element_id}" self._clear_pending_confirmation() return self._build_observation_from_result( @@ -636,9 +586,6 @@ def _execute_element_interaction_action( if not result_dict or not result_dict.get("success"): ext_error = self._extract_result_error(result_dict) raise RuntimeError(f"Failed to input text: {ext_error}") - self._remember_confirmed_action_element_id( - "keyboard_input", pending_element_id - ) message = f"Confirmed and input text to element: {pending_element_id}" self._clear_pending_confirmation() return self._build_observation_from_result( @@ -727,24 +674,6 @@ def _get_pending_confirmation(self) -> Optional[Dict[str, Any]]: """Get pending confirmation for current conversation""" return self.pending_confirmations.get(self.conversation_id) - def _normalize_confirmed_action_element_id( - self, element_id: Optional[str] - ) -> Optional[str]: - """Normalize cached element ids used for repeat-confirmation shortcuts.""" - if not isinstance(element_id, str): - return None - - compact = element_id.strip().replace(" ", "") - if not compact: - return None - - if len(compact) != VISUAL_SAFE_ELEMENT_ID_LENGTH or not compact.isalnum(): - return compact - - return "".join( - VISUAL_SAFE_ELEMENT_ID_CHAR_MAP.get(char, char.upper()) for char in compact - ) - def _build_element_id_resolution_note( self, requested_element_id: Optional[str], @@ -764,47 +693,6 @@ def _build_element_id_resolution_note( f"'{resolved_element_id}'." ) - def _get_confirmed_action_element_id_lru( - self, action_type: str - ) -> OrderedDict[str, None]: - """Get or create the confirmed-action element-id LRU cache for current conversation.""" - conversation_lru = self.confirmed_action_id_lru.setdefault( - self.conversation_id, {} - ) - return conversation_lru.setdefault(action_type, OrderedDict()) - - def _has_confirmed_action_element_id( - self, action_type: str, element_id: Optional[str] - ) -> bool: - """Return whether this exact element id was recently confirmed for the action.""" - normalized = self._normalize_confirmed_action_element_id(element_id) - if normalized is None: - return False - - lru = self._get_confirmed_action_element_id_lru(action_type) - if normalized not in lru: - return False - - lru.move_to_end(normalized) - return True - - def _remember_confirmed_action_element_id( - self, action_type: str, element_id: Optional[str] - ) -> None: - """Record a confirmed-action element id in the per-conversation LRU cache.""" - normalized = self._normalize_confirmed_action_element_id(element_id) - if normalized is None: - return - - lru = self._get_confirmed_action_element_id_lru(action_type) - if normalized in lru: - lru.move_to_end(normalized) - else: - lru[normalized] = None - - while len(lru) > CONFIRMED_ACTION_ID_CACHE_SIZE: - lru.popitem(last=False) - def _extract_result_error( self, result_dict: Optional[Dict[str, Any]], default: str = "Unknown error" ) -> str: diff --git a/server/tests/unit/test_agent_browser_executor.py b/server/tests/unit/test_agent_browser_executor.py index 2e93542..db9a669 100644 --- a/server/tests/unit/test_agent_browser_executor.py +++ b/server/tests/unit/test_agent_browser_executor.py @@ -8,7 +8,6 @@ from server.agent.tools.element_interaction_tool import ElementInteractionAction from server.agent.tools.highlight_tool import HighlightAction from server.models.commands import ( - ClickElementCommand, HoverElementCommand, SwipeElementCommand, ) @@ -264,31 +263,25 @@ def fake_execute(command): assert executor._get_pending_confirmation() is None -def test_repeat_click_with_confirmed_element_id_executes_without_pending_confirmation( +def test_click_always_requires_pending_confirmation_even_after_prior_confirm( monkeypatch, ) -> None: executor = BrowserExecutor() executor.conversation_id = "conv-repeat-click" - executor._remember_confirmed_action_element_id("click", "15") monkeypatch.setattr( executor, "_get_element_full_html", - lambda *args, **kwargs: (_ for _ in ()).throw( - AssertionError("should not fetch confirmation preview") + lambda element_id, intended_action=None: ( + "", + "data:image/png;base64,pending", ), ) - - captured = {} - - def fake_execute(command): - captured["command"] = command - return { - "success": True, - "data": {"screenshot": "data:image/png;base64,clicked-direct"}, - } - - monkeypatch.setattr(executor, "_execute_command_sync", fake_execute) + monkeypatch.setattr( + executor, + "_execute_command_sync", + lambda command: (_ for _ in ()).throw(AssertionError("should stay in 2PC")), + ) observation = executor._execute_action_sync( ElementInteractionAction( @@ -300,12 +293,10 @@ def fake_execute(command): ) assert observation.success is True - assert observation.message == "Clicked element: 15" - assert observation.pending_confirmation is None - assert isinstance(captured["command"], ClickElementCommand) - assert captured["command"].element_id == "15" - assert captured["command"].tab_id == 456 - assert executor._get_pending_confirmation() is None + assert observation.message == "Click action pending confirmation for element: 15" + assert observation.pending_confirmation is not None + assert observation.pending_confirmation["action_type"] == "click" + assert observation.pending_confirmation["element_id"] == "15" def test_unconfirmed_click_element_id_still_enters_pending_confirmation( @@ -344,53 +335,25 @@ def test_unconfirmed_click_element_id_still_enters_pending_confirmation( ) -def test_confirmed_action_element_id_lru_keeps_only_most_recent_ten_entries() -> None: - executor = BrowserExecutor() - executor.conversation_id = "conv-repeat-lru" - - for i in range(11): - executor._remember_confirmed_action_element_id("click", f"id-{i}") - - lru = executor._get_confirmed_action_element_id_lru("click") - - assert len(lru) == 10 - assert "id-0" not in lru - assert "id-10" in lru - - -def test_confirmed_action_element_id_normalization_only_applies_to_visual_ids() -> None: - executor = BrowserExecutor() - - assert executor._normalize_confirmed_action_element_id("D02") == "DO2" - assert executor._normalize_confirmed_action_element_id(" d o 2 ") == "DO2" - assert executor._normalize_confirmed_action_element_id("id-10") == "id-10" - - -def test_repeat_keyboard_input_with_confirmed_element_id_executes_without_pending_confirmation( +def test_keyboard_input_always_requires_pending_confirmation_even_after_prior_confirm( monkeypatch, ) -> None: executor = BrowserExecutor() executor.conversation_id = "conv-repeat-input" - executor._remember_confirmed_action_element_id("keyboard_input", "inp123") monkeypatch.setattr( executor, "_get_element_full_html", - lambda *args, **kwargs: (_ for _ in ()).throw( - AssertionError("should not fetch confirmation preview") + lambda element_id, intended_action=None: ( + '', + "data:image/png;base64,input-pending", ), ) - - captured = {} - - def fake_execute(command): - captured["command"] = command - return { - "success": True, - "data": {"screenshot": "data:image/png;base64,input-direct"}, - } - - monkeypatch.setattr(executor, "_execute_command_sync", fake_execute) + monkeypatch.setattr( + executor, + "_execute_command_sync", + lambda command: (_ for _ in ()).throw(AssertionError("should stay in 2PC")), + ) observation = executor._execute_action_sync( ElementInteractionAction( @@ -403,12 +366,13 @@ def fake_execute(command): ) assert observation.success is True - assert observation.message == "Input text to element: inp123" - assert observation.pending_confirmation is None - assert captured["command"].element_id == "inp123" - assert captured["command"].text == "hello" - assert captured["command"].tab_id == 321 - assert executor._get_pending_confirmation() is None + assert ( + observation.message + == "Keyboard input action pending confirmation for element: inp123" + ) + assert observation.pending_confirmation is not None + assert observation.pending_confirmation["action_type"] == "keyboard_input" + assert observation.pending_confirmation["element_id"] == "inp123" def test_click_pending_confirmation_records_corrected_element_id(monkeypatch) -> None: @@ -458,7 +422,6 @@ def test_confirmed_click_element_id_does_not_skip_keyboard_input_confirmation( ) -> None: executor = BrowserExecutor() executor.conversation_id = "conv-input-action-isolation" - executor._remember_confirmed_action_element_id("click", "inp123") monkeypatch.setattr( executor, @@ -492,6 +455,70 @@ def test_confirmed_click_element_id_does_not_skip_keyboard_input_confirmation( assert observation.pending_confirmation["action_type"] == "keyboard_input" +def test_confirmed_click_does_not_grant_next_click_shortcut(monkeypatch) -> None: + executor = BrowserExecutor() + executor.conversation_id = "conv-click-no-shortcut" + executor._set_pending_confirmation( + element_id="btn13", + action_type="click", + full_html="", + extra_data={"tab_id": 456}, + ) + + executed_commands = [] + + def fake_execute(command): + executed_commands.append(command) + return { + "success": True, + "data": {"screenshot": "data:image/png;base64,clicked"}, + } + + monkeypatch.setattr(executor, "_execute_command_sync", fake_execute) + + confirmed = executor._execute_action_sync( + ElementInteractionAction( + action="confirm_click", + conversation_id="conv-click-no-shortcut", + ) + ) + + assert confirmed.success is True + assert confirmed.message == "Confirmed and clicked element: btn13" + assert len(executed_commands) == 1 + + monkeypatch.setattr( + executor, + "_get_element_full_html", + lambda element_id, intended_action=None: ( + "", + "data:image/png;base64,pending-again", + ), + ) + monkeypatch.setattr( + executor, + "_execute_command_sync", + lambda command: (_ for _ in ()).throw(AssertionError("should stay in 2PC")), + ) + + next_observation = executor._execute_action_sync( + ElementInteractionAction( + action="click", + element_id="btn13", + tab_id=456, + conversation_id="conv-click-no-shortcut", + ) + ) + + assert next_observation.success is True + assert ( + next_observation.message + == "Click action pending confirmation for element: btn13" + ) + assert next_observation.pending_confirmation is not None + assert next_observation.pending_confirmation["action_type"] == "click" + + def test_confirm_keyboard_input_reports_nested_extension_error(monkeypatch) -> None: executor = BrowserExecutor() executor.conversation_id = "conv-input-error" diff --git a/server/tests/unit/test_base_classes.py b/server/tests/unit/test_base_classes.py index a07c3c9..ae358f8 100644 --- a/server/tests/unit/test_base_classes.py +++ b/server/tests/unit/test_base_classes.py @@ -81,7 +81,7 @@ def test_browser_state_prefers_tab_id_field_and_attaches_screenshot(self) -> Non assert llm_content[0].image_urls == ["data:image/png;base64,abc123"] assert "**[99]** Example" in llm_content[1].text - def test_highlighted_clickable_elements_are_summarized(self) -> None: + def test_highlighted_clickable_elements_include_html(self) -> None: observation = OpenBrowserObservation( success=True, element_type="clickable", @@ -97,9 +97,8 @@ def test_highlighted_clickable_elements_are_summarized(self) -> None: text = _text_content(observation) - assert "1 clickable element" in text - assert "abc123(clickable):" not in text - assert "" not in text + assert "1 clickable element" not in text + assert "abc123(clickable): " in text def test_highlighted_elements_render_page_metadata(self) -> None: observation = OpenBrowserObservation( @@ -122,7 +121,9 @@ def test_highlighted_elements_render_page_metadata(self) -> None: assert "**Page**: 2/4" in text assert "**Total Elements**: 9" in text - def test_small_model_highlighted_clickable_elements_keep_html(self) -> None: + def test_small_model_highlighted_clickable_elements_still_include_html( + self, + ) -> None: observation = OpenBrowserObservation( success=True, small_model=True, @@ -201,10 +202,11 @@ def test_highlighted_elements_include_detected_type_suffix(self) -> None: text = _text_content(observation) + assert 'vrtbj5(clickable):
' in text assert "q4w08w(inputable):" in text - assert "... and 1 clickable element" in text + assert "clickable element" not in text - def test_small_model_mixed_highlighted_elements_include_clickable_html( + def test_small_model_mixed_highlighted_elements_match_default_rendering( self, ) -> None: observation = OpenBrowserObservation(