diff --git a/AGENTS.md b/AGENTS.md
index 93244e4..0feb22c 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -165,6 +165,13 @@ OpenBrowser uses Jinja2 templates for agent prompts, enabling dynamic content in
- **Clean output**: `trim_blocks=True` and `lstrip_blocks=True` remove extra whitespace
- **Caching**: Templates are cached after first load for performance
+### Model Profile Differences
+- Model profile is resolved from session metadata and exposed to prompt rendering as `model_profile` / `small_model`; see `server/agent/manager.py` and `server/agent/tools/prompt_context.py`
+- Tool prompt variants are split by model profile under `server/agent/prompts/small_model/` and `server/agent/prompts/big_model/`
+- Small-model browser guidance intentionally avoids the `keywords` fallback and relies more heavily on same-mode highlight pagination when dense UI may be split across collision-aware pages
+- Observation rendering also differs by model profile: large models keep clickable highlights compact (`... and N clickable elements`), while small models include clickable element HTML in the LLM-visible observation text for extra semantic grounding
+- The small-model clickable-observation branch is implemented in `server/agent/tools/base.py`; the per-conversation `small_model` flag is attached in `server/agent/tools/browser_executor.py`
+
### Keyword Discipline
- Highlight pagination remains the default discovery flow for controls and dense UI
- After any significant page-state change, restart discovery with `highlight_elements(element_type="any")` before choosing the next element
@@ -206,10 +213,13 @@ Elements are paginated to ensure **no visual overlap** in each screenshot:
- Reason: OpenBrowser intentionally keeps automated tabs in the browser background, and Chrome may heavily throttle hidden-tab timers. A page-side `setTimeout` stability loop can therefore take far longer than its nominal budget and become the main cause of highlight timeouts.
- In practice, the main cause of unstable first-highlight screenshots is often **missing warmup**, not a bad readiness classifier. A background tab may answer lightweight `Runtime.evaluate` probes while still sitting in a partially painted / partially decoded state.
- A screenshot-style warmup is therefore the default precondition for `highlight_elements`. It helps force hidden-tab paint/compositor/image-decode work before interactive-element detection runs.
+- All highlight warmup and highlight screenshot captures now reuse the same screenshot wake-up profile as `tab view` (`TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS`) instead of a weaker highlight-only profile. The goal is consistency: if a screenshot is needed to wake the page, the highlight path should not use a different, less effective capture mode.
+- For navigation-driven default observations such as `tab init`, `tab open`, `tab switch`, `tab refresh`, `tab back`, and `tab forward`, the extension now performs an **internal raw screenshot prime** first, then runs the normal highlight warmup + detection + highlighted screenshot flow. That raw prime screenshot is only for waking the background page and is **not** returned to the agent.
- If `highlight_elements` keeps returning `not_ready` but `tab view` immediately makes the next highlight succeed, treat that as a warmup issue first.
- The extension samples viewport readiness signals once per attempt: document readiness, viewport text/media density, pending images, and loading placeholders such as skeleton/shimmer/spinner indicators.
- Readiness is graded as `ready`, `provisionally_ready`, or `not_ready`.
- If readiness is `not_ready`, the extension performs only a couple of short **background-side** retries before proceeding or returning the latest result.
+- The screenshot-side wake-up itself also runs a bounded pre-capture warmup loop. It touches visible viewport media, samples readiness, and retries only a couple of times when the snapshot still looks `not_ready`.
- After screenshot capture, highlight still runs a **consistency check**. This is a drift detector, not a loading detector: it verifies whether sampled highlighted elements moved or disappeared between detection and screenshot.
- Design rule: prefer snapshot classification plus bounded retries; avoid depending on repeated timers inside the target page for highlight stability.
@@ -321,10 +331,10 @@ OpenBrowser has explicit screenshot control for maximum flexibility:
| Command | Auto-Screenshot | Notes |
|---------|------------------|-------|
-| `tab init` | Yes | Verify page load |
-| `tab open` | Yes | Verify new tab |
-| `tab switch` | Yes | Verify tab switch |
-| `tab refresh` | Yes | Verify refresh result |
+| `tab init` | Yes | Returns default `highlight any page 1`; first does an internal raw screenshot prime to wake the page |
+| `tab open` | Yes | Returns default `highlight any page 1`; first does an internal raw screenshot prime to wake the page |
+| `tab switch` | Yes | Returns default `highlight any page 1`; first does an internal raw screenshot prime to wake the page |
+| `tab refresh` | Yes | Returns default `highlight any page 1`; first does an internal raw screenshot prime to wake the page |
|---------|------------------|-------|
| `highlight_elements` | Yes | Visual overlay for element selection |
| `click_element` | Yes | Verify interaction result |
@@ -367,7 +377,9 @@ Automated testing framework for evaluating AI agent performance on browser autom
```
OpenBrowser/eval/
├── evaluate_browser_agent.py # Main evaluation entry point
-├── dataset/ # YAML test case definitions (9 tests)
+├── dataset/ # YAML test case definitions (12 tests)
+│ ├── bluebook_simple.yaml # BlueBook search and like test
+│ ├── bluebook_complex.yaml # BlueBook multi-image reply test
│ ├── gbr.yaml # GBR search test
│ ├── gbr_detailed.yaml # GBR detailed search test
│ ├── techforum.yaml # TechForum upvote test
@@ -376,10 +388,11 @@ OpenBrowser/eval/
│ ├── cloudstack_interactive.yaml # CloudStack DAS interactive test
│ ├── finviz_simple.yaml # Finviz simple screener test
│ ├── finviz_complex.yaml # Finviz multi-filter test
-│ └── dataflow.yaml # DataFlow visual challenge test
+│ ├── dataflow.yaml # DataFlow visual challenge test
+│ └── northstar_add_bag.yaml # Combined fit-guide and add-to-bag geometry test
├── output/ # Generated results and images
├── server.py # Mock websites server with tracking API
-└── (mock websites: gbr/, techforum/, cloudstack/, dataflow/, finviz/)
+└── (mock websites: gbr/, techforum/, cloudstack/, dataflow/, finviz/, bluebook/, northstar/)
```
### Key Features
@@ -550,17 +563,19 @@ Tests are defined in YAML format with:
| `gbr` | GBR Search Test | easy | 400s (~6.7min) | 0.8 RMB | Search for "fed" related news |
| `finviz_simple` | Finviz Simple Screener Test | easy | 300s (5min) | 0.8 RMB | Filter stocks by market cap over 10 billion |
| `techforum` | TechForum Upvote Test | medium | 300s (5min) | 0.5 RMB | Upvote the first AI-related post |
+| `bluebook_simple` | BlueBook Search And Like Test | medium | 300s (5min) | 0.6 RMB | Search for the target note and like it |
| `gbr_detailed` | GBR Detailed Search & Read Test | medium | 600s (10min) | 1.5 RMB | Search for "fed", click into each article (3 articles), and summarize content |
| `finviz_complex` | Finviz Multi-Filter Screener Test | medium | 400s (~6.7min) | 1.0 RMB | Multi-filter stock screener: market cap, P/E, volume |
| `dataflow` | DataFlow Visual Challenge Test | medium | 300s (5min) | 0.5 RMB | Dashboard interactions: settings, reports, navigation |
+| `northstar_add_bag` | Northstar Fit Guide + Add To Bag Test | medium | 540s (9min) | 1.2 RMB | Save the Care & Wash fit guide section, then choose size M and add the shell to bag |
#### Advanced Tests
| ID | Name | Difficulty | Time Limit | Cost Limit | Description |
|----|------|------------|------------|------------|-------------|
+| `bluebook_complex` | BlueBook Multi-Image Reply Test | hard | 500s (~8.3min) | 1.2 RMB | Search for the OpenClaw note, view all images, and leave a quick comment |
| `cloudstack` | CloudStack DAS Agent Test | hard | 500s (~8.3min) | 1.2 RMB | Find DAS console and greet DAS agent |
| `techforum_reply` | TechForum Comment Reply Test | hard | 500s (~8.3min) | 1.0 RMB | Open comments, find "Graduate Student" comment, reply with paper name |
| `cloudstack_interactive` | CloudStack DAS Interactive Test | very hard | 700s (~11.7min) | 2.0 RMB | Multi-turn conversation with DAS agent: greeting, system status, storage check |
-
#### Event Matching Notes
- **Standard events**: `page_view`, `click`, `input`, `submit`, `hover`, `scroll`, `answer_action`
- **Special event types**:
@@ -586,6 +601,10 @@ Criteria match tracked events using flexible pattern matching:
- Page URLs, input values, custom fields
- Alternative conditions for flexible scoring
+### Deferred Prompt And Observation Follow-Ups
+- Observation design: add structured geometry hints such as `partly_visible`, `near_viewport_edge`, `occluded_by_sticky_ui`, explicit scroll-container identity, and structured stale-element causes before expanding prompt text again.
+- Prompt compaction: after geometry-focused eval results stabilize, reduce duplicated rules between the SDK system prompt and tool prompts so tool templates keep only tool-local contracts and recovery guidance.
+
## NOTES
- **Git dependencies:** `openhands-sdk` and `openhands-tools` from git subdirectories
diff --git a/eval/cloudstack/js/das-agent.js b/eval/cloudstack/js/das-agent.js
index f2a66d4..101a9cc 100644
--- a/eval/cloudstack/js/das-agent.js
+++ b/eval/cloudstack/js/das-agent.js
@@ -42,6 +42,75 @@ document.addEventListener('DOMContentLoaded', function() {
this.style.height = (this.scrollHeight) + 'px';
});
}
+
+    function normalizeMessage(message) { // lowercase, single-spaced, no leading/trailing whitespace
+        return message.toLowerCase().split(/\s+/).filter(Boolean).join(' ');
+    }
+
+    function containsAny(text, keywords) { // a keyword must start on a word boundary, so 'hi' no longer fires inside 'this'
+        return keywords.some(keyword => new RegExp('\\b' + keyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).test(text));
+    }
+
+    function buildAgentReply(message) {
+        // Canned DAS replies: rules are tried top to bottom and the first keyword hit wins.
+        const text = normalizeMessage(message);
+        const replyRules = [
+            {
+                keywords: [
+                    'storage',
+                    'disk',
+                    'space',
+                    'capacity',
+                    'usage',
+                    'utilization',
+                    'volume'
+                ],
+                reply: 'Storage usage check complete: primary cluster is at 68% used, log volume is at 42%, and free capacity is enough for current workload. No immediate storage risk detected.'
+            },
+            {
+                keywords: [
+                    'status',
+                    'system',
+                    'health',
+                    'report',
+                    'running',
+                    'current state',
+                    'how are you'
+                ],
+                reply: 'Current system status is stable. Core database services are online, replication delay is within threshold, and there are no critical incidents at the moment.'
+            },
+            {
+                keywords: ['cpu', 'load'],
+                reply: 'CPU load is moderate right now, averaging around 34% across the main database nodes. No hot node is currently flagged.'
+            },
+            {
+                keywords: ['memory', 'ram'],
+                reply: 'Memory usage is healthy. Working set pressure is low and cache hit rate remains within the expected range.'
+            },
+            {
+                keywords: ['alert', 'warning', 'alarm', 'incident', 'issue'],
+                reply: 'There are no active P1 alerts. I only see a few low-priority optimization suggestions related to slow-query tuning and index review.'
+            },
+            {
+                keywords: [
+                    'hello',
+                    'hi',
+                    'hey',
+                    'greetings',
+                    'good morning',
+                    'good afternoon',
+                    'good evening'
+                ],
+                reply: 'Hello. I am DAS Agent. I can help with system status, storage usage, alerts, and database operations checks.'
+            }
+        ];
+
+        const matched = replyRules.find(rule => containsAny(text, rule.keywords));
+        if (matched) {
+            return matched.reply;
+        }
+        return 'I can help with database operations. You can ask me for current system status, storage usage, performance health, or active alerts.';
+    }
// Send message function
function sendMessage() {
@@ -74,7 +143,7 @@ document.addEventListener('DOMContentLoaded', function() {
// Simulate agent response delay
setTimeout(function() {
- addAgentMessage('Hello, I am DAS Agent');
+ addAgentMessage(buildAgentReply(message));
sendBtn.disabled = false;
sendBtn.textContent = 'Send Message';
}, 800);
diff --git a/eval/dataset/northstar_add_bag.yaml b/eval/dataset/northstar_add_bag.yaml
new file mode 100644
index 0000000..7da7ca2
--- /dev/null
+++ b/eval/dataset/northstar_add_bag.yaml
@@ -0,0 +1,43 @@
+id: northstar_add_bag
+name: "Northstar Fit Guide + Add To Bag Test"
+difficulty: medium
+description: "Open the fit guide, save the Care & Wash section, then reposition the purchase rail, choose size M, and add the shell to bag."
+start_url: "http://localhost:16605/northstar/"
+instruction: "On the Northstar Outfitters Commuter Shell page, open the fit guide, scroll inside it until the Care & Wash section is centered, save the guide from that section, then choose size Medium and add the jacket to your bag."
+time_limit: 540.0
+cost_limit: 1.2
+
+criteria:
+ - type: open_fit_guide
+ description: "Open the fit guide drawer"
+ points: 0.5
+ expected:
+ event_type: fit_guide_open
+ page: "/northstar/"
+ drawer: "fit-guide"
+
+ - type: save_fit_guide
+ description: "Save the fit guide from the Care & Wash section"
+ points: 2.0
+ expected:
+ event_type: fit_guide_save
+ page: "/northstar/"
+ section: "care-wash"
+
+ - type: select_medium_size
+ description: "Select size Medium"
+ points: 1.5
+ expected:
+ event_type: product_size_select
+ page: "/northstar/"
+ productId: "commuter-shell"
+ size: "M"
+
+ - type: add_to_bag
+ description: "Add the selected shell to bag"
+ points: 2.0
+ expected:
+ event_type: product_add_to_bag
+ page: "/northstar/"
+ productId: "commuter-shell"
+ size: "M"
diff --git a/eval/evaluate_browser_agent.py b/eval/evaluate_browser_agent.py
index d23d99c..047743a 100644
--- a/eval/evaluate_browser_agent.py
+++ b/eval/evaluate_browser_agent.py
@@ -7,26 +7,27 @@
"""
import argparse
-import asyncio
+import atexit
import base64
+import datetime
+import fcntl
import json
+import logging
import os
-import sys
-import time
-import yaml
-import requests
-from pathlib import Path
-from typing import Dict, List, Any, Optional, Tuple
-from dataclasses import dataclass
-import subprocess
import shutil
import signal
-import atexit
-import logging
-import datetime
+import sys
import threading
-import fcntl
+import time
+from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
from contextlib import AbstractContextManager
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+from urllib.parse import urlparse
+
+import requests
+import yaml
logger = logging.getLogger(__name__)
@@ -108,6 +109,18 @@ class MessageRunResult:
error: Optional[str] = None
+@dataclass(frozen=True)
+class ScheduledJob:
+    """One (target, test case) pair queued for automated evaluation; immutable once built."""
+
+    target_index: int  # position of `target` in the list of targets being evaluated
+    test_index: int  # position of `test_case` in the list of test cases
+    target: LLMTarget
+    test_case: TestCase
+    model_key: str  # concurrency key derived from the target
+    site_bucket: str  # mock-site bucket inferred from the test's start URL
+
+
class OpenBrowserClient:
"""Client for OpenBrowser server API"""
@@ -563,18 +576,24 @@ def health_check(self) -> bool:
except requests.exceptions.RequestException:
return False
- def clear_events(self) -> bool:
- """Clear all tracked events"""
+ def clear_events(self, site: Optional[str] = None) -> bool:
+ """Clear tracked events, optionally scoped to one mock site."""
try:
- response = self.session.get(f"{self.base_url}/api/events/clear", timeout=2)
+ params = {"site": site} if site else None
+ response = self.session.get(
+ f"{self.base_url}/api/events/clear", params=params, timeout=2
+ )
return response.status_code == 200
except Exception:
return False
- def get_events(self) -> List[Dict[str, Any]]:
- """Get all tracked events"""
+ def get_events(self, site: Optional[str] = None) -> List[Dict[str, Any]]:
+ """Get tracked events, optionally scoped to one mock site."""
try:
- response = self.session.get(f"{self.base_url}/api/events", timeout=5)
+ params = {"site": site} if site else None
+ response = self.session.get(
+ f"{self.base_url}/api/events", params=params, timeout=5
+ )
if response.status_code == 200:
data = response.json()
return data.get("events", [])
@@ -744,6 +763,69 @@ def __init__(self, chrome_uuid: Optional[str] = None):
self.current_model: Optional[str] = None # Current model being tested
self.current_target: Optional[LLMTarget] = None # Current CLI target
+    @staticmethod
+    def _sanitize_model_name(model_name: str) -> str:
+        """Return `model_name` with "/" and ":" swapped for "_" so it can name a directory."""
+        return model_name.translate(str.maketrans("/:", "__"))
+
+    @staticmethod
+    def _get_model_key(target: LLMTarget) -> str:
+        """Pick the key used for per-model concurrency: model name, else alias, else name."""
+        return next(filter(None, (target.model_name, target.alias)), target.name)
+
+    @staticmethod
+    def _get_test_site_bucket(test_case: TestCase) -> str:
+        """Return the first path segment of the test's start URL, or the test id if none."""
+        # A test without a start URL (None or "") has no site; `or ""` keeps urlparse on
+        # the str code path, since urlparse(None) returns bytes fields that break split("/").
+        url_path = urlparse(test_case.start_url or "").path
+        segments = [segment for segment in url_path.split("/") if segment]
+        return segments[0] if segments else test_case.id
+
+    def _ensure_model_output_dir(self, model_name: Optional[str]) -> Path:
+        """Return the directory results for `model_name` go to, creating it when missing."""
+        run_dir = self.output_dir
+        if run_dir is None:
+            # First use: create a timestamped run directory under OUTPUT_BASE_DIR.
+            run_dir = self.output_dir = OUTPUT_BASE_DIR / time.strftime("%Y%m%d_%H%M%S")
+            run_dir.mkdir(parents=True, exist_ok=True)
+            logger.info(f"Created base output directory: {run_dir}")
+        if not model_name:
+            return run_dir
+        # One sub-directory per model, named with filesystem-safe characters.
+        model_dir = run_dir / self._sanitize_model_name(model_name)
+        model_dir.mkdir(exist_ok=True)
+        return model_dir
+
+    def _build_error_result(
+        self, test_case: TestCase, model_name: Optional[str], error: str
+    ) -> TestResult:
+        """Build a failed, zero-score TestResult that records a scheduler/runtime error.
+
+        max_score is still the sum of the criteria points (1 when a criterion has none).
+        """
+        attainable = sum(spec.get("points", 1) for spec in test_case.criteria)
+        # Timing, cost and every derived score are reported as 0.0.
+        zeroed = dict.fromkeys(
+            ("duration", "cost", "efficiency_score", "usage_score", "total_score"), 0.0
+        )
+        return TestResult(
+            test_case=test_case,
+            passed=False,
+            score=0,
+            max_score=attainable,
+            events=[], sse_events=[], track_events=[], images=[],
+            error=error,
+            model=model_name,
+            **zeroed,
+        )
+
+    def _create_worker_evaluator(self) -> "Evaluator":
+        """Build a fresh Evaluator for one job that shares this run's output directory."""
+        clone = Evaluator(chrome_uuid=self.chrome_uuid)
+        clone.output_dir = self.output_dir
+        return clone
+
def resolve_targets(self, targets: List[LLMTarget]) -> List[LLMTarget]:
"""Resolve configured aliases to raw model names."""
llm_configs = self.openbrowser.get_llm_configs()
@@ -856,61 +938,46 @@ def load_test_cases(self) -> List[TestCase]:
return test_cases
- def run_test(self, test_case: TestCase) -> TestResult:
+ def run_test(
+ self,
+ test_case: TestCase,
+ target: Optional[LLMTarget] = None,
+ model_name: Optional[str] = None,
+ ) -> TestResult:
"""Run a single test case"""
- logger.info(f"Running test: {test_case.name}")
+ active_target = target or self.current_target
+ active_model_name = model_name or self.current_model
+ site_bucket = self._get_test_site_bucket(test_case)
- # Ensure output directory exists with model subdirectory
- if self.output_dir is None:
- timestamp = time.strftime("%Y%m%d_%H%M%S")
- self.output_dir = OUTPUT_BASE_DIR / timestamp
- self.output_dir.mkdir(parents=True, exist_ok=True)
- logger.info(f"Created base output directory: {self.output_dir}")
+ logger.info(
+ "Running test: %s [model=%s site=%s]",
+ test_case.name,
+ active_model_name,
+ site_bucket,
+ )
- # Create model-specific subdirectory if model is set
- model_output_dir = self.output_dir
- if self.current_model:
- # Sanitize model name for filesystem
- model_name_safe = self.current_model.replace("/", "_").replace(":", "_")
- model_output_dir = self.output_dir / model_name_safe
- model_output_dir.mkdir(exist_ok=True)
- logger.info(f"Using model output directory: {model_output_dir}")
+ model_output_dir = self._ensure_model_output_dir(active_model_name)
- # Clear previous events
- self.eval_server.clear_events()
+ # Clear only the current mock-site event bucket.
+ self.eval_server.clear_events(site=site_bucket)
# Create new conversation with current model
conversation_id = self.openbrowser.create_conversation(
- model_alias=self.current_target.alias if self.current_target else None,
+ model_alias=active_target.alias if active_target else None,
)
if conversation_id:
logger.debug(f"Created conversation: {conversation_id}")
else:
logger.warning(
- f"Failed to create conversation for model {self.current_model}"
- )
- max_score = sum(
- criterion.get("points", 1) for criterion in test_case.criteria
+ f"Failed to create conversation for model {active_model_name}"
)
- return TestResult(
- test_case=test_case,
- passed=False,
- score=0,
- max_score=max_score,
- events=[],
- sse_events=[],
- track_events=[],
- images=[],
- error=(
- f"Failed to create conversation for target {self.current_model}. "
+ return self._build_error_result(
+ test_case,
+ active_model_name,
+ (
+ f"Failed to create conversation for target {active_model_name}. "
"See logs for server response details."
),
- duration=0.0,
- cost=0.0,
- efficiency_score=0.0,
- usage_score=0.0,
- total_score=0.0,
- model=self.current_model,
)
start_time = time.time()
@@ -979,7 +1046,7 @@ def run_test(self, test_case: TestCase) -> TestResult:
time.sleep(min(pending_event_wait, max(0.0, deadline - time.time())))
# Get tracking events
- track_events = self.eval_server.get_events()
+ track_events = self.eval_server.get_events(site=site_bucket)
# Save track events to file
track_events_file = self._save_track_events(
@@ -1033,7 +1100,7 @@ def run_test(self, test_case: TestCase) -> TestResult:
total_score=total_score,
sse_events_file=sse_events_file,
track_events_file=track_events_file,
- model=self.current_model,
+ model=active_model_name,
)
finally:
self._cleanup_openbrowser_conversation(conversation_id)
@@ -1354,7 +1421,7 @@ def _check_criterion(
logger.debug(f"Criterion matched by SSE event: {event.get('type')}")
return True
- logger.debug(f"Criterion not met")
+ logger.debug("Criterion not met")
return False
def _check_count_min_condition(
@@ -1638,6 +1705,7 @@ def generate_report(self):
def run_manual_test(self, test_case: TestCase) -> TestResult:
"""Run a test case in manual mode with human performing the same task as OpenBrowser"""
logger.info(f"Running manual test: {test_case.name}")
+ site_bucket = self._get_test_site_bucket(test_case)
# Ensure output directory exists
if self.output_dir is None:
@@ -1646,8 +1714,8 @@ def run_manual_test(self, test_case: TestCase) -> TestResult:
self.output_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"Created output directory: {self.output_dir}")
- # Clear previous events
- self.eval_server.clear_events()
+ # Clear previous events for the current mock site only.
+ self.eval_server.clear_events(site=site_bucket)
# Print test information
print("\n" + "=" * 60)
@@ -1656,14 +1724,14 @@ def run_manual_test(self, test_case: TestCase) -> TestResult:
print("=" * 60)
if test_case.start_url:
- print(f"\n📋 Please open your browser and navigate to:")
+ print("\n📋 Please open your browser and navigate to:")
print(f" {test_case.start_url}")
print("Make sure the eval server is running (localhost:16605).")
print("The browser should load the test page.")
input("\nPress Enter when ready to continue...")
# Show the SAME instruction that would be given to OpenBrowser
- print(f"\n📝 Task Instruction (same as given to OpenBrowser):")
+ print("\n📝 Task Instruction (same as given to OpenBrowser):")
print(f" {test_case.instruction}")
print(
"\nPerform this task in the browser. Events will be tracked from this moment."
@@ -1692,7 +1760,7 @@ def run_manual_test(self, test_case: TestCase) -> TestResult:
time.sleep(2)
# Get tracking events
- track_events = self.eval_server.get_events()
+ track_events = self.eval_server.get_events(site=site_bucket)
# Save track events to file (no conversation_id for manual mode, use "manual")
track_events_file = self._save_track_events(
@@ -1738,7 +1806,7 @@ def run_manual_test(self, test_case: TestCase) -> TestResult:
# Print completion message
print(f"\n{'=' * 60}")
- print(f"Manual test completed!")
+ print("Manual test completed!")
print(f"Duration: {duration:.1f}s")
print(f"Track events recorded: {len(track_events)}")
print(f"Task score: {score:.1f}/{max_score:.1f}")
@@ -1751,19 +1819,159 @@ def run_manual_test(self, test_case: TestCase) -> TestResult:
return result
+    def _build_scheduled_jobs(
+        self, test_cases: List[TestCase], targets: List[LLMTarget]
+    ) -> Dict[int, List[ScheduledJob]]:
+        """Expand targets x test cases into per-target job queues keyed by target index."""
+        queues: Dict[int, List[ScheduledJob]] = {index: [] for index in range(len(targets))}
+        for target_index, target in enumerate(targets):
+            model_key = self._get_model_key(target)  # same key for every job of a target
+            for test_index, test_case in enumerate(test_cases):
+                queues[target_index].append(
+                    ScheduledJob(
+                        target_index=target_index,
+                        test_index=test_index,
+                        target=target,
+                        test_case=test_case,
+                        model_key=model_key,
+                        site_bucket=self._get_test_site_bucket(test_case),
+                    )
+                )
+        return queues
+
+    def _execute_scheduled_job(self, job: ScheduledJob) -> TestResult:
+        """Run one scheduled job in its own worker evaluator and always return a result.
+
+        Any Exception, including one raised while building the worker, is logged and
+        turned into a failed TestResult instead of escaping through future.result().
+        """
+        try:
+            worker = self._create_worker_evaluator()
+            return worker.run_test(
+                job.test_case, target=job.target, model_name=job.model_key
+            )
+        except Exception as e:
+            logger.exception(
+                "Scheduled job failed: model=%s test=%s site=%s",
+                job.model_key,
+                job.test_case.id,
+                job.site_bucket,
+            )
+            return self._build_error_result(
+                job.test_case, job.model_key, f"Unhandled scheduler worker error: {e}"
+            )
+
+    def _run_scheduled_jobs(
+        self,
+        test_cases: List[TestCase],
+        targets: List[LLMTarget],
+        parallel: int,
+        single_model_parallel: int,
+    ) -> Dict[int, List[TestResult]]:
+        """Run all jobs on a thread pool under global, per-model and one-per-site limits."""
+        max_parallel = max(1, parallel)
+        per_model_limit = max(1, single_model_parallel)
+        jobs_by_target = self._build_scheduled_jobs(test_cases, targets)
+        results_by_target: Dict[int, List[Optional[TestResult]]] = {
+            target_index: [None] * len(test_cases)
+            for target_index in range(len(targets))
+        }
+
+        logger.info(
+            "Scheduler limits: parallel=%s, single_model_parallel=%s",
+            max_parallel,
+            per_model_limit,
+        )
+        # Scheduler state: running-job count per model key, sites in use, future -> job.
+        running_by_model: Dict[str, int] = {}
+        busy_sites: set[str] = set()
+        in_flight: Dict[Any, ScheduledJob] = {}
+        target_order = list(range(len(targets)))
+
+        with ThreadPoolExecutor(max_workers=max_parallel) as executor:
+            while True:
+                pending_jobs = any(jobs for jobs in jobs_by_target.values())
+                # Fill free worker slots until the global limit is hit or nothing is eligible.
+                while len(in_flight) < max_parallel:
+                    scheduled_job: Optional[ScheduledJob] = None
+                    # Scan targets in order for the first job whose model and site are free.
+                    for target_index in target_order:
+                        pending_for_target = jobs_by_target[target_index]
+                        if not pending_for_target:
+                            continue
+                        # Every job of a target shares one model key, so the head job's key is enough.
+                        model_key = pending_for_target[0].model_key
+                        if running_by_model.get(model_key, 0) >= per_model_limit:
+                            continue
+                        # Take the earliest pending job whose mock site no running job is using.
+                        for job_index, job in enumerate(pending_for_target):
+                            if job.site_bucket in busy_sites:
+                                continue
+                            scheduled_job = pending_for_target.pop(job_index)
+                            break
+
+                        if scheduled_job is not None:
+                            break
+
+                    if scheduled_job is None:
+                        break
+                    # Submit the job, then record its model slot and site as occupied.
+                    future = executor.submit(self._execute_scheduled_job, scheduled_job)
+                    in_flight[future] = scheduled_job
+                    running_by_model[scheduled_job.model_key] = (
+                        running_by_model.get(scheduled_job.model_key, 0) + 1
+                    )
+                    busy_sites.add(scheduled_job.site_bucket)
+
+                    logger.info(
+                        "Scheduled test '%s' for model '%s' on site '%s'",
+                        scheduled_job.test_case.id,
+                        scheduled_job.model_key,
+                        scheduled_job.site_bucket,
+                    )
+
+                if not in_flight and not pending_jobs:
+                    break
+                # Nothing is running although jobs were pending before the fill: stop.
+                if not in_flight:
+                    logger.warning("Pending jobs remain but none could be scheduled")
+                    break
+                # Block until at least one job finishes, then release its model slot and site.
+                done, _ = wait(set(in_flight.keys()), return_when=FIRST_COMPLETED)
+                for future in done:
+                    job = in_flight.pop(future)
+                    running_by_model[job.model_key] -= 1
+                    if running_by_model[job.model_key] <= 0:
+                        del running_by_model[job.model_key]
+                    busy_sites.discard(job.site_bucket)
+                    # Store the result at its (target, test) position, keeping test order per target.
+                    result = future.result()
+                    results_by_target[job.target_index][job.test_index] = result
+
+                    status = "PASSED" if result.passed else "FAILED"
+                    logger.info(
+                        "Completed test '%s' for model '%s': %s %.1f/%.1f",
+                        job.test_case.name,
+                        job.model_key,
+                        status,
+                        result.score,
+                        result.max_score,
+                    )
+        # Drop slots that never received a result.
+        return {
+            target_index: [result for result in target_results if result is not None]
+            for target_index, target_results in results_by_target.items()
+        }
+
def run_all(
self,
targets: Optional[List[LLMTarget]] = None,
skip_services: bool = False,
manual: bool = False,
+ parallel: int = 1,
+ single_model_parallel: int = 1,
):
- """Run all test cases for specified LLM targets.
-
- Args:
- targets: Explicit LLM targets to test.
- skip_services: If True, skip service availability checks
- manual: If True, only check eval server (manual mode doesn't need OpenBrowser)
- """
+ """Run all test cases for specified LLM targets."""
if not self.ensure_services(skip_services=skip_services, manual=manual):
logger.error("Cannot run tests: services unavailable")
return False
@@ -1772,7 +1980,6 @@ def run_all(
logger.error("No model aliases provided")
return False
- # Create timestamped output directory
timestamp = time.strftime("%Y%m%d_%H%M%S")
self.output_dir = OUTPUT_BASE_DIR / timestamp
self.output_dir.mkdir(parents=True, exist_ok=True)
@@ -1783,51 +1990,35 @@ def run_all(
logger.warning("No test cases found")
return False
- # Store overall results for summary report
- all_results = []
+ scheduled_results = self._run_scheduled_jobs(
+ test_cases=test_cases,
+ targets=targets,
+ parallel=parallel,
+ single_model_parallel=single_model_parallel,
+ )
- target_names = [target.name for target in targets]
+ all_results: List[TestResult] = []
+ target_names = [self._get_model_key(target) for target in targets]
- for target in targets:
+ for target_index, target in enumerate(targets):
logger.info(f"\n{'=' * 60}")
logger.info(
- f"Testing target alias: {target.alias} -> model: {target.model_name}"
+ "Finished target alias: %s -> model: %s",
+ target.alias,
+ target.model_name,
)
logger.info(f"{'=' * 60}")
- self.current_target = target
- self.current_model = target.model_name or target.name
-
- # Clear results for this model
- self.results = []
-
- # Run all test cases for this model
- for test_case in test_cases:
- result = self.run_test(test_case)
- self.results.append(result)
-
- status = "PASSED" if result.passed else "FAILED"
- logger.info(
- f"Test '{test_case.name}' {status}: {result.score:.1f}/{result.max_score:.1f}"
- )
-
- # Generate report for this model
+ self.results = scheduled_results.get(target_index, [])
if self.results:
model_report_path = self.generate_report()
logger.info(f"Model report saved to: {model_report_path}")
-
- # Add model information to results and store for summary
- for result in self.results:
- result.model = target.model_name or target.name
all_results.extend(self.results)
- # Generate cross-model summary report if we tested multiple models
if len(targets) > 1 and all_results:
self._generate_cross_model_summary(all_results, target_names)
- # Restore results for backward compatibility
self.results = all_results
-
return True
def run_all_manual(self, skip_services: bool = False) -> bool:
@@ -1852,9 +2043,9 @@ def run_all_manual(self, skip_services: bool = False) -> bool:
return False
print(f"\n{'=' * 60}")
- print(f"MANUAL ALL-TESTS MODE")
+ print("MANUAL ALL-TESTS MODE")
print(f"Found {len(test_cases)} test cases to complete")
- print(f"Each test will start when you confirm ready after seeing start URL")
+ print("Each test will start when you confirm ready after seeing start URL")
print(f"{'=' * 60}")
# Store overall results for summary report
@@ -1945,7 +2136,7 @@ def _generate_manual_summary(self, all_results: List[TestResult]):
# Print final summary
print(f"\n{'=' * 60}")
- print(f"MANUAL TESTING COMPLETE")
+ print("MANUAL TESTING COMPLETE")
print(f"{'=' * 60}")
print(f"Total tests: {total_tests}")
print(f"Passed tests: {passed_tests} ({pass_rate:.1f}%)")
@@ -1956,7 +2147,7 @@ def _generate_manual_summary(self, all_results: List[TestResult]):
print(f"{'=' * 60}")
# Print per-test summary table
- print(f"\nTest Results Summary:")
+ print("\nTest Results Summary:")
print(
f"{'Test Name':40} {'Status':10} {'Task Score':12} {'Efficiency':12} {'Duration':10}"
)
@@ -2269,6 +2460,18 @@ def main():
action="append",
help="Configured LLM alias to evaluate. Can be passed multiple times.",
)
+ parser.add_argument(
+ "--parallel",
+ type=int,
+ default=1,
+ help="Maximum number of automated test jobs running at once.",
+ )
+ parser.add_argument(
+ "--single-model-parallel",
+ type=int,
+ default=1,
+ help="Maximum concurrent test jobs allowed for the same resolved model.",
+ )
parser.add_argument(
"--no-services", action="store_true", help="Don't start services"
)
@@ -2316,6 +2519,12 @@ def main():
"(or set OPENBROWSER_CHROME_UUID)"
)
+ if args.parallel < 1:
+ parser.error("--parallel must be >= 1")
+
+ if args.single_model_parallel < 1:
+ parser.error("--single-model-parallel must be >= 1")
+
evaluator = Evaluator(chrome_uuid=args.chrome_uuid)
# Register cleanup
@@ -2371,9 +2580,9 @@ def main():
if args.manual:
logger.info(f"Running manual test: {test_case.name}")
print(f"\n{'=' * 60}")
- print(f"MANUAL MODE ENABLED")
+ print("MANUAL MODE ENABLED")
print(f"Test: {test_case.name}")
- print(f"Model selection ignored (manual human test)")
+ print("Model selection ignored (manual human test)")
print(f"{'=' * 60}")
result = evaluator.run_manual_test(test_case)
@@ -2405,21 +2614,22 @@ def main():
# Normal (automated) mode
else:
all_results = []
+ scheduled_results = evaluator._run_scheduled_jobs(
+ test_cases=[test_case],
+ targets=llm_targets,
+ parallel=args.parallel,
+ single_model_parallel=args.single_model_parallel,
+ )
target_names = [
- target.model_name or target.name for target in llm_targets
+ evaluator._get_model_key(target) for target in llm_targets
]
- for target in llm_targets:
- logger.info(f"\n{'=' * 60}")
- logger.info(
- f"Testing target alias: {target.alias} -> model: {target.model_name}"
- )
- logger.info(f"{'=' * 60}")
- evaluator.current_target = target
- evaluator.current_model = target.model_name or target.name
+ for target_index, target in enumerate(llm_targets):
+ target_results = scheduled_results.get(target_index, [])
+ if not target_results:
+ continue
- result = evaluator.run_test(test_case)
- result.model = target.model_name or target.name
+ result = target_results[0]
all_results.append(result)
print(
@@ -2430,7 +2640,6 @@ def main():
print(f" Task score: {result.score:.1f}/{result.max_score:.1f}")
print(f" Efficiency score: {result.efficiency_score or 0:.2f}/1.0")
print(f" Usage score: {result.usage_score or 0:.2f}/1.0")
- # Calculate composite score for this test
passed_float = 1.0 if result.passed else 0.0
eff_score = result.efficiency_score or 0.0
usage_score_val = result.usage_score or 0.0
@@ -2473,10 +2682,10 @@ def main():
else:
# Run all tests for all models (manual mode now supported)
if args.manual:
- logger.info(f"Running all tests in MANUAL mode")
+ logger.info("Running all tests in MANUAL mode")
print(f"\n{'=' * 60}")
- print(f"ALL TESTS MANUAL MODE")
- print(f"Model selection ignored (manual human test)")
+ print("ALL TESTS MANUAL MODE")
+ print("Model selection ignored (manual human test)")
print(f"{'=' * 60}")
success = evaluator.run_all_manual(skip_services=args.no_services)
@@ -2491,7 +2700,11 @@ def main():
sys.exit(1)
success = evaluator.run_all(
- targets=llm_targets, skip_services=args.no_services, manual=False
+ targets=llm_targets,
+ skip_services=args.no_services,
+ manual=False,
+ parallel=args.parallel,
+ single_model_parallel=args.single_model_parallel,
)
if not success:
sys.exit(1)
diff --git a/eval/evaluation_report.json b/eval/evaluation_report.json
index 3f53998..345d6ae 100644
--- a/eval/evaluation_report.json
+++ b/eval/evaluation_report.json
@@ -1,11 +1,11 @@
{
"evaluation": {
- "timestamp": "2026-03-28 21:09:34",
- "unix_timestamp": 1774703374.492846,
+ "timestamp": "2026-03-30 11:17:06",
+ "unix_timestamp": 1774840626.757355,
"summary": {
- "total_tests": 22,
- "passed_tests": 21,
- "pass_rate": 95.45,
+ "total_tests": 24,
+ "passed_tests": 24,
+ "pass_rate": 100.0,
"models_tested": [
"dashscope/qwen3.5-flash",
"dashscope/qwen3.5-plus"
@@ -13,28 +13,28 @@
},
"model_performance": {
"dashscope/qwen3.5-flash": {
- "pass_rate": 90.91,
- "task_score": 59.0,
- "task_max_score": 62.5,
- "efficiency_score": 7.0914,
- "usage_score": 8.6369,
- "composite_score": 0.8314,
- "avg_duration": 165.81,
- "avg_cost": 0.2005,
- "passed_count": 10,
- "total_tests": 11
+ "pass_rate": 100.0,
+ "task_score": 68.5,
+ "task_max_score": 68.5,
+ "efficiency_score": 9.0653,
+ "usage_score": 11.0829,
+ "composite_score": 0.9358,
+ "avg_duration": 114.89,
+ "avg_cost": 0.075442,
+ "passed_count": 12,
+ "total_tests": 12
},
"dashscope/qwen3.5-plus": {
"pass_rate": 100.0,
- "task_score": 62.5,
- "task_max_score": 62.5,
- "efficiency_score": 7.638,
- "usage_score": 4.4691,
- "composite_score": 0.8201,
- "avg_duration": 144.49,
- "avg_cost": 0.620445,
- "passed_count": 11,
- "total_tests": 11
+ "task_score": 67.5,
+ "task_max_score": 68.5,
+ "efficiency_score": 8.2484,
+ "usage_score": 8.3979,
+ "composite_score": 0.8774,
+ "avg_duration": 149.63,
+ "avg_cost": 0.291952,
+ "passed_count": 12,
+ "total_tests": 12
}
},
"test_results": {
@@ -45,23 +45,23 @@
"passed": true,
"task_score": 6.0,
"task_max_score": 6.0,
- "efficiency_score": 0.656,
- "usage_score": 0.7975,
- "composite_score": 0.8907,
- "total_score": 7.45,
- "duration": 103.21,
- "cost": 0.121497
+ "efficiency_score": 0.6992,
+ "usage_score": 0.8882,
+ "composite_score": 0.9175,
+ "total_score": 7.59,
+ "duration": 90.23,
+ "cost": 0.067092
},
"dashscope/qwen3.5-plus": {
"passed": true,
"task_score": 6.0,
"task_max_score": 6.0,
- "efficiency_score": 0.6513,
- "usage_score": 0.2662,
- "composite_score": 0.7835,
- "total_score": 6.92,
- "duration": 104.6,
- "cost": 0.440265
+ "efficiency_score": 0.6702,
+ "usage_score": 0.6682,
+ "composite_score": 0.8677,
+ "total_score": 7.34,
+ "duration": 98.94,
+ "cost": 0.199068
}
}
},
@@ -72,23 +72,23 @@
"passed": true,
"task_score": 3,
"task_max_score": 3,
- "efficiency_score": 0.817,
- "usage_score": 0.9464,
- "composite_score": 0.9527,
- "total_score": 4.76,
- "duration": 54.91,
- "cost": 0.042865
+ "efficiency_score": 0.7026,
+ "usage_score": 0.9286,
+ "composite_score": 0.9263,
+ "total_score": 4.63,
+ "duration": 89.21,
+ "cost": 0.05708
},
"dashscope/qwen3.5-plus": {
"passed": true,
"task_score": 3,
"task_max_score": 3,
- "efficiency_score": 0.7766,
- "usage_score": 0.777,
- "composite_score": 0.9107,
- "total_score": 4.55,
- "duration": 67.03,
- "cost": 0.178378
+ "efficiency_score": 0.7458,
+ "usage_score": 0.8466,
+ "composite_score": 0.9185,
+ "total_score": 4.59,
+ "duration": 76.26,
+ "cost": 0.122742
}
}
},
@@ -97,25 +97,25 @@
"results_by_model": {
"dashscope/qwen3.5-flash": {
"passed": true,
- "task_score": 7.5,
+ "task_score": 9.0,
"task_max_score": 9.0,
- "efficiency_score": 0.7165,
- "usage_score": 0.873,
- "composite_score": 0.9179,
- "total_score": 9.09,
- "duration": 198.43,
- "cost": 0.254081
+ "efficiency_score": 0.7799,
+ "usage_score": 0.9379,
+ "composite_score": 0.9436,
+ "total_score": 10.72,
+ "duration": 154.09,
+ "cost": 0.124176
},
"dashscope/qwen3.5-plus": {
"passed": true,
"task_score": 9.0,
"task_max_score": 9.0,
- "efficiency_score": 0.6635,
- "usage_score": 0.3651,
- "composite_score": 0.8057,
- "total_score": 10.03,
- "duration": 235.55,
- "cost": 1.269709
+ "efficiency_score": 0.592,
+ "usage_score": 0.6905,
+ "composite_score": 0.8565,
+ "total_score": 10.28,
+ "duration": 285.6,
+ "cost": 0.619067
}
}
},
@@ -123,26 +123,26 @@
"name": "GBR Search Test",
"results_by_model": {
"dashscope/qwen3.5-flash": {
- "passed": false,
- "task_score": 0.5,
+ "passed": true,
+ "task_score": 2.5,
"task_max_score": 2.5,
- "efficiency_score": 0.0806,
- "usage_score": 0.4183,
- "composite_score": 0.0998,
- "total_score": 1.0,
- "duration": 367.76,
- "cost": 0.465362
+ "efficiency_score": 0.8086,
+ "usage_score": 0.9414,
+ "composite_score": 0.95,
+ "total_score": 4.25,
+ "duration": 76.54,
+ "cost": 0.046907
},
"dashscope/qwen3.5-plus": {
"passed": true,
"task_score": 2.5,
"task_max_score": 2.5,
- "efficiency_score": 0.7707,
- "usage_score": 0.5913,
- "composite_score": 0.8724,
- "total_score": 3.86,
- "duration": 91.71,
- "cost": 0.326986
+ "efficiency_score": 0.7862,
+ "usage_score": 0.7942,
+ "composite_score": 0.9161,
+ "total_score": 4.08,
+ "duration": 85.52,
+ "cost": 0.164629
}
}
},
@@ -153,23 +153,23 @@
"passed": true,
"task_score": 9.5,
"task_max_score": 9.5,
- "efficiency_score": 0.2831,
- "usage_score": 0.5356,
- "composite_score": 0.7637,
- "total_score": 10.32,
- "duration": 358.43,
- "cost": 0.464415
+ "efficiency_score": 0.7537,
+ "usage_score": 0.9936,
+ "composite_score": 0.9495,
+ "total_score": 11.25,
+ "duration": 123.16,
+ "cost": 0.006363
},
"dashscope/qwen3.5-plus": {
"passed": true,
- "task_score": 9.5,
+ "task_score": 8.5,
"task_max_score": 9.5,
- "efficiency_score": 0.6498,
- "usage_score": 0.0922,
- "composite_score": 0.7484,
- "total_score": 10.24,
- "duration": 175.1,
- "cost": 0.907785
+ "efficiency_score": 0.7346,
+ "usage_score": 0.7258,
+ "composite_score": 0.8921,
+ "total_score": 9.96,
+ "duration": 132.72,
+ "cost": 0.274247
}
}
},
@@ -180,23 +180,23 @@
"passed": true,
"task_score": 12.0,
"task_max_score": 12.0,
- "efficiency_score": 0.7024,
- "usage_score": 0.8476,
- "composite_score": 0.91,
- "total_score": 13.55,
- "duration": 148.82,
- "cost": 0.182839
+ "efficiency_score": 0.7297,
+ "usage_score": 0.9161,
+ "composite_score": 0.9292,
+ "total_score": 13.65,
+ "duration": 135.13,
+ "cost": 0.100653
},
"dashscope/qwen3.5-plus": {
"passed": true,
"task_score": 12.0,
"task_max_score": 12.0,
- "efficiency_score": 0.6839,
- "usage_score": 0.3953,
- "composite_score": 0.8158,
- "total_score": 13.08,
- "duration": 158.07,
- "cost": 0.725694
+ "efficiency_score": 0.6813,
+ "usage_score": 0.7499,
+ "composite_score": 0.8862,
+ "total_score": 13.43,
+ "duration": 159.34,
+ "cost": 0.300169
}
}
},
@@ -207,23 +207,23 @@
"passed": true,
"task_score": 2,
"task_max_score": 2,
- "efficiency_score": 0.9001,
- "usage_score": 0.9546,
- "composite_score": 0.9709,
- "total_score": 3.85,
- "duration": 29.97,
- "cost": 0.022705
+ "efficiency_score": 0.8776,
+ "usage_score": 0.9646,
+ "composite_score": 0.9684,
+ "total_score": 3.84,
+ "duration": 36.73,
+ "cost": 0.0177
},
"dashscope/qwen3.5-plus": {
"passed": true,
"task_score": 2,
"task_max_score": 2,
- "efficiency_score": 0.8533,
- "usage_score": 0.7708,
- "composite_score": 0.9248,
- "total_score": 3.62,
- "duration": 44.02,
- "cost": 0.114576
+ "efficiency_score": 0.7867,
+ "usage_score": 0.8596,
+ "composite_score": 0.9293,
+ "total_score": 3.65,
+ "duration": 63.98,
+ "cost": 0.070214
}
}
},
@@ -234,23 +234,23 @@
"passed": true,
"task_score": 5.0,
"task_max_score": 5.0,
- "efficiency_score": 0.7334,
- "usage_score": 0.8726,
- "composite_score": 0.9212,
- "total_score": 6.61,
- "duration": 106.62,
- "cost": 0.127404
+ "efficiency_score": 0.6489,
+ "usage_score": 0.902,
+ "composite_score": 0.9102,
+ "total_score": 6.55,
+ "duration": 140.44,
+ "cost": 0.09796
},
"dashscope/qwen3.5-plus": {
"passed": true,
"task_score": 5.0,
"task_max_score": 5.0,
- "efficiency_score": 0.5376,
- "usage_score": 0.2214,
- "composite_score": 0.7518,
- "total_score": 5.76,
- "duration": 184.97,
- "cost": 0.778617
+ "efficiency_score": 0.5245,
+ "usage_score": 0.6334,
+ "composite_score": 0.8316,
+ "total_score": 6.16,
+ "duration": 190.18,
+ "cost": 0.366565
}
}
},
@@ -261,23 +261,23 @@
"passed": true,
"task_score": 3.5,
"task_max_score": 3.5,
- "efficiency_score": 0.7705,
- "usage_score": 0.8949,
- "composite_score": 0.9331,
- "total_score": 5.17,
- "duration": 114.74,
- "cost": 0.126168
+ "efficiency_score": 0.8244,
+ "usage_score": 0.9518,
+ "composite_score": 0.9552,
+ "total_score": 5.28,
+ "duration": 87.78,
+ "cost": 0.057875
},
"dashscope/qwen3.5-plus": {
"passed": true,
"task_score": 3.5,
"task_max_score": 3.5,
- "efficiency_score": 0.592,
- "usage_score": 0.5048,
- "composite_score": 0.8194,
- "total_score": 4.6,
- "duration": 203.98,
- "cost": 0.594295
+ "efficiency_score": 0.5207,
+ "usage_score": 0.6071,
+ "composite_score": 0.8256,
+ "total_score": 4.63,
+ "duration": 239.64,
+ "cost": 0.471474
}
}
},
@@ -288,23 +288,23 @@
"passed": true,
"task_score": 3,
"task_max_score": 3,
- "efficiency_score": 0.7678,
- "usage_score": 0.6428,
- "composite_score": 0.8821,
- "total_score": 4.41,
- "duration": 139.31,
- "cost": 0.178579
+ "efficiency_score": 0.7973,
+ "usage_score": 0.8323,
+ "composite_score": 0.9259,
+ "total_score": 4.63,
+ "duration": 121.6,
+ "cost": 0.08386
},
"dashscope/qwen3.5-plus": {
"passed": true,
"task_score": 3,
"task_max_score": 3,
- "efficiency_score": 0.7283,
- "usage_score": 0,
- "composite_score": 0.7457,
- "total_score": 3.73,
- "duration": 163.02,
- "cost": 0.716118
+ "efficiency_score": 0.6867,
+ "usage_score": 0.2032,
+ "composite_score": 0.778,
+ "total_score": 3.89,
+ "duration": 187.96,
+ "cost": 0.398401
}
}
},
@@ -315,26 +315,53 @@
"passed": true,
"task_score": 7.0,
"task_max_score": 7.0,
- "efficiency_score": 0.6639,
- "usage_score": 0.8536,
- "composite_score": 0.9035,
- "total_score": 8.52,
- "duration": 201.65,
- "cost": 0.219589
+ "efficiency_score": 0.6144,
+ "usage_score": 0.8758,
+ "composite_score": 0.8981,
+ "total_score": 8.49,
+ "duration": 231.34,
+ "cost": 0.186276
},
"dashscope/qwen3.5-plus": {
"passed": true,
"task_score": 7.0,
"task_max_score": 7.0,
- "efficiency_score": 0.7311,
- "usage_score": 0.485,
- "composite_score": 0.8432,
- "total_score": 8.22,
- "duration": 161.35,
- "cost": 0.772474
+ "efficiency_score": 0.7343,
+ "usage_score": 0.7991,
+ "composite_score": 0.9067,
+ "total_score": 8.53,
+ "duration": 159.39,
+ "cost": 0.301292
+ }
+ }
+ },
+ "northstar_add_bag": {
+ "name": "Northstar Fit Guide + Add To Bag Test",
+ "results_by_model": {
+ "dashscope/qwen3.5-flash": {
+ "passed": true,
+ "task_score": 6.0,
+ "task_max_score": 6.0,
+ "efficiency_score": 0.8288,
+ "usage_score": 0.9505,
+ "composite_score": 0.9559,
+ "total_score": 7.78,
+ "duration": 92.44,
+ "cost": 0.059363
+ },
+ "dashscope/qwen3.5-plus": {
+ "passed": true,
+ "task_score": 6.0,
+ "task_max_score": 6.0,
+ "efficiency_score": 0.7852,
+ "usage_score": 0.8204,
+ "composite_score": 0.9211,
+ "total_score": 7.61,
+ "duration": 116.0,
+ "cost": 0.215559
}
}
}
}
}
-}
+}
\ No newline at end of file
diff --git a/eval/northstar/css/northstar.css b/eval/northstar/css/northstar.css
new file mode 100644
index 0000000..b878956
--- /dev/null
+++ b/eval/northstar/css/northstar.css
@@ -0,0 +1,554 @@
+:root {
+ --bg: #f3f0ea;
+ --surface: #fcfbf8;
+ --ink: #1d1a17;
+ --muted: #6d655d;
+ --line: rgba(29, 26, 23, 0.12);
+ --accent: #2747d8;
+ --accent-soft: rgba(39, 71, 216, 0.12);
+ --success: #1c7a4e;
+ --warning: #d36b25;
+ --shadow: 0 28px 60px rgba(37, 27, 18, 0.12);
+ font-family: "Avenir Next", "Segoe UI", sans-serif;
+}
+
+* {
+ box-sizing: border-box;
+}
+
+body {
+ margin: 0;
+ background: linear-gradient(180deg, #f7f4ee 0%, #efe8dd 100%);
+ color: var(--ink);
+}
+
+button,
+input {
+ font: inherit;
+}
+
+.announcement-bar {
+ position: sticky;
+ top: 0;
+ z-index: 30;
+ background: #1b1f2c;
+ color: #f6f7fb;
+ text-align: center;
+ padding: 10px 16px;
+ font-size: 13px;
+ letter-spacing: 0.04em;
+ text-transform: uppercase;
+}
+
+.site-header {
+ position: sticky;
+ top: 42px;
+ z-index: 20;
+ display: grid;
+ grid-template-columns: auto 1fr auto;
+ gap: 28px;
+ align-items: center;
+ padding: 18px 32px;
+ background: rgba(252, 251, 248, 0.92);
+ border-bottom: 1px solid var(--line);
+ backdrop-filter: blur(12px);
+}
+
+.brand {
+ display: flex;
+ gap: 14px;
+ align-items: center;
+}
+
+.brand-mark {
+ width: 42px;
+ height: 42px;
+ border-radius: 14px;
+ display: grid;
+ place-items: center;
+ background: linear-gradient(135deg, #2f416f, #111827);
+ color: #f7fbff;
+ font-weight: 700;
+ font-size: 18px;
+}
+
+.brand-name {
+ font-size: 17px;
+ font-weight: 700;
+}
+
+.brand-tag {
+ font-size: 12px;
+ color: var(--muted);
+}
+
+.site-nav {
+ display: flex;
+ gap: 24px;
+ justify-content: center;
+}
+
+.site-link {
+ color: var(--ink);
+ text-decoration: none;
+ font-size: 14px;
+}
+
+.header-actions {
+ display: flex;
+ gap: 12px;
+ align-items: center;
+}
+
+.search-shell input {
+ width: 280px;
+ border: 1px solid var(--line);
+ border-radius: 999px;
+ padding: 11px 16px;
+ background: #fff;
+}
+
+.header-btn {
+ border: 1px solid var(--line);
+ border-radius: 999px;
+ background: #fff;
+ padding: 11px 16px;
+ color: var(--ink);
+}
+
+.page-shell {
+ max-width: 1300px;
+ margin: 0 auto;
+ padding: 28px 32px 220px;
+}
+
+.breadcrumbs {
+ display: flex;
+ gap: 10px;
+ align-items: center;
+ font-size: 13px;
+ color: var(--muted);
+ margin-bottom: 24px;
+}
+
+.crumb-sep {
+ opacity: 0.55;
+}
+
+.product-layout {
+ display: grid;
+ grid-template-columns: minmax(0, 1.1fr) minmax(360px, 420px);
+ gap: 28px;
+ align-items: start;
+}
+
+.gallery-column,
+.detail-column {
+ display: grid;
+ gap: 24px;
+}
+
+.hero-card,
+.panel-card,
+.story-card {
+ border-radius: 28px;
+ background: var(--surface);
+ border: 1px solid rgba(29, 26, 23, 0.08);
+ box-shadow: var(--shadow);
+}
+
+.hero-card {
+ min-height: 520px;
+ padding: 30px;
+ display: flex;
+ flex-direction: column;
+ justify-content: space-between;
+ overflow: hidden;
+}
+
+.hero-1 {
+ background:
+ linear-gradient(145deg, rgba(16, 22, 41, 0.95), rgba(43, 70, 122, 0.85)),
+ radial-gradient(circle at top right, rgba(255, 255, 255, 0.18), transparent 35%);
+ color: #f7f8fb;
+}
+
+.hero-2 {
+ min-height: 340px;
+ background: linear-gradient(160deg, #d4d8df, #f6f8fb 44%, #f0ece6);
+}
+
+.hero-3 {
+ min-height: 360px;
+ background: linear-gradient(160deg, #d7d1c8, #f8f3ea 44%, #efe8de);
+}
+
+.media-badge,
+.collection-label,
+.section-kicker,
+.story-label,
+.drawer-kicker {
+ display: inline-flex;
+ align-items: center;
+ gap: 8px;
+ font-size: 12px;
+ letter-spacing: 0.08em;
+ text-transform: uppercase;
+ color: var(--muted);
+}
+
+.hero-1 .media-badge {
+ color: #dce8ff;
+}
+
+.hero-copy h1,
+.hero-copy h2,
+.panel-title,
+.product-name {
+ margin: 0;
+ line-height: 1.06;
+}
+
+.hero-copy h1 {
+ max-width: 78%;
+ font-size: 54px;
+}
+
+.hero-copy.slim h2 {
+ max-width: 72%;
+ font-size: 34px;
+}
+
+.detail-column .panel-card,
+.product-summary {
+ padding: 28px;
+}
+
+.product-summary {
+ border-radius: 28px;
+ background: rgba(252, 251, 248, 0.9);
+ border: 1px solid var(--line);
+ box-shadow: var(--shadow);
+}
+
+.price-row {
+ display: flex;
+ gap: 14px;
+ align-items: baseline;
+ margin: 12px 0 14px;
+}
+
+.price {
+ font-size: 34px;
+ font-weight: 700;
+}
+
+.price-note,
+.summary-copy,
+.fit-note,
+.panel-copy,
+.shipping-banner,
+.drawer-subhead,
+.review-topline,
+.guide-section p {
+ color: var(--muted);
+}
+
+.swatch-row,
+.feature-pills,
+.size-grid,
+.purchase-actions {
+ display: flex;
+ flex-wrap: wrap;
+ gap: 12px;
+}
+
+.swatch,
+.size-option,
+.secondary-btn,
+.review-action,
+.header-btn,
+.sticky-btn {
+ border: 1px solid var(--line);
+ background: #fff;
+ color: var(--ink);
+ border-radius: 999px;
+ padding: 12px 16px;
+}
+
+.swatch.active,
+.size-option.active {
+ border-color: var(--accent);
+ background: var(--accent-soft);
+ color: var(--accent);
+}
+
+.feature-pill {
+ border-radius: 999px;
+ background: rgba(29, 26, 23, 0.05);
+ padding: 10px 14px;
+ font-size: 13px;
+}
+
+.add-btn {
+ border: none;
+ border-radius: 999px;
+ padding: 14px 22px;
+ background: var(--accent);
+ color: #fff;
+ font-weight: 700;
+ box-shadow: 0 16px 30px rgba(39, 71, 216, 0.24);
+}
+
+.add-btn:disabled {
+ opacity: 0.48;
+ cursor: not-allowed;
+ box-shadow: none;
+}
+
+.add-btn.is-complete,
+.review-action.is-complete {
+ border-color: rgba(28, 122, 78, 0.2);
+ background: #1c7a4e;
+ color: #fff;
+ box-shadow: 0 16px 30px rgba(28, 122, 78, 0.22);
+}
+
+.add-btn.is-complete:disabled,
+.review-action.is-complete:disabled {
+ opacity: 1;
+ cursor: default;
+}
+
+.action-feedback {
+ margin: 14px 0 0;
+ min-height: 20px;
+ font-size: 14px;
+ font-weight: 600;
+ color: var(--success);
+ opacity: 0;
+ transform: translateY(4px);
+ transition: opacity 140ms ease, transform 140ms ease;
+}
+
+.action-feedback.visible {
+ opacity: 1;
+ transform: translateY(0);
+}
+
+.drawer-feedback {
+ padding: 0 28px 12px;
+ margin: 0;
+}
+
+.story-grid {
+ margin-top: 32px;
+ display: grid;
+ grid-template-columns: repeat(3, minmax(0, 1fr));
+ gap: 20px;
+}
+
+.story-card {
+ padding: 24px;
+}
+
+.story-card h3 {
+ margin: 12px 0 0;
+ font-size: 20px;
+ line-height: 1.3;
+}
+
+.sticky-cta {
+ position: fixed;
+ left: 24px;
+ right: 24px;
+ bottom: 20px;
+ z-index: 24;
+ display: flex;
+ justify-content: space-between;
+ align-items: center;
+ padding: 18px 22px;
+ border-radius: 24px;
+ background: rgba(24, 27, 36, 0.94);
+ color: #f6f7fb;
+ box-shadow: 0 26px 60px rgba(15, 17, 23, 0.32);
+}
+
+.sticky-label {
+ font-size: 15px;
+ font-weight: 700;
+}
+
+.sticky-meta {
+ font-size: 13px;
+ color: rgba(246, 247, 251, 0.72);
+}
+
+.sticky-btn {
+ border-color: rgba(255, 255, 255, 0.2);
+ background: rgba(255, 255, 255, 0.08);
+ color: #fff;
+}
+
+.drawer-overlay {
+ position: fixed;
+ inset: 0;
+ z-index: 50;
+ display: flex;
+ justify-content: flex-end;
+ background: rgba(22, 18, 15, 0.34);
+ backdrop-filter: blur(4px);
+}
+
+.drawer-overlay.hidden {
+ display: none;
+}
+
+.drawer {
+ width: min(520px, 100%);
+ height: 100%;
+ background: var(--surface);
+ display: flex;
+ flex-direction: column;
+ position: relative;
+ box-shadow: -24px 0 60px rgba(28, 21, 16, 0.18);
+}
+
+.drawer-header {
+ display: flex;
+ justify-content: space-between;
+ align-items: flex-start;
+ padding: 28px 28px 18px;
+ border-bottom: 1px solid var(--line);
+}
+
+.drawer-header h2 {
+ margin: 6px 0 0;
+ font-size: 30px;
+}
+
+.drawer-close {
+ border: 1px solid var(--line);
+ background: #fff;
+ border-radius: 999px;
+ padding: 10px 14px;
+}
+
+.drawer-subhead {
+ padding: 0 28px 18px;
+ font-size: 14px;
+}
+
+.drawer-scroll {
+ flex: 1;
+ overflow-y: auto;
+ padding: 0 28px 140px;
+ scroll-behavior: smooth;
+}
+
+.guide-scroll {
+ padding-bottom: 260px;
+}
+
+.fit-guide-refresh-fab {
+ position: fixed;
+ left: var(--fit-guide-refresh-left, calc(100vw - min(520px, 100vw) + 78px));
+ top: var(--fit-guide-refresh-top, calc(100vh - 150px));
+ z-index: 60;
+ width: 82px;
+ height: 82px;
+ border: 1px solid rgba(29, 26, 23, 0.12);
+ border-radius: 999px;
+ display: grid;
+ place-items: center;
+ background: rgba(252, 251, 248, 0.96);
+ color: #1d1a17;
+ box-shadow: 0 18px 34px rgba(28, 21, 16, 0.18);
+ backdrop-filter: blur(12px);
+ transition: opacity 140ms ease, box-shadow 140ms ease;
+}
+
+.fit-guide-refresh-fab svg {
+ width: 32px;
+ height: 32px;
+ fill: currentColor;
+}
+
+.review-card,
+.guide-section {
+ border-radius: 22px;
+ background: #fff;
+ border: 1px solid var(--line);
+ padding: 22px;
+ margin-bottom: 18px;
+}
+
+.review-card h3,
+.guide-section h3 {
+ margin: 8px 0 10px;
+ font-size: 24px;
+}
+
+.review-card p,
+.guide-section p {
+ margin: 0 0 18px;
+ line-height: 1.55;
+}
+
+.target-action {
+ background: rgba(39, 71, 216, 0.08);
+ border-color: rgba(39, 71, 216, 0.32);
+ color: var(--accent);
+}
+
+.review-card.is-complete,
+.guide-section.is-complete,
+.panel-card.is-complete {
+ border-color: rgba(28, 122, 78, 0.22);
+ box-shadow: 0 18px 36px rgba(28, 122, 78, 0.14);
+}
+
+.review-action.is-complete.target-action {
+ border-color: rgba(28, 122, 78, 0.2);
+ background: #1c7a4e;
+ color: #fff;
+}
+
+.target-guide-section {
+ border-color: rgba(39, 71, 216, 0.24);
+ background: linear-gradient(180deg, rgba(39, 71, 216, 0.05), rgba(39, 71, 216, 0.01));
+}
+
+.target-guide-section.is-complete {
+ border-color: rgba(28, 122, 78, 0.22);
+ background: linear-gradient(180deg, rgba(28, 122, 78, 0.08), rgba(28, 122, 78, 0.02));
+}
+
+@media (max-width: 1080px) {
+ .site-header {
+ grid-template-columns: 1fr;
+ justify-items: start;
+ }
+
+ .product-layout,
+ .story-grid {
+ grid-template-columns: 1fr;
+ }
+
+ .search-shell input {
+ width: min(100%, 320px);
+ }
+
+ .sticky-cta {
+ left: 14px;
+ right: 14px;
+ bottom: 14px;
+ padding: 16px 18px;
+ gap: 12px;
+ }
+
+ .fit-guide-refresh-fab {
+ width: 76px;
+ height: 76px;
+ }
+}
diff --git a/eval/northstar/index.html b/eval/northstar/index.html
new file mode 100644
index 0000000..f43b4f7
--- /dev/null
+++ b/eval/northstar/index.html
@@ -0,0 +1,270 @@
+
+
+
+
+
+ Northstar Outfitters - Commuter Shell
+
+
+
+
+ Free two-day shipping on commuting layers and rain shells
+
+
+
+
+
+
+ Men
+ /
+ Shells
+ /
+ Commuter Shell
+
+
+
+
+
+ Waterproof 20k
+
+
Commuter Shell
+
Storm-proof layer built for train platforms, bike lanes, and late meetings.
+
+
+
+
+
Lightweight membrane
+
Quiet stretch face fabric and taped seams for all-day wear.
+
+
+
+
+
Packable hood
+
Helmet-friendly shape with hidden visor and storm collar.
+
+
+
+
+
+
+
+
+
+ Weather proofing
+ 20k / 20k membrane with seam tape on the shoulder line and hood crown.
+
+
+ Bike commute fit
+ Dropped rear hem and articulated sleeves keep the cuff line steady on the bars.
+
+
+ Storage
+ Front chest entry plus inside drop pocket sized for notebook, gloves, and compact charger.
+
+
+
+
+
+
+
Commuter Shell
+
Locker pickup available
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/eval/northstar/js/northstar.js b/eval/northstar/js/northstar.js
new file mode 100644
index 0000000..d13cba6
--- /dev/null
+++ b/eval/northstar/js/northstar.js
@@ -0,0 +1,354 @@
+window.tracker = new AgentTracker("northstaroutfitters.com", "hard");
+
+document.addEventListener("DOMContentLoaded", () => {
+ const state = {
+ selectedSize: null,
+ purchaseRailTracked: false,
+ reviewsGeometryTracked: false,
+ fitGuideGeometryTracked: false,
+ };
+
+ const purchaseRail = document.getElementById("purchase-rail");
+ const addToBagBtn = document.getElementById("add-to-bag");
+ const purchaseFeedback = document.getElementById("purchase-feedback");
+ const sizeButtons = Array.from(document.querySelectorAll(".size-option"));
+ const reviewsOverlay = document.getElementById("reviews-overlay");
+ const reviewsFeedback = document.getElementById("reviews-feedback");
+ const reviewsScroll = document.getElementById("reviews-scroll");
+ const fitGuideOverlay = document.getElementById("fit-guide-overlay");
+ const fitGuideScroll = document.getElementById("fit-guide-scroll");
+ const targetReview = document.querySelector('[data-review-id="review-trail-commute"]')?.closest(".review-card");
+ const careSection = document.getElementById("care-wash-section");
+ const fitGuideFeedback = document.getElementById("fit-guide-feedback");
+ const saveFitGuideBtn = document.getElementById("save-fit-guide");
+ const fitGuideRefreshBtn = document.getElementById("fit-guide-refresh");
+
+ function showFeedback(node, message) {
+ if (!node) {
+ return;
+ }
+
+ node.textContent = message;
+ node.classList.add("visible");
+ }
+
+ function resetAddToBagState() {
+ addToBagBtn.textContent = "Add to bag";
+ addToBagBtn.classList.remove("is-complete");
+ addToBagBtn.disabled = state.selectedSize === null;
+ }
+
+ function clamp(value, min, max) {
+ return Math.min(Math.max(value, min), max);
+ }
+
+ function resetFitGuideRefreshPlacement() {
+ if (!fitGuideRefreshBtn) {
+ return;
+ }
+
+ fitGuideRefreshBtn.style.removeProperty("--fit-guide-refresh-left");
+ fitGuideRefreshBtn.style.removeProperty("--fit-guide-refresh-top");
+ }
+
+ function placeFitGuideRefreshBlocker() {
+ if (!fitGuideRefreshBtn || !saveFitGuideBtn || fitGuideOverlay.classList.contains("hidden")) {
+ resetFitGuideRefreshPlacement();
+ return;
+ }
+
+ const drawer = fitGuideOverlay.querySelector(".drawer");
+ const saveRect = saveFitGuideBtn.getBoundingClientRect();
+ const drawerRect = drawer?.getBoundingClientRect();
+ const blockerWidth = fitGuideRefreshBtn.offsetWidth || 82;
+ const blockerHeight = fitGuideRefreshBtn.offsetHeight || 82;
+ let left = saveRect.left + saveRect.width / 2 - blockerWidth / 2;
+ let top = saveRect.top + saveRect.height / 2 - blockerHeight / 2;
+
+ if (drawerRect) {
+ const minLeft = drawerRect.left + 16;
+ const maxLeft = drawerRect.right - blockerWidth - 16;
+ const minTop = drawerRect.top + 96;
+ const maxTop = drawerRect.bottom - blockerHeight - 16;
+
+ left = clamp(left, minLeft, Math.max(minLeft, maxLeft));
+ top = clamp(top, minTop, Math.max(minTop, maxTop));
+ }
+
+ fitGuideRefreshBtn.style.setProperty("--fit-guide-refresh-left", `${Math.round(left)}px`);
+ fitGuideRefreshBtn.style.setProperty("--fit-guide-refresh-top", `${Math.round(top)}px`);
+ }
+
+ function queueFitGuideRefreshPlacement() {
+ requestAnimationFrame(() => {
+ requestAnimationFrame(placeFitGuideRefreshBlocker);
+ });
+ }
+
+ function trackGeometryReadyOnce() {
+ if (!purchaseRail || state.purchaseRailTracked) {
+ return;
+ }
+
+ const rect = purchaseRail.getBoundingClientRect();
+ const topSafe = 130;
+ const bottomSafe = window.innerHeight - 180;
+ const centeredEnough = rect.top >= topSafe && rect.bottom <= bottomSafe;
+
+ if (!centeredEnough) {
+ return;
+ }
+
+ state.purchaseRailTracked = true;
+ tracker.track("purchase_rail_geometry_ready", {
+ section: "purchase-rail",
+ scrollY: Math.round(window.scrollY),
+ });
+ }
+
+ function isCenteredInContainer(target, container) {
+ if (!target || !container) {
+ return false;
+ }
+
+ const targetRect = target.getBoundingClientRect();
+ const containerRect = container.getBoundingClientRect();
+ const targetCenter = targetRect.top + targetRect.height / 2;
+ const containerCenter = containerRect.top + containerRect.height / 2;
+ const tolerance = Math.max(60, containerRect.height * 0.18);
+
+ return (
+ targetRect.top >= containerRect.top + 24 &&
+ targetRect.bottom <= containerRect.bottom - 110 &&
+ Math.abs(targetCenter - containerCenter) <= tolerance
+ );
+ }
+
+ function overlapsBlocker(target, blocker) {
+ if (!target || !blocker) {
+ return false;
+ }
+
+ const targetRect = target.getBoundingClientRect();
+ const blockerRect = blocker.getBoundingClientRect();
+
+ const horizontalOverlap =
+ Math.min(targetRect.right, blockerRect.right) -
+ Math.max(targetRect.left, blockerRect.left);
+ const verticalOverlap =
+ Math.min(targetRect.bottom, blockerRect.bottom) -
+ Math.max(targetRect.top, blockerRect.top);
+
+ return horizontalOverlap > 8 && verticalOverlap > 8;
+ }
+
+ function trackReviewsGeometry() {
+ if (state.reviewsGeometryTracked || reviewsOverlay.classList.contains("hidden")) {
+ return;
+ }
+
+ if (!isCenteredInContainer(targetReview, reviewsScroll)) {
+ return;
+ }
+
+ state.reviewsGeometryTracked = true;
+ tracker.track("reviews_drawer_geometry_ready", {
+ reviewId: "review-trail-commute",
+ scrollTop: Math.round(reviewsScroll.scrollTop),
+ });
+ }
+
+ function trackFitGuideGeometry() {
+ if (state.fitGuideGeometryTracked || fitGuideOverlay.classList.contains("hidden")) {
+ return;
+ }
+
+ if (!isCenteredInContainer(careSection, fitGuideScroll)) {
+ return;
+ }
+
+ if (overlapsBlocker(saveFitGuideBtn, fitGuideRefreshBtn)) {
+ return;
+ }
+
+ state.fitGuideGeometryTracked = true;
+ tracker.track("fit_guide_geometry_ready", {
+ section: "care-wash",
+ scrollTop: Math.round(fitGuideScroll.scrollTop),
+ });
+ }
+
+ window.addEventListener("scroll", trackGeometryReadyOnce, { passive: true });
+ window.addEventListener("resize", () => {
+ trackGeometryReadyOnce();
+
+ if (!fitGuideOverlay.classList.contains("hidden")) {
+ queueFitGuideRefreshPlacement();
+ }
+ });
+
+ sizeButtons.forEach((button) => {
+ button.addEventListener("click", () => {
+ sizeButtons.forEach((item) => item.classList.remove("active"));
+ button.classList.add("active");
+ state.selectedSize = button.dataset.size || null;
+ resetAddToBagState();
+ showFeedback(
+ purchaseFeedback,
+ `Size ${state.selectedSize} selected. Ready to add the shell to bag.`,
+ );
+ tracker.track("product_size_select", {
+ productId: "commuter-shell",
+ size: state.selectedSize,
+ });
+ });
+ });
+
+ addToBagBtn.addEventListener("click", () => {
+ if (!state.selectedSize) {
+ return;
+ }
+
+ tracker.track("product_add_to_bag", {
+ productId: "commuter-shell",
+ size: state.selectedSize,
+ });
+
+ addToBagBtn.textContent = "Added to bag";
+ addToBagBtn.classList.add("is-complete");
+ addToBagBtn.disabled = true;
+ purchaseRail.classList.add("is-complete");
+ showFeedback(
+ purchaseFeedback,
+ `Commuter Shell in size ${state.selectedSize} added to your bag.`,
+ );
+ });
+
+ document.getElementById("open-reviews")?.addEventListener("click", () => {
+ state.reviewsGeometryTracked = false;
+ reviewsOverlay.classList.remove("hidden");
+ reviewsScroll.scrollTop = 0;
+ showFeedback(
+ reviewsFeedback,
+ "Reviews drawer opened. Scroll inside this panel to reach the target review.",
+ );
+ tracker.track("reviews_drawer_open", {
+ drawer: "reviews",
+ });
+ });
+
+ document.getElementById("open-fit-guide")?.addEventListener("click", () => {
+ state.fitGuideGeometryTracked = false;
+ fitGuideOverlay.classList.remove("hidden");
+ fitGuideScroll.scrollTop = 0;
+ queueFitGuideRefreshPlacement();
+ showFeedback(
+ fitGuideFeedback,
+ "Fit guide opened. Scroll this panel until Care & Wash is centered.",
+ );
+ tracker.track("fit_guide_open", {
+ drawer: "fit-guide",
+ });
+ });
+
+ document.querySelectorAll(".drawer-close").forEach((button) => {
+ button.addEventListener("click", () => {
+ const overlayId = button.dataset.close;
+ if (!overlayId) {
+ return;
+ }
+
+ document.getElementById(overlayId)?.classList.add("hidden");
+ if (overlayId === "fit-guide-overlay") {
+ resetFitGuideRefreshPlacement();
+ }
+ tracker.track("drawer_close", {
+ drawer: overlayId,
+ });
+ });
+ });
+
+ reviewsOverlay.addEventListener("click", (event) => {
+ if (event.target === reviewsOverlay) {
+ reviewsOverlay.classList.add("hidden");
+ tracker.track("drawer_close", {
+ drawer: "reviews-overlay",
+ method: "overlay",
+ });
+ }
+ });
+
+ fitGuideOverlay.addEventListener("click", (event) => {
+ if (event.target === fitGuideOverlay) {
+ fitGuideOverlay.classList.add("hidden");
+ resetFitGuideRefreshPlacement();
+ tracker.track("drawer_close", {
+ drawer: "fit-guide-overlay",
+ method: "overlay",
+ });
+ }
+ });
+
+ fitGuideRefreshBtn?.addEventListener("click", () => {
+ window.location.reload();
+ });
+
+ reviewsScroll.addEventListener(
+ "scroll",
+ () => {
+ tracker.track("reviews_panel_scroll", {
+ scrollTop: Math.round(reviewsScroll.scrollTop),
+ });
+ trackReviewsGeometry();
+ },
+ { passive: true },
+ );
+
+ fitGuideScroll.addEventListener(
+ "scroll",
+ () => {
+ tracker.track("fit_guide_scroll", {
+ scrollTop: Math.round(fitGuideScroll.scrollTop),
+ });
+ trackFitGuideGeometry();
+ },
+ { passive: true },
+ );
+
+ document.querySelectorAll(".review-action").forEach((button) => {
+ button.addEventListener("click", () => {
+ const reviewId = button.dataset.reviewId;
+ const card = button.closest(".review-card");
+ const title = card?.querySelector("h3")?.textContent?.trim() || "";
+
+ tracker.track("review_helpful", {
+ reviewId,
+ title,
+ helpful: true,
+ });
+
+ button.textContent = "Helpful saved";
+ button.classList.add("is-complete");
+ button.disabled = true;
+ card?.classList.add("is-complete");
+ showFeedback(reviewsFeedback, `Saved "${title}" as Helpful.`);
+ });
+ });
+
+ saveFitGuideBtn?.addEventListener("click", () => {
+ tracker.track("fit_guide_save", {
+ section: "care-wash",
+ });
+
+ saveFitGuideBtn.textContent = "Guide saved";
+ saveFitGuideBtn.classList.add("is-complete");
+ saveFitGuideBtn.disabled = true;
+ careSection?.classList.add("is-complete");
+ showFeedback(
+ fitGuideFeedback,
+ "Care & Wash saved. This section is now marked as complete.",
+ );
+ });
+
+ trackGeometryReadyOnce();
+});
diff --git a/eval/server.py b/eval/server.py
index 5eaea3b..9bd38df 100644
--- a/eval/server.py
+++ b/eval/server.py
@@ -13,20 +13,94 @@
4. Export events via /api/events endpoint
"""
-import http.server
-import socketserver
import html
+import http.server
import json
import os
+import socketserver
+import threading
+from copy import deepcopy
from datetime import datetime
-from urllib.parse import urlparse, parse_qs
+from urllib.parse import parse_qs, urlparse
# Configuration
PORT = 16605
EVAL_DIR = os.path.dirname(os.path.abspath(__file__))
# In-memory event storage
-events_store = {"events": [], "sessions": {}}
+events_store = {"events": [], "sessions": {}, "sites": {}}
+events_store_lock = threading.Lock()
+
+SITE_NAME_TO_BUCKET = {  # lowercased site name -> bucket key; looked up by _normalize_site_bucket
+ "globalbusinessreview.com": "gbr",
+ "techforum.com": "techforum",
+ "cloudstack.com": "cloudstack",
+ "dataflow.io": "dataflow",
+ "finviz": "finviz",
+ "bluebook.life": "bluebook",
+ "northstaroutfitters.com": "northstar",
+}
+
+
+def _normalize_site_bucket(raw_value):
+ """Normalize a site/path/domain into a mock-site bucket key."""
+ if not raw_value or not isinstance(raw_value, str):
+ return None
+
+ parsed = urlparse(raw_value)
+ candidate = parsed.path if parsed.scheme or parsed.netloc else raw_value
+ candidate = candidate.strip()
+
+ if candidate.startswith("/"):
+ segments = [segment for segment in candidate.split("/") if segment]
+ if segments:
+ return segments[0]
+
+ normalized = candidate.strip().lower()
+ return SITE_NAME_TO_BUCKET.get(normalized, normalized or None)
+
+
+def _get_event_site_bucket(event):
+ """Infer the mock-site bucket for one tracked event."""
+ for key in ("page", "url", "site"):
+ bucket = _normalize_site_bucket(event.get(key))
+ if bucket:
+ return bucket
+ return "unknown"
+
+
+def _get_or_create_site_store(site_bucket):
+ """Return the per-site event store, creating it if needed."""
+ return events_store["sites"].setdefault(site_bucket, {"events": [], "sessions": {}})
+
+
+def _snapshot_events(site_bucket=None):
+ """Return a JSON-safe snapshot of tracked events."""
+ with events_store_lock:
+ if site_bucket:
+ site_store = events_store["sites"].get(
+ site_bucket, {"events": [], "sessions": {}}
+ )
+ return {
+ "site": site_bucket,
+ "events": deepcopy(site_store["events"]),
+ "sessions": deepcopy(site_store["sessions"]),
+ }
+
+ return deepcopy(events_store)
+
+
+def _clear_events(site_bucket=None):
+ """Clear tracked events globally or for a specific site."""
+ with events_store_lock:
+ if site_bucket:
+ events_store["sites"][site_bucket] = {"events": [], "sessions": {}}
+ return
+
+ events_store["events"] = []
+ events_store["sessions"] = {}
+ events_store["sites"] = {}
+
# URL mappings
URL_MAPPINGS = {
@@ -177,6 +251,8 @@
"/finviz/index.html": ("/finviz/index.html", "text/html"),
"/bluebook/": ("/bluebook/index.html", "text/html"),
"/bluebook/index.html": ("/bluebook/index.html", "text/html"),
+ "/northstar/": ("/northstar/index.html", "text/html"),
+ "/northstar/index.html": ("/northstar/index.html", "text/html"),
}
CSS_MIMETYPE = "text/css"
@@ -197,16 +273,25 @@ def do_GET(self):
"""Handle GET requests"""
parsed_path = urlparse(self.path)
path = parsed_path.path
+ query_params = parse_qs(parsed_path.query)
+ site_bucket = _normalize_site_bucket(query_params.get("site", [None])[0])
# API endpoints
if path == "/api/events":
- self.send_json_response(events_store)
+ self.send_json_response(_snapshot_events(site_bucket))
return
elif path == "/api/events/clear":
- events_store["events"] = []
- events_store["sessions"] = {}
+ _clear_events(site_bucket)
self.send_json_response(
- {"status": "cleared", "message": "All events cleared"}
+ {
+ "status": "cleared",
+ "site": site_bucket,
+ "message": (
+ f"Events cleared for site '{site_bucket}'"
+ if site_bucket
+ else "All events cleared"
+ ),
+ }
)
return
elif path == "/api/sites":
@@ -248,6 +333,12 @@ def do_GET(self):
"url": "/bluebook/",
"description": "Xiaohongshu-like feed - test search, note modal, comment actions, and dense visual layouts",
},
+ {
+ "name": "northstaroutfitters.com",
+ "difficulty": "hard",
+ "url": "/northstar/",
+ "description": "Apparel product page - test geometry-first scrolling, sticky UI, and drawer-scoped scrolling",
+ },
]
}
self.send_json_response(sites)
@@ -255,8 +346,8 @@ def do_GET(self):
elif path == "/api/help":
help_text = {
"endpoints": {
- "GET /api/events": "Get all tracked events",
- "GET /api/events/clear": "Clear all events",
+ "GET /api/events": "Get tracked events (optional ?site=)",
+ "GET /api/events/clear": "Clear tracked events (optional ?site=)",
"GET /api/sites": "List available mock sites",
"GET /api/help": "Show this help",
"POST /api/track": "Submit tracking event (from browser)",
@@ -268,6 +359,7 @@ def do_GET(self):
"/dataflow/": "DataFlow analytics dashboard mock (medium)",
"/finviz/": "Finviz stock screener mock (hard)",
"/bluebook/": "BlueBook lifestyle feed mock (hard)",
+ "/northstar/": "Northstar Outfitters product page mock (hard)",
},
}
self.send_json_response(help_text)
@@ -317,6 +409,7 @@ def do_GET(self):
"dataflow",
"finviz",
"bluebook",
+ "northstar",
]:
if path.startswith(f"/{site}/js/") and path.endswith(".js"):
self.send_file(path, JS_MIMETYPE)
@@ -330,6 +423,7 @@ def do_GET(self):
"dataflow",
"finviz",
"bluebook",
+ "northstar",
]:
if path.startswith(f"/{site}/css/") and path.endswith(".css"):
self.send_file(path, CSS_MIMETYPE)
@@ -364,22 +458,42 @@ def do_POST(self):
try:
event = json.loads(post_data.decode("utf-8"))
event["received_at"] = datetime.now().isoformat()
- events_store["events"].append(event)
- # Track sessions
+ site_bucket = _get_event_site_bucket(event)
session_id = event.get("sessionId", "unknown")
- if session_id not in events_store["sessions"]:
- events_store["sessions"][session_id] = {
- "sessionId": session_id,
- "site": event.get("site", "unknown"),
- "difficulty": event.get("difficulty", "unknown"),
- "start_time": event.get("timestamp"),
- "events_count": 0,
- }
- events_store["sessions"][session_id]["events_count"] += 1
- events_store["sessions"][session_id]["last_activity"] = event.get(
- "timestamp"
- )
+
+ with events_store_lock:
+ events_store["events"].append(event)
+
+ if session_id not in events_store["sessions"]:
+ events_store["sessions"][session_id] = {
+ "sessionId": session_id,
+ "site": event.get("site", "unknown"),
+ "site_bucket": site_bucket,
+ "difficulty": event.get("difficulty", "unknown"),
+ "start_time": event.get("timestamp"),
+ "events_count": 0,
+ }
+ events_store["sessions"][session_id]["events_count"] += 1
+ events_store["sessions"][session_id]["last_activity"] = event.get(
+ "timestamp"
+ )
+
+ site_store = _get_or_create_site_store(site_bucket)
+ site_store["events"].append(event)
+ if session_id not in site_store["sessions"]:
+ site_store["sessions"][session_id] = {
+ "sessionId": session_id,
+ "site": event.get("site", "unknown"),
+ "site_bucket": site_bucket,
+ "difficulty": event.get("difficulty", "unknown"),
+ "start_time": event.get("timestamp"),
+ "events_count": 0,
+ }
+ site_store["sessions"][session_id]["events_count"] += 1
+ site_store["sessions"][session_id]["last_activity"] = event.get(
+ "timestamp"
+ )
self.send_json_response({"status": "ok", "message": "Event tracked"})
except Exception as e:
@@ -726,27 +840,32 @@ def print_startup_info(port):
print("Mock Websites Server for AI Agent Evaluation")
print("=" * 60)
print(f"\nServer started at: http://localhost:{port}")
- print(f"\nAvailable Sites:")
+ print("\nAvailable Sites:")
print(f" - GBR (Easy): http://localhost:{port}/gbr/")
print(f" - TechForum (Medium): http://localhost:{port}/techforum/")
print(f" - CloudStack (Hard): http://localhost:{port}/cloudstack/")
print(f" - DataFlow (Medium): http://localhost:{port}/dataflow/")
print(f" - Finviz (Hard): http://localhost:{port}/finviz/")
print(f" - BlueBook (Hard): http://localhost:{port}/bluebook/")
- print(f"\nAPI Endpoints:")
- print(f" - GET http://localhost:{port}/api/events - Get all tracked events")
- print(f" - GET http://localhost:{port}/api/events/clear - Clear all events")
+ print("\nAPI Endpoints:")
+ print(
+ f" - GET http://localhost:{port}/api/events - Get tracked events (?site=gbr)"
+ )
+ print(
+ f" - GET http://localhost:{port}/api/events/clear - Clear tracked events (?site=gbr)"
+ )
print(f" - GET http://localhost:{port}/api/sites - List available sites")
print(f" - GET http://localhost:{port}/api/help - API help")
print(f" - POST http://localhost:{port}/api/track - Submit tracking event")
- print(f"\nPress Ctrl+C to stop the server")
+ print("\nPress Ctrl+C to stop the server")
print("=" * 60 + "\n")
def main():
"""Main entry point"""
- with socketserver.TCPServer(("", PORT), MockWebsiteHandler) as httpd:
+ with socketserver.ThreadingTCPServer(("", PORT), MockWebsiteHandler) as httpd:
httpd.allow_reuse_address = True
+ httpd.daemon_threads = True
print_startup_info(PORT)
try:
diff --git a/extension/src/__tests__/background-cleanup-regression.test.ts b/extension/src/__tests__/background-cleanup-regression.test.ts
index 7938fe4..9583b2c 100644
--- a/extension/src/__tests__/background-cleanup-regression.test.ts
+++ b/extension/src/__tests__/background-cleanup-regression.test.ts
@@ -26,9 +26,48 @@ describe('Background cleanup regressions', () => {
test('swipe screenshots reuse tab-view warmup capture options', () => {
expect(backgroundSource).toContain("case 'swipe_element': {");
- expect(backgroundSource).toContain(' 900,');
+ expect(backgroundSource).toContain("logLabel: 'SwipeElement'");
+ expect(backgroundSource).toContain('preconditionWaitForRender: 900');
expect(backgroundSource).toContain(
- ' TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS,',
+ 'preconditionCaptureOptions: TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS',
+ );
+ });
+
+ test('highlight warmup defaults to tab-view capture options', () => {
+   // Snippets that must appear in the background source.
+   const requiredSnippets = [
+     'captureOptions = TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS',
+     'preconditionCaptureOptions ?? TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS',
+     'const screenshotResult = await captureScreenshot(',
+   ];
+   for (const snippet of requiredSnippets) {
+     expect(backgroundSource).toContain(snippet);
+   }
+
+   // The dedicated highlight capture options must no longer be referenced.
+   expect(backgroundSource).not.toContain(
+     'HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS',
+   );
+ });
+
+ test('navigation defaults prime the page with a raw screenshot before highlight', () => {
+   // The prime helper, its opt-in flag, and each navigation log label.
+   const requiredSnippets = [
+     'async function runRawScreenshotPrime(',
+     'primeWithRawScreenshot: true',
+     "logLabel: 'Tab Init'",
+     "logLabel: 'Tab Open'",
+     "logLabel: 'Tab Refresh'",
+   ];
+   for (const snippet of requiredSnippets) {
+     expect(backgroundSource).toContain(snippet);
+   }
+ });
+
+ test('post-action screenshots reuse the default highlighted page-state helper', () => {
+   // The shared helper plus two call sites identified by their log labels.
+   const requiredSnippets = [
+     'async function captureDefaultHighlightedPageState(',
+     "logLabel: 'ClickElement'",
+     "logLabel: 'Tab Init'",
+   ];
+   for (const snippet of requiredSnippets) {
+     expect(backgroundSource).toContain(snippet);
+   }
+ });
+
+ test('tab view still uses the raw screenshot path', () => {
+ expect(backgroundSource).toContain("case 'view': {");
+ expect(backgroundSource).toContain(
+ 'const viewScreenshotResult = await captureScreenshot(',
);
});
});
diff --git a/extension/src/__tests__/highlight-integration.test.ts b/extension/src/__tests__/highlight-integration.test.ts
index 2dfdb57..61c683f 100644
--- a/extension/src/__tests__/highlight-integration.test.ts
+++ b/extension/src/__tests__/highlight-integration.test.ts
@@ -109,8 +109,8 @@ describe('Highlight Integration', () => {
// Run selectCollisionFreePage
const page1 = selectCollisionFreePage(elements, 1);
- expect(page1.map((element) => element.id)).toEqual(
- page1.map((_, index) => String(index + 1)),
+ expect(new Set(page1.map((element) => element.id)).size).toBe(
+ page1.length,
);
// Verify no label collisions on the same page
@@ -161,15 +161,19 @@ describe('Highlight Integration', () => {
const positions = new Set(page1.map((e) => e.labelPosition));
expect(positions.size).toBe(page1.length);
- // Verify elements on different pages while numeric ids reset per page.
+ // Verify elements on different pages while preserving each element's ID.
const page1Selectors = new Set(page1.map((e) => e.selector));
+ const expectedIdsBySelector = Object.fromEntries(
+ elements.map((element) => [element.selector, element.id]),
+ );
const page2 = selectCollisionFreePage(elements, 2);
expect(page2.length).toBeGreaterThan(0);
- expect(page2.map((element) => element.id)).toEqual(
- page2.map((_, index) => String(index + 1)),
- );
for (const elem of page2) {
expect(page1Selectors.has(elem.selector)).toBe(false);
+ expect(expectedIdsBySelector[elem.selector]).toBe(elem.id);
+ }
+ for (const elem of page1) {
+ expect(expectedIdsBySelector[elem.selector]).toBe(elem.id);
}
});
@@ -259,6 +263,16 @@ describe('Highlight Integration', () => {
// Both should fit with appropriate label positions
expect(result.length).toBeGreaterThan(0);
});
+
+ test('should separate partially overlapping non-nested boxes across pages', () => {
+   // Two boxes that intersect without either containing the other.
+   const elements = [
+     createElement('overlap-a', 'clickable', 100, 100, 120, 40),
+     createElement('overlap-b', 'clickable', 180, 110, 120, 40),
+   ];
+
+   expect(calculateTotalPages(elements, 1280, 720)).toBe(2);
+   // Each page carries exactly one of the two elements.
+   for (const pageNumber of [1, 2]) {
+     expect(
+       selectCollisionFreePage(elements, pageNumber, 1280, 720),
+     ).toHaveLength(1);
+   }
+ });
});
describe('Label placement algorithm', () => {
@@ -331,7 +345,7 @@ describe('Highlight Integration', () => {
const page1 = selectCollisionFreePage(elements, 1, 1728, 891);
- expect(page1.map((e) => e.id)).toEqual(['1', '2', '3']);
+ expect(page1.map((e) => e.id)).toEqual(['modal', 'like', 'reply']);
expect(page1[0].labelPosition).toBeDefined();
expect(page1[1].labelPosition).toBeDefined();
expect(page1[2].labelPosition).toBeDefined();
@@ -356,6 +370,36 @@ describe('Highlight Integration', () => {
// Should not use 'left' position (would be outside viewport)
expect(leftElem?.labelPosition).not.toBe('left');
});
+
+ test('should treat one-pixel label-to-element gaps as blocked', () => {
+   // Two boxes stacked in the same column.
+   const stacked = [
+     createElement('upper', 'clickable', 100, 44, 80, 30),
+     createElement('lower', 'clickable', 100, 101, 80, 30),
+   ];
+
+   const placed = selectCollisionFreePage(stacked, 1, 1280, 720);
+   const upperPlacement = findBySelector(placed, '#upper');
+   const lowerPlacement = findBySelector(placed, '#lower');
+
+   expect(upperPlacement?.labelPosition).toBe('above');
+   expect(lowerPlacement?.labelPosition).toBe('below');
+ });
+
+ test('should treat one-pixel label-to-label gaps as blocked', () => {
+   const left = createElement('AAAAAA', 'clickable', 100, 100, 24, 14);
+   const leftLabel = getLabelBBox(left.bbox, 'above', left.id);
+   // Start the second element one pixel past the right edge of the first
+   // element's "above" label.
+   const rightX = leftLabel.x + leftLabel.width + 1;
+   const right = createElement('CCCCCC', 'clickable', rightX, 100, 24, 14);
+
+   const placed = selectCollisionFreePage([left, right], 1, 1280, 720);
+   const leftPlacement = findBySelector(placed, '#AAAAAA');
+   const rightPlacement = findBySelector(placed, '#CCCCCC');
+
+   expect(leftPlacement?.labelPosition).not.toBe('above');
+   expect(rightPlacement?.labelPosition).toBe('above');
+ });
});
describe('Edge cases', () => {
diff --git a/extension/src/__tests__/highlight-padding.test.ts b/extension/src/__tests__/highlight-padding.test.ts
index 836756a..86543ca 100644
--- a/extension/src/__tests__/highlight-padding.test.ts
+++ b/extension/src/__tests__/highlight-padding.test.ts
@@ -13,18 +13,17 @@ import {
/**
* TDD Tests for Padding Consistency
- * Bug: BASE_LABEL_PADDING is 4, should be 5 to match LABEL_PADDING
- * Label height = 16 (font) + 5 (padding) * 2 = 26px
+ * Label height = 16 (font) + 3 (padding) * 2 = 22px
*/
describe('Padding Consistency', () => {
describe('Collision Detection Constants (background/index.ts)', () => {
- test('should have LABEL_HEIGHT of 26px', () => {
- expect(LABEL_HEIGHT).toBe(26);
+ test('should have LABEL_HEIGHT of 22px', () => {
+ expect(LABEL_HEIGHT).toBe(22);
});
- test('should have LABEL_PADDING of 5px', () => {
- expect(LABEL_PADDING).toBe(5);
+ test('should have LABEL_PADDING of 3px', () => {
+ expect(LABEL_PADDING).toBe(3);
});
test('should have LABEL_FONT_SIZE of 16px', () => {
@@ -33,8 +32,8 @@ describe('Padding Consistency', () => {
});
describe('Visual Drawing Constants (visual-highlight.ts)', () => {
- test('should have BASE_LABEL_PADDING of 5px', () => {
- expect(BASE_LABEL_PADDING).toBe(5);
+ test('should have BASE_LABEL_PADDING of 3px', () => {
+ expect(BASE_LABEL_PADDING).toBe(3);
});
test('should have BASE_FONT_SIZE of 16px', () => {
@@ -43,7 +42,7 @@ describe('Padding Consistency', () => {
test('should calculate label height consistently', () => {
const expectedLabelHeight = BASE_FONT_SIZE + BASE_LABEL_PADDING * 2;
- expect(expectedLabelHeight).toBe(26);
+ expect(expectedLabelHeight).toBe(22);
});
});
@@ -57,7 +56,7 @@ describe('Padding Consistency', () => {
const visualLabelHeight = BASE_FONT_SIZE + BASE_LABEL_PADDING * 2;
expect(collisionLabelHeight).toBe(visualLabelHeight);
- expect(collisionLabelHeight).toBe(26);
+ expect(collisionLabelHeight).toBe(22);
});
});
});
diff --git a/extension/src/__tests__/highlight-placement.test.ts b/extension/src/__tests__/highlight-placement.test.ts
index 7514a74..cf43175 100644
--- a/extension/src/__tests__/highlight-placement.test.ts
+++ b/extension/src/__tests__/highlight-placement.test.ts
@@ -9,6 +9,7 @@ import {
selectCollisionFreePage,
} from '../utils/collision-detection';
import type { InteractiveElement } from '../types';
+import { generateShortHash } from '../commands/element-id';
import { getLabelDimensions } from '../utils/label-geometry';
/**
@@ -23,18 +24,19 @@ import { getLabelDimensions } from '../utils/label-geometry';
// Helper to create a minimal InteractiveElement
function createElement(
- id: string,
+ selectorName: string,
x: number,
y: number,
width: number,
height: number,
labelPosition?: 'above' | 'below' | 'left' | 'right',
): InteractiveElement {
+ const selector = `#${selectorName}`;
return {
- id,
+ id: generateShortHash(selector),
type: 'clickable',
tagName: 'button',
- selector: `#${id}`,
+ selector,
bbox: { x, y, width, height },
isVisible: true,
isInViewport: true,
@@ -158,9 +160,9 @@ describe('Smart Label Placement', () => {
expect(result).toHaveLength(2);
expect(result[0]?.selector).toBe('#constrained');
- expect(result[0]?.id).toBe('1');
+ expect(result[0]?.id).toMatch(/^[0-9A-Z]{3}$/);
expect(result[1]?.selector).toBe('#flexible');
- expect(result[1]?.id).toBe('2');
+ expect(result[1]?.id).toMatch(/^[0-9A-Z]{3}$/);
});
test('should place label above when space available (default)', () => {
@@ -172,20 +174,22 @@ describe('Smart Label Placement', () => {
expect(result[0].labelPosition).toBe('above');
});
- test('should place label below when above collides with another element', () => {
+ test('should place one label below when two identical elements would both prefer above', () => {
// Element A at (100, 100) - label above at y=74-100
// Element B at (100, 100) - same position as A, label above would collide
- // Element B should try below instead
+ // The layout should split them across above/below instead of dropping one.
const elemA = createElement('a', 100, 100, 50, 30);
const elemB = createElement('b', 100, 100, 50, 30);
const elements = [elemA, elemB];
const result = selectCollisionFreePage(elements, 1);
- // Both elements should be on page 1 with different label positions
+ // Both elements should be on page 1 with different label positions.
expect(result).toHaveLength(2);
- const resultB = findBySelector(result, '#b');
- expect(resultB?.labelPosition).toBe('below');
+ expect(result.map((element) => element.labelPosition).sort()).toEqual([
+ 'above',
+ 'below',
+ ]);
});
test('should place label left when above and below collide', () => {
@@ -239,8 +243,8 @@ describe('Smart Label Placement', () => {
// Element completely surrounded in input order. The constraint-aware
// heuristic should reorder placements so the center element still fits.
const center = createElement('center', 200, 100, 50, 30);
- const above = createElement('above', 200, 74, 50, 30);
- const below = createElement('below', 200, 130, 50, 30);
+ const above = createElement('above', 200, 64, 50, 30);
+ const below = createElement('below', 200, 140, 50, 30);
const left = createElement('left', 80, 100, 50, 30);
const right = createElement('right', 320, 100, 50, 30);
@@ -367,12 +371,14 @@ describe('Smart Label Placement', () => {
// All should fit without collision
expect(result).toHaveLength(3);
- expect(result.map((element) => element.id)).toEqual(['1', '2', '3']);
expect(result.map((element) => element.selector)).toEqual([
'#a',
'#b',
'#c',
]);
+ expect(result.every((element) => /^[0-9A-Z]{3}$/.test(element.id))).toBe(
+ true,
+ );
});
});
});
diff --git a/extension/src/__tests__/label-geometry.test.ts b/extension/src/__tests__/label-geometry.test.ts
new file mode 100644
index 0000000..b79ae2e
--- /dev/null
+++ b/extension/src/__tests__/label-geometry.test.ts
@@ -0,0 +1,18 @@
+import { describe, expect, test } from 'bun:test';
+
+import { getLabelDimensions, getLabelTextWidth } from '../utils/label-geometry';
+
+describe('label-geometry', () => {
+  test('measures wide glyphs wider than narrow glyphs', () => {
+    // [narrow, wide] pairs: the narrow text must measure strictly smaller.
+    const pairs: Array<[string, string]> = [
+      ['III', 'WWW'],
+      ['111', 'MQH'],
+    ];
+    for (const [narrow, wide] of pairs) {
+      expect(getLabelTextWidth(narrow)).toBeLessThan(getLabelTextWidth(wide));
+    }
+  });
+
+  test('allocates enough width for measured label text plus padding', () => {
+    const { width, textWidth, padding } = getLabelDimensions('MQH');
+
+    // Total width is the text width plus padding on both sides.
+    expect(width).toBe(textWidth + padding * 2);
+  });
+});
diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts
index 992a006..c725b93 100644
--- a/extension/src/background/index.ts
+++ b/extension/src/background/index.ts
@@ -23,7 +23,8 @@ import { clearScreenshotCache } from '../commands/computer';
import { drawHighlights } from '../commands/visual-highlight';
import { highlightSingleElement } from '../commands/single-highlight';
import { elementCache } from '../commands/element-cache';
-import { assignSequentialElementIds } from '../commands/element-id';
+import { assignHashedElementIds } from '../commands/element-id';
+import { buildElementCacheMissMessage } from '../commands/element-cache';
import {
buildHighlightDetectionScript,
filterHighlightElementsByKeywords,
@@ -59,11 +60,15 @@ import {
type HighlightPageState,
} from '../utils/layout-stability';
import {
- HIGHLIGHT_PRECONDITION_CAPTURE_OPTIONS,
- HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS,
TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS,
+ type ScreenshotCaptureOptions,
} from '../utils/highlight-screenshot';
-import type { Command, CommandResponse, InteractiveElement } from '../types';
+import type {
+ Command,
+ CommandResponse,
+ ElementType,
+ InteractiveElement,
+} from '../types';
console.log('🚀 OpenBrowser extension starting (Strict Mode)...');
const SERVER_HTTP_URL = 'http://127.0.0.1:8765';
@@ -87,13 +92,24 @@ async function compressScreenshotResult(
async function runHighlightPreconditionWarmup(options: {
tabId: number;
conversationId: string;
- elementType: string;
+ elementType: ElementType;
page: number;
+ waitForRender?: number;
+ captureOptions?: ScreenshotCaptureOptions;
+ logLabel?: string;
}): Promise {
- const { tabId, conversationId, elementType, page } = options;
+ const {
+ tabId,
+ conversationId,
+ elementType,
+ page,
+ waitForRender = 350,
+ captureOptions = TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS,
+ logLabel = 'HighlightElements',
+ } = options;
const warmupStart = Date.now();
console.log(
- `🔥 [HighlightElements] Starting screenshot warmup precondition for elementType=${elementType}, page=${page}`,
+ `🔥 [${logLabel}] Starting screenshot warmup precondition for elementType=${elementType}, page=${page}`,
);
await captureScreenshot(
@@ -102,12 +118,44 @@ async function runHighlightPreconditionWarmup(options: {
true,
90,
false,
- 350,
- HIGHLIGHT_PRECONDITION_CAPTURE_OPTIONS,
+ waitForRender,
+ captureOptions,
);
console.log(
- `🔥 [HighlightElements] Screenshot warmup precondition completed in ${Date.now() - warmupStart}ms`,
+ `🔥 [${logLabel}] Screenshot warmup precondition completed in ${Date.now() - warmupStart}ms`,
+ );
+}
+
+/**
+ * Takes one throwaway screenshot of the tab and logs how long it took. The
+ * captured image is discarded; the log messages call this a "raw screenshot
+ * wake-up prime".
+ *
+ * @param options.tabId Tab to capture.
+ * @param options.conversationId Conversation the tab belongs to.
+ * @param options.waitForRender Forwarded to `captureScreenshot`; defaults to 350.
+ * @param options.captureOptions Forwarded to `captureScreenshot`; defaults to
+ *   `TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS`.
+ * @param options.logLabel Prefix used in the log lines; defaults to `HighlightPrime`.
+ * @returns Resolves once the capture finishes; rejects if `captureScreenshot` rejects.
+ */
+async function runRawScreenshotPrime(options: {
+  tabId: number;
+  conversationId: string;
+  waitForRender?: number;
+  captureOptions?: ScreenshotCaptureOptions;
+  logLabel?: string;
+}): Promise<void> {
+  const {
+    tabId,
+    conversationId,
+    waitForRender = 350,
+    captureOptions = TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS,
+    logLabel = 'HighlightPrime',
+  } = options;
+  const primeStart = Date.now();
+  console.log(`🔥 [${logLabel}] Starting raw screenshot wake-up prime`);
+
+  // NOTE(review): the positional `true, 90, false` arguments are passed through
+  // unchanged; confirm their meaning against captureScreenshot's signature.
+  await captureScreenshot(
+    tabId,
+    conversationId,
+    true,
+    90,
+    false,
+    waitForRender,
+    captureOptions,
+  );
+
+  console.log(
+    `🔥 [${logLabel}] Raw screenshot wake-up prime completed in ${Date.now() - primeStart}ms`,
+  );
+}
@@ -127,11 +175,7 @@ function buildStoredHighlightPages(options: {
} = options;
if (keywordMode) {
- return [
- assignSequentialElementIds(
- sortElementsByVisualOrder(assignSequentialElementIds(filteredElements)),
- ),
- ];
+ return [sortElementsByVisualOrder(filteredElements)];
}
const pages: InteractiveElement[][] = [];
@@ -142,9 +186,7 @@ function buildStoredHighlightPages(options: {
viewportWidth,
viewportHeight,
);
- pages.push(
- assignSequentialElementIds(sortElementsByVisualOrder(pageElements)),
- );
+ pages.push(sortElementsByVisualOrder(pageElements));
}
return pages;
@@ -196,6 +238,420 @@ function buildHighlightConsistencyScript(
`;
}
+interface ScreenshotPayload {
+ screenshot?: string;
+ dialog_auto_accepted?: unknown;
+ dialog_auto_accepted_list?: unknown;
+}
+
+interface HighlightedPageStateData extends ScreenshotPayload {
+ elements: InteractiveElement[];
+ totalElements: number;
+ totalPages: number;
+ page: number;
+ pageState: HighlightPageState;
+ readinessReasons: string[];
+}
+
+interface HighlightedPageCaptureOptions {
+ tabId: number;
+ conversationId: string;
+ elementType?: ElementType;
+ page?: number;
+ keywords?: string[];
+ logLabel?: string;
+ preconditionWaitForRender?: number;
+ preconditionCaptureOptions?: ScreenshotCaptureOptions;
+}
+
+function buildScreenshotPayload(
+ screenshotResult:
+ | {
+ imageData?: string;
+ dialog_auto_accepted?: unknown;
+ dialog_auto_accepted_list?: unknown;
+ }
+ | null
+ | undefined,
+): ScreenshotPayload {
+ return {
+ screenshot: screenshotResult?.imageData,
+ ...(screenshotResult?.dialog_auto_accepted
+ ? {
+ dialog_auto_accepted: screenshotResult.dialog_auto_accepted,
+ }
+ : {}),
+ ...(screenshotResult?.dialog_auto_accepted_list
+ ? {
+ dialog_auto_accepted_list: screenshotResult.dialog_auto_accepted_list,
+ }
+ : {}),
+ };
+}
+
+async function captureHighlightedPageState(
+ options: HighlightedPageCaptureOptions,
+): Promise {
+ const {
+ tabId,
+ conversationId,
+ elementType = 'any',
+ page = 1,
+ keywords,
+ logLabel = 'HighlightElements',
+ preconditionWaitForRender,
+ preconditionCaptureOptions,
+ } = options;
+
+ await tabManager.ensureTabManaged(tabId, conversationId);
+ tabManager.updateTabActivity(tabId, conversationId);
+
+ const detectionScript = buildHighlightDetectionScript({
+ elementType,
+ });
+
+ await runHighlightPreconditionWarmup({
+ tabId,
+ conversationId,
+ elementType,
+ page,
+ waitForRender: preconditionWaitForRender,
+ captureOptions: preconditionCaptureOptions,
+ logLabel,
+ });
+
+ const maxHighlightAttempts = 3;
+ const highlightDetectionTimeoutMs = 18000;
+ let previousConsistency: HighlightConsistencyResult | null = null;
+
+ for (let attempt = 1; attempt <= maxHighlightAttempts; attempt++) {
+ console.log(`🔁 [${logLabel}] Attempt ${attempt}/${maxHighlightAttempts}`);
+
+ const detectionResult = await javascript.executeJavaScript(
+ tabId,
+ conversationId,
+ detectionScript,
+ true,
+ true,
+ highlightDetectionTimeoutMs,
+ );
+
+ if (!detectionResult.success || !detectionResult.result?.value) {
+ throw new Error(detectionResult.error || 'Failed to detect elements');
+ }
+
+ const allElements = detectionResult.result.value.elements || [];
+ const detectedDocumentId =
+ typeof detectionResult.result.value.documentId === 'string'
+ ? detectionResult.result.value.documentId
+ : '';
+ const detectedViewport = detectionResult.result.value.viewport || {};
+ const layoutStability = detectionResult.result.value.layoutStability;
+ const highlightTraceStart = Date.now();
+ const detectedViewportWidth =
+ typeof detectedViewport.width === 'number' ? detectedViewport.width : 0;
+ const detectedViewportHeight =
+ typeof detectedViewport.height === 'number' ? detectedViewport.height : 0;
+ if (layoutStability) {
+ console.log(
+ `⏳ [${logLabel}] Readiness snapshot: ${JSON.stringify(layoutStability)}`,
+ );
+ }
+
+ const pageState: HighlightPageState = layoutStability?.state || 'ready';
+ const readinessReasons = Array.isArray(layoutStability?.reasons)
+ ? layoutStability.reasons
+ : [];
+
+ if (pageState === 'not_ready' && attempt < maxHighlightAttempts) {
+ const retryDelayMs = getHighlightReadinessRetryDelay(attempt);
+ console.warn(
+ `⚠️ [${logLabel}] Readiness state is not_ready (${readinessReasons.join(', ') || 'no reasons'}), retrying in ${retryDelayMs}ms (attempt ${attempt}/${maxHighlightAttempts})`,
+ );
+ await new Promise((resolve) => setTimeout(resolve, retryDelayMs));
+ continue;
+ }
+
+ const keywordFilterStart = Date.now();
+ const keywordFiltering = filterHighlightElementsByKeywords(
+ allElements,
+ keywords,
+ );
+ const keywordList = keywordFiltering.keywords;
+ const filteredElements = assignHashedElementIds(keywordFiltering.elements);
+
+ if (keywordList.length > 0) {
+ console.log(
+ `🔍 [${logLabel}] Keywords [${keywordList.join(', ')}] matched ${filteredElements.length} of ${allElements.length} elements`,
+ );
+ }
+ console.log(
+ `⏱️ [HighlightTrace] background keyword-filter ${Date.now() - keywordFilterStart}ms (keywords=${keywordList.length}, kept=${filteredElements.length}/${allElements.length})`,
+ );
+
+ let paginatedElements: InteractiveElement[];
+ let totalPages: number;
+ let currentPage = page;
+
+ if (keywordList.length > 0) {
+ paginatedElements = filteredElements;
+ totalPages = 1;
+ currentPage = 1;
+ console.log(
+ `🔍 [${logLabel}] Keywords [${keywordList.join(', ')}] matched ${paginatedElements.length} elements (no pagination)`,
+ );
+ } else {
+ const paginationSelectionStart = Date.now();
+ paginatedElements = selectCollisionFreePage(
+ filteredElements,
+ page,
+ detectedViewportWidth,
+ detectedViewportHeight,
+ );
+ const paginationSelectionMs = Date.now() - paginationSelectionStart;
+ const totalPagesStart = Date.now();
+ totalPages = calculateTotalPages(
+ filteredElements,
+ detectedViewportWidth,
+ detectedViewportHeight,
+ );
+ const totalPagesMs = Date.now() - totalPagesStart;
+ console.log(
+ `📄 [${logLabel}] Page ${page}/${totalPages}, showing ${paginatedElements.length} of ${filteredElements.length} elements`,
+ );
+ console.log(
+ `⏱️ [HighlightTrace] background pagination select=${paginationSelectionMs}ms totalPages=${totalPagesMs}ms (page=${page}, viewport=${detectedViewportWidth}x${detectedViewportHeight})`,
+ );
+ }
+
+ const screenshotStart = Date.now();
+ const screenshotResult = await captureScreenshot(
+ tabId,
+ conversationId,
+ true,
+ 90,
+ false,
+ 0,
+ TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS,
+ );
+
+ if (!screenshotResult?.success || !screenshotResult?.imageData) {
+ throw new Error(
+ `Failed to capture screenshot: ${screenshotResult?.success === false ? 'Screenshot command failed' : 'No image data returned'}`,
+ );
+ }
+ console.log(
+ `📸 [${logLabel}] Screenshot captured, size: ${screenshotResult.imageData.length} bytes`,
+ );
+ console.log(
+ `⏱️ [HighlightTrace] background screenshot ${Date.now() - screenshotStart}ms`,
+ );
+
+ const imageScale =
+ screenshotResult.metadata?.imageScale ||
+ screenshotResult.metadata?.devicePixelRatio ||
+ 1;
+ const viewportWidth = screenshotResult.metadata?.viewportWidth || 0;
+ const viewportHeight = screenshotResult.metadata?.viewportHeight || 0;
+ console.log(`📐 [${logLabel}] Image scale: ${imageScale}`);
+ console.log(
+ `📐 [${logLabel}] Viewport: ${viewportWidth}x${viewportHeight} CSS pixels`,
+ );
+ console.log(
+ `📐 [${logLabel}] Expected image size: ${Math.round(viewportWidth * imageScale)}x${Math.round(viewportHeight * imageScale)} device pixels`,
+ );
+
+ const consistencyCheckStart = Date.now();
+ const consistencyScript =
+ buildHighlightConsistencyScript(paginatedElements);
+ const consistencyResult = await javascript.executeJavaScript(
+ tabId,
+ conversationId,
+ consistencyScript,
+ true,
+ false,
+ 2000,
+ );
+ const currentConsistencySamples =
+ consistencyResult.success &&
+ consistencyResult.result?.value?.samples &&
+ Array.isArray(consistencyResult.result.value.samples)
+ ? consistencyResult.result.value.samples
+ : [];
+ const highlightConsistency = evaluateHighlightConsistency(
+ paginatedElements
+ .slice(0, HIGHLIGHT_CONSISTENCY_CONFIG.maxSampleSize)
+ .map((element) => ({
+ id: element.id,
+ bbox: element.bbox,
+ })),
+ currentConsistencySamples,
+ );
+ console.log(
+ `⏱️ [HighlightTrace] background consistency-check ${Date.now() - consistencyCheckStart}ms (checked=${highlightConsistency.checkedCount}, matched=${highlightConsistency.matchedCount}, missing=${highlightConsistency.missingCount}, shifted=${highlightConsistency.shiftedCount}, maxCenterShift=${highlightConsistency.maxCenterShift}, maxSizeDelta=${highlightConsistency.maxSizeDelta}, retry=${highlightConsistency.shouldRetry})`,
+ );
+ const repeatedDrift = isRepeatedHighlightDrift(
+ highlightConsistency,
+ previousConsistency,
+ );
+
+ if (
+ highlightConsistency.shouldRetry &&
+ attempt < maxHighlightAttempts &&
+ !repeatedDrift
+ ) {
+ previousConsistency = highlightConsistency;
+ console.warn(
+ `⚠️ [${logLabel}] Layout drift detected after screenshot, retrying (attempt ${attempt}/${maxHighlightAttempts})`,
+ );
+ continue;
+ }
+
+ if (highlightConsistency.shouldRetry) {
+ console.warn(
+ repeatedDrift
+ ? `⚠️ [${logLabel}] Layout drift repeated with near-identical metrics, returning latest screenshot`
+ : `⚠️ [${logLabel}] Layout drift still detected on final attempt, returning latest screenshot`,
+ );
+ }
+
+ const storedPages = buildStoredHighlightPages({
+ filteredElements,
+ totalPages,
+ viewportWidth: detectedViewportWidth,
+ viewportHeight: detectedViewportHeight,
+ keywordMode: keywordList.length > 0,
+ });
+ const displayOrderedElements = storedPages[currentPage - 1] ?? [];
+
+ const cacheStoreStart = Date.now();
+ const storedPage = elementCache.storeHighlightResult({
+ conversationId,
+ tabId,
+ documentId: detectedDocumentId,
+ elementType,
+ keywords: keywordList,
+ totalElements: filteredElements.length,
+ totalPages,
+ pages: storedPages,
+ page: currentPage,
+ });
+ console.log(
+ `⏱️ [HighlightTrace] background cache-store ${Date.now() - cacheStoreStart}ms (page=${storedPage.page}, count=${displayOrderedElements.length})`,
+ );
+
+ if (displayOrderedElements.length > 0) {
+ console.log(
+ `📍 [${logLabel}] First element bbox:`,
+ JSON.stringify(displayOrderedElements[0].bbox),
+ );
+ }
+
+ const drawHighlightsStart = Date.now();
+ const highlightedScreenshot = await drawHighlights(
+ screenshotResult.imageData,
+ storedPage.elements,
+ {
+ scale: imageScale,
+ viewportWidth,
+ viewportHeight,
+ },
+ );
+ console.log(
+ `⏱️ [HighlightTrace] background draw-highlights ${Date.now() - drawHighlightsStart}ms (elements=${storedPage.elements.length})`,
+ );
+
+ const compressStart = Date.now();
+ const compressedScreenshotResult = await compressScreenshotResult({
+ imageData: highlightedScreenshot,
+ dialog_auto_accepted: screenshotResult.dialog_auto_accepted,
+ dialog_auto_accepted_list: screenshotResult.dialog_auto_accepted_list,
+ });
+ console.log(
+ `⏱️ [HighlightTrace] background compress ${Date.now() - compressStart}ms`,
+ );
+ console.log(
+ `⏱️ [HighlightTrace] background total ${Date.now() - highlightTraceStart}ms`,
+ );
+
+ return {
+ elements: storedPage.elements,
+ totalElements: filteredElements.length,
+ totalPages,
+ page: currentPage,
+ pageState,
+ readinessReasons,
+ ...buildScreenshotPayload(compressedScreenshotResult),
+ };
+ }
+
+ throw new Error('Failed to produce a stable highlight screenshot');
+}
+
+async function captureDefaultHighlightedPageState(options: {
+ tabId: number;
+ conversationId: string;
+ logLabel: string;
+ preconditionWaitForRender?: number;
+ preconditionCaptureOptions?: ScreenshotCaptureOptions;
+ primeWithRawScreenshot?: boolean;
+ primeWaitForRender?: number;
+ primeCaptureOptions?: ScreenshotCaptureOptions;
+}): Promise {
+ const {
+ tabId,
+ conversationId,
+ logLabel,
+ preconditionWaitForRender,
+ preconditionCaptureOptions,
+ primeWithRawScreenshot = false,
+ primeWaitForRender,
+ primeCaptureOptions,
+ } = options;
+ const effectivePreconditionWaitForRender = preconditionWaitForRender ?? 350;
+ const effectivePreconditionCaptureOptions =
+ preconditionCaptureOptions ?? TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS;
+
+ if (primeWithRawScreenshot) {
+ await runRawScreenshotPrime({
+ tabId,
+ conversationId,
+ waitForRender: primeWaitForRender ?? effectivePreconditionWaitForRender,
+ captureOptions:
+ primeCaptureOptions ?? effectivePreconditionCaptureOptions,
+ logLabel: `${logLabel} Prime`,
+ });
+ }
+
+ try {
+ return await captureHighlightedPageState({
+ tabId,
+ conversationId,
+ elementType: 'any',
+ page: 1,
+ logLabel,
+ preconditionWaitForRender: effectivePreconditionWaitForRender,
+ preconditionCaptureOptions: effectivePreconditionCaptureOptions,
+ });
+ } catch (error) {
+ console.warn(
+ `⚠️ [${logLabel}] Default any/page 1 highlight failed, falling back to raw screenshot: ${error instanceof Error ? error.message : String(error)}`,
+ );
+ const screenshotResult = await captureScreenshot(
+ tabId,
+ conversationId,
+ true,
+ 90,
+ false,
+ effectivePreconditionWaitForRender,
+ effectivePreconditionCaptureOptions,
+ );
+ const compressedScreenshotResult =
+ await compressScreenshotResult(screenshotResult);
+ return buildScreenshotPayload(compressedScreenshotResult);
+ }
+}
+
function cleanupTabState(conversationId: string, tabId: number): void {
elementCache.invalidate(conversationId, tabId);
dialogManager.disableForTab(tabId);
@@ -796,16 +1252,12 @@ async function handleCommand(command: Command): Promise {
tabManager.setCurrentActiveTabId(conversationId, initResult.tabId);
// Capture screenshot after initialization
- const initScreenshotResult = await captureScreenshot(
- initResult.tabId,
+ const initPageState = await captureDefaultHighlightedPageState({
+ tabId: initResult.tabId,
conversationId,
- true,
- 90,
- false,
- 0,
- );
- const compressedInitScreenshotResult =
- await compressScreenshotResult(initScreenshotResult);
+ logLabel: 'Tab Init',
+ primeWithRawScreenshot: true,
+ });
return {
success: true,
@@ -816,19 +1268,7 @@ async function handleCommand(command: Command): Promise {
url: initResult.url,
conversationId: conversationId,
isManaged: true,
- screenshot: compressedInitScreenshotResult?.imageData,
- ...(compressedInitScreenshotResult?.dialog_auto_accepted
- ? {
- dialog_auto_accepted:
- compressedInitScreenshotResult.dialog_auto_accepted,
- }
- : {}),
- ...(compressedInitScreenshotResult?.dialog_auto_accepted_list
- ? {
- dialog_auto_accepted_list:
- compressedInitScreenshotResult.dialog_auto_accepted_list,
- }
- : {}),
+ ...initPageState,
},
timestamp: Date.now(),
};
@@ -848,18 +1288,14 @@ async function handleCommand(command: Command): Promise {
}
// Capture screenshot after opening
- const openScreenshotResult = openResult.tabId
- ? await captureScreenshot(
- openResult.tabId,
+ const openPageState = openResult.tabId
+ ? await captureDefaultHighlightedPageState({
+ tabId: openResult.tabId,
conversationId,
- true,
- 90,
- false,
- 0,
- )
- : null;
- const compressedOpenScreenshotResult =
- await compressScreenshotResult(openScreenshotResult);
+ logLabel: 'Tab Open',
+ primeWithRawScreenshot: true,
+ })
+ : {};
return {
success: true,
@@ -867,19 +1303,7 @@ async function handleCommand(command: Command): Promise {
data: {
...openResult,
conversationId: conversationId,
- screenshot: compressedOpenScreenshotResult?.imageData,
- ...(compressedOpenScreenshotResult?.dialog_auto_accepted
- ? {
- dialog_auto_accepted:
- compressedOpenScreenshotResult.dialog_auto_accepted,
- }
- : {}),
- ...(compressedOpenScreenshotResult?.dialog_auto_accepted_list
- ? {
- dialog_auto_accepted_list:
- compressedOpenScreenshotResult.dialog_auto_accepted_list,
- }
- : {}),
+ ...openPageState,
},
timestamp: Date.now(),
};
@@ -911,16 +1335,12 @@ async function handleCommand(command: Command): Promise {
tabManager.setCurrentActiveTabId(conversationId, command.tab_id);
// Capture screenshot after switching
- const switchScreenshotResult = await captureScreenshot(
- command.tab_id,
+ const switchPageState = await captureDefaultHighlightedPageState({
+ tabId: command.tab_id,
conversationId,
- true,
- 90,
- false,
- 0,
- );
- const compressedSwitchScreenshotResult =
- await compressScreenshotResult(switchScreenshotResult);
+ logLabel: 'Tab Switch',
+ primeWithRawScreenshot: true,
+ });
return {
success: true,
@@ -928,19 +1348,7 @@ async function handleCommand(command: Command): Promise {
data: {
...switchResult,
conversationId: conversationId,
- screenshot: compressedSwitchScreenshotResult?.imageData,
- ...(compressedSwitchScreenshotResult?.dialog_auto_accepted
- ? {
- dialog_auto_accepted:
- compressedSwitchScreenshotResult.dialog_auto_accepted,
- }
- : {}),
- ...(compressedSwitchScreenshotResult?.dialog_auto_accepted_list
- ? {
- dialog_auto_accepted_list:
- compressedSwitchScreenshotResult.dialog_auto_accepted_list,
- }
- : {}),
+ ...switchPageState,
},
timestamp: Date.now(),
};
@@ -969,16 +1377,12 @@ async function handleCommand(command: Command): Promise {
const refreshResult = await tabs.refreshTab(command.tab_id);
// Capture screenshot after refresh
- const refreshScreenshotResult = await captureScreenshot(
- command.tab_id,
+ const refreshPageState = await captureDefaultHighlightedPageState({
+ tabId: command.tab_id,
conversationId,
- true,
- 90,
- false,
- 0,
- );
- const compressedRefreshScreenshotResult =
- await compressScreenshotResult(refreshScreenshotResult);
+ logLabel: 'Tab Refresh',
+ primeWithRawScreenshot: true,
+ });
return {
success: true,
@@ -986,19 +1390,7 @@ async function handleCommand(command: Command): Promise {
data: {
...refreshResult,
conversationId: conversationId,
- screenshot: compressedRefreshScreenshotResult?.imageData,
- ...(compressedRefreshScreenshotResult?.dialog_auto_accepted
- ? {
- dialog_auto_accepted:
- compressedRefreshScreenshotResult.dialog_auto_accepted,
- }
- : {}),
- ...(compressedRefreshScreenshotResult?.dialog_auto_accepted_list
- ? {
- dialog_auto_accepted_list:
- compressedRefreshScreenshotResult.dialog_auto_accepted_list,
- }
- : {}),
+ ...refreshPageState,
},
timestamp: Date.now(),
};
@@ -1091,17 +1483,14 @@ async function handleCommand(command: Command): Promise {
? await tabs.goBack(targetTabId)
: await tabs.goForward(targetTabId);
- // Capture screenshot after navigation
- const screenshotResult = await captureScreenshot(
- targetTabId,
- conversationId,
- true,
- 90,
- false,
- 0,
- );
- const compressedNavigationScreenshotResult =
- await compressScreenshotResult(screenshotResult);
+ const navigationPageState =
+ await captureDefaultHighlightedPageState({
+ tabId: targetTabId,
+ conversationId,
+ logLabel:
+ command.action === 'back' ? 'Tab Back' : 'Tab Forward',
+ primeWithRawScreenshot: true,
+ });
return {
success: true,
@@ -1110,19 +1499,7 @@ async function handleCommand(command: Command): Promise {
...navigationResult,
tabId: targetTabId,
conversationId: conversationId,
- screenshot: compressedNavigationScreenshotResult?.imageData,
- ...(compressedNavigationScreenshotResult?.dialog_auto_accepted
- ? {
- dialog_auto_accepted:
- compressedNavigationScreenshotResult.dialog_auto_accepted,
- }
- : {}),
- ...(compressedNavigationScreenshotResult?.dialog_auto_accepted_list
- ? {
- dialog_auto_accepted_list:
- compressedNavigationScreenshotResult.dialog_auto_accepted_list,
- }
- : {}),
+ ...navigationPageState,
},
timestamp: Date.now(),
};
@@ -1292,36 +1669,18 @@ async function handleCommand(command: Command): Promise {
tabManager.setCurrentActiveTabId(conversationId, screenshotTabId);
}
- // Always take screenshot
- const jsScreenshotResult = await captureScreenshot(
- screenshotTabId,
+ const jsPageState = await captureDefaultHighlightedPageState({
+ tabId: screenshotTabId,
conversationId,
- true,
- 90,
- false,
- 0,
- );
- const compressedJsScreenshotResult =
- await compressScreenshotResult(jsScreenshotResult);
+ logLabel: 'JavaScript',
+ });
return {
success: true,
message: 'JavaScript executed successfully',
data: {
...jsResult,
- screenshot: compressedJsScreenshotResult?.imageData,
- ...(compressedJsScreenshotResult?.dialog_auto_accepted
- ? {
- dialog_auto_accepted:
- compressedJsScreenshotResult.dialog_auto_accepted,
- }
- : {}),
- ...(compressedJsScreenshotResult?.dialog_auto_accepted_list
- ? {
- dialog_auto_accepted_list:
- compressedJsScreenshotResult.dialog_auto_accepted_list,
- }
- : {}),
+ ...jsPageState,
},
timestamp: Date.now(),
duration: jsDuration,
@@ -1391,15 +1750,11 @@ async function handleCommand(command: Command): Promise {
console.log(`💬 [HandleDialog] Auto-accepting cascading alert`);
await dialogManager.autoAcceptDialog(activeTabId);
- // Take screenshot after auto-accept
- const screenshotResult = await captureScreenshot(
- activeTabId,
+ const dialogPageState = await captureDefaultHighlightedPageState({
+ tabId: activeTabId,
conversationId,
- true, // include_cursor
- 90, // quality
- false, // resizeToPreset
- 0, // waitForRender
- );
+ logLabel: 'HandleDialog',
+ });
return {
success: true,
@@ -1411,22 +1766,7 @@ async function handleCommand(command: Command): Promise {
message: handleResult.newDialog.message,
autoAccepted: true,
},
- screenshot: await compressIfNeeded(
- screenshotResult,
- getCompressionThreshold(),
- ),
- ...(screenshotResult?.dialog_auto_accepted
- ? {
- dialog_auto_accepted:
- screenshotResult.dialog_auto_accepted,
- }
- : {}),
- ...(screenshotResult?.dialog_auto_accepted_list
- ? {
- dialog_auto_accepted_list:
- screenshotResult.dialog_auto_accepted_list,
- }
- : {}),
+ ...dialogPageState,
},
timestamp: Date.now(),
};
@@ -1451,16 +1791,11 @@ async function handleCommand(command: Command): Promise {
};
}
- // No cascade - dialog handling complete
- // Take screenshot to show the result
- const screenshotResult = await captureScreenshot(
- activeTabId,
+ const dialogPageState = await captureDefaultHighlightedPageState({
+ tabId: activeTabId,
conversationId,
- true, // include_cursor
- 90, // quality
- false, // resizeToPreset
- 0, // waitForRender
- );
+ logLabel: 'HandleDialog',
+ });
console.log(
`✅ [HandleDialog] Dialog handling complete, screenshot captured`,
@@ -1471,21 +1806,7 @@ async function handleCommand(command: Command): Promise {
message: `Dialog handled successfully: ${handleResult.previousDialog.type} ${action}ed`,
data: {
handledDialog: handleResult.previousDialog,
- screenshot: await compressIfNeeded(
- screenshotResult,
- getCompressionThreshold(),
- ),
- ...(screenshotResult?.dialog_auto_accepted
- ? {
- dialog_auto_accepted: screenshotResult.dialog_auto_accepted,
- }
- : {}),
- ...(screenshotResult?.dialog_auto_accepted_list
- ? {
- dialog_auto_accepted_list:
- screenshotResult.dialog_auto_accepted_list,
- }
- : {}),
+ ...dialogPageState,
},
timestamp: Date.now(),
};
@@ -1544,334 +1865,28 @@ async function handleCommand(command: Command): Promise {
const keywords = command.keywords;
const elementType = command.element_type || 'any';
const page = command.page || 1;
-
- const detectionScript = buildHighlightDetectionScript({
- elementType,
- });
-
- await runHighlightPreconditionWarmup({
- tabId: activeTabId,
- conversationId,
- elementType,
- page,
- });
-
- const maxHighlightAttempts = 3;
- const highlightDetectionTimeoutMs = 18000;
- let previousConsistency: HighlightConsistencyResult | null = null;
-
- for (let attempt = 1; attempt <= maxHighlightAttempts; attempt++) {
- console.log(
- `🔁 [HighlightElements] Attempt ${attempt}/${maxHighlightAttempts}`,
- );
-
- // Execute detection script in page context
- const detectionResult = await javascript.executeJavaScript(
- activeTabId,
- conversationId,
- detectionScript,
- true, // returnByValue
- true, // awaitPromise
- highlightDetectionTimeoutMs, // timeout
- );
-
- if (!detectionResult.success || !detectionResult.result?.value) {
- return {
- success: false,
- error: detectionResult.error || 'Failed to detect elements',
- timestamp: Date.now(),
- };
- }
-
- const allElements = detectionResult.result.value.elements || [];
- const detectedDocumentId =
- typeof detectionResult.result.value.documentId === 'string'
- ? detectionResult.result.value.documentId
- : '';
- const detectedViewport = detectionResult.result.value.viewport || {};
- const layoutStability = detectionResult.result.value.layoutStability;
- const highlightTraceStart = Date.now();
- const detectedViewportWidth =
- typeof detectedViewport.width === 'number'
- ? detectedViewport.width
- : 0;
- const detectedViewportHeight =
- typeof detectedViewport.height === 'number'
- ? detectedViewport.height
- : 0;
- if (layoutStability) {
- console.log(
- `⏳ [HighlightElements] Readiness snapshot: ${JSON.stringify(layoutStability)}`,
- );
- }
-
- // Do not wait inside the page for "stability". Hidden/background tabs
- // can throttle page timers hard enough that page-side polling becomes
- // the dominant source of highlight timeouts. Instead, classify the
- // current snapshot and do at most a couple of short background-side
- // retries when the viewport still looks like a loading/skeleton state.
- const pageState: HighlightPageState =
- layoutStability?.state || 'ready';
- const readinessReasons = Array.isArray(layoutStability?.reasons)
- ? layoutStability.reasons
- : [];
-
- if (pageState === 'not_ready' && attempt < maxHighlightAttempts) {
- const retryDelayMs = getHighlightReadinessRetryDelay(attempt);
- console.warn(
- `⚠️ [HighlightElements] Readiness state is not_ready (${readinessReasons.join(', ') || 'no reasons'}), retrying in ${retryDelayMs}ms (attempt ${attempt}/${maxHighlightAttempts})`,
- );
- await new Promise((resolve) => setTimeout(resolve, retryDelayMs));
- continue;
- }
-
- const keywordFilterStart = Date.now();
- const keywordFiltering = filterHighlightElementsByKeywords(
- allElements,
- keywords,
- );
- const keywordList = keywordFiltering.keywords;
- const filteredElements = keywordFiltering.elements;
-
- if (keywordList.length > 0) {
- console.log(
- `🔍 [HighlightElements] Keywords [${keywordList.join(', ')}] matched ${filteredElements.length} of ${allElements.length} elements`,
- );
- }
- console.log(
- `⏱️ [HighlightTrace] background keyword-filter ${Date.now() - keywordFilterStart}ms (keywords=${keywordList.length}, kept=${filteredElements.length}/${allElements.length})`,
- );
-
- let paginatedElements: InteractiveElement[];
- let totalPages: number;
- let currentPage = page;
-
- if (keywordList.length > 0) {
- // Keyword mode: return all matching elements, no pagination.
- // Assign temporary numeric IDs so the consistency check can
- // correlate samples before the final display-order renumbering.
- paginatedElements = assignSequentialElementIds(filteredElements);
- totalPages = 1;
- currentPage = 1;
- console.log(
- `🔍 [HighlightElements] Keywords [${keywordList.join(', ')}] matched ${paginatedElements.length} elements (no pagination)`,
- );
- } else {
- // Normal collision-aware pagination
- const paginationSelectionStart = Date.now();
- paginatedElements = selectCollisionFreePage(
- filteredElements,
- page,
- detectedViewportWidth,
- detectedViewportHeight,
- );
- const paginationSelectionMs = Date.now() - paginationSelectionStart;
- const totalPagesStart = Date.now();
- totalPages = calculateTotalPages(
- filteredElements,
- detectedViewportWidth,
- detectedViewportHeight,
- );
- const totalPagesMs = Date.now() - totalPagesStart;
- console.log(
- `📄 [HighlightElements] Page ${page}/${totalPages}, showing ${paginatedElements.length} of ${filteredElements.length} elements`,
- );
- console.log(
- `⏱️ [HighlightTrace] background pagination select=${paginationSelectionMs}ms totalPages=${totalPagesMs}ms (page=${page}, viewport=${detectedViewportWidth}x${detectedViewportHeight})`,
- );
- }
-
- // Capture screenshot
- const screenshotStart = Date.now();
- const screenshotResult = await captureScreenshot(
- activeTabId,
- conversationId,
- true,
- 90,
- false,
- 0,
- HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS,
- );
-
- // Validate screenshot result
- if (!screenshotResult?.success || !screenshotResult?.imageData) {
- return {
- success: false,
- error: `Failed to capture screenshot: ${screenshotResult?.success === false ? 'Screenshot command failed' : 'No image data returned'}`,
- timestamp: Date.now(),
- };
- }
- console.log(
- `📸 [HighlightElements] Screenshot captured, size: ${screenshotResult.imageData.length} bytes`,
- );
- console.log(
- `⏱️ [HighlightTrace] background screenshot ${Date.now() - screenshotStart}ms`,
- );
-
- // Get device pixel ratio for coordinate scaling
- const imageScale =
- screenshotResult.metadata?.imageScale ||
- screenshotResult.metadata?.devicePixelRatio ||
- 1;
- const viewportWidth = screenshotResult.metadata?.viewportWidth || 0;
- const viewportHeight = screenshotResult.metadata?.viewportHeight || 0;
- console.log(`📐 [HighlightElements] Image scale: ${imageScale}`);
- console.log(
- `📐 [HighlightElements] Viewport: ${viewportWidth}x${viewportHeight} CSS pixels`,
- );
- console.log(
- `📐 [HighlightElements] Expected image size: ${Math.round(viewportWidth * imageScale)}x${Math.round(viewportHeight * imageScale)} device pixels`,
- );
-
- const consistencyCheckStart = Date.now();
- const consistencyScript =
- buildHighlightConsistencyScript(paginatedElements);
- const consistencyResult = await javascript.executeJavaScript(
- activeTabId,
- conversationId,
- consistencyScript,
- true,
- false,
- 2000,
- );
- const currentConsistencySamples =
- consistencyResult.success &&
- consistencyResult.result?.value?.samples &&
- Array.isArray(consistencyResult.result.value.samples)
- ? consistencyResult.result.value.samples
- : [];
- const highlightConsistency = evaluateHighlightConsistency(
- paginatedElements
- .slice(0, HIGHLIGHT_CONSISTENCY_CONFIG.maxSampleSize)
- .map((element) => ({
- id: element.id,
- bbox: element.bbox,
- })),
- currentConsistencySamples,
- );
- console.log(
- `⏱️ [HighlightTrace] background consistency-check ${Date.now() - consistencyCheckStart}ms (checked=${highlightConsistency.checkedCount}, matched=${highlightConsistency.matchedCount}, missing=${highlightConsistency.missingCount}, shifted=${highlightConsistency.shiftedCount}, maxCenterShift=${highlightConsistency.maxCenterShift}, maxSizeDelta=${highlightConsistency.maxSizeDelta}, retry=${highlightConsistency.shouldRetry})`,
- );
- const repeatedDrift = isRepeatedHighlightDrift(
- highlightConsistency,
- previousConsistency,
- );
-
- if (
- highlightConsistency.shouldRetry &&
- attempt < maxHighlightAttempts &&
- !repeatedDrift
- ) {
- previousConsistency = highlightConsistency;
- console.warn(
- `⚠️ [HighlightElements] Layout drift detected after screenshot, retrying (attempt ${attempt}/${maxHighlightAttempts})`,
- );
- continue;
- }
-
- if (highlightConsistency.shouldRetry) {
- console.warn(
- repeatedDrift
- ? `⚠️ [HighlightElements] Layout drift repeated with near-identical metrics, returning latest screenshot`
- : `⚠️ [HighlightElements] Layout drift still detected on final attempt, returning latest screenshot`,
- );
- }
-
- // Preserve the original highlight pipeline order for detection,
- // pagination, and consistency checks. Only sort and renumber at the
- // rendering boundary so the screenshot/response stay intuitive
- // without changing the stability gate.
- const storedPages = buildStoredHighlightPages({
- filteredElements,
- totalPages,
- viewportWidth: detectedViewportWidth,
- viewportHeight: detectedViewportHeight,
- keywordMode: keywordList.length > 0,
- });
- const displayOrderedElements = storedPages[currentPage - 1] ?? [];
-
- const cacheStoreStart = Date.now();
- const storedSnapshot = elementCache.storeSnapshot({
- conversationId,
+ try {
+ const highlightedPage = await captureHighlightedPageState({
tabId: activeTabId,
- documentId: detectedDocumentId,
+ conversationId,
elementType,
- keywords: keywordList,
- totalElements: filteredElements.length,
- pages: storedPages,
- page: currentPage,
+ page,
+ keywords,
+ logLabel: 'HighlightElements',
});
- console.log(
- `⏱️ [HighlightTrace] background cache-store ${Date.now() - cacheStoreStart}ms (snapshot=${storedSnapshot.snapshotId}, count=${displayOrderedElements.length})`,
- );
-
- // Log first few element bboxes for debugging
- if (displayOrderedElements.length > 0) {
- console.log(
- `📍 [HighlightElements] First element bbox:`,
- JSON.stringify(displayOrderedElements[0].bbox),
- );
- }
-
- // Draw highlights on screenshot (scale coordinates by DPR)
- const drawHighlightsStart = Date.now();
- const highlightedScreenshot = await drawHighlights(
- screenshotResult.imageData,
- storedSnapshot.elements,
- {
- scale: imageScale,
- viewportWidth,
- viewportHeight,
- },
- );
- console.log(
- `⏱️ [HighlightTrace] background draw-highlights ${Date.now() - drawHighlightsStart}ms (elements=${storedSnapshot.elements.length})`,
- );
-
- const compressStart = Date.now();
- const compressedScreenshot = await compressIfNeeded(
- highlightedScreenshot,
- getCompressionThreshold(),
- );
- console.log(
- `⏱️ [HighlightTrace] background compress ${Date.now() - compressStart}ms`,
- );
- console.log(
- `⏱️ [HighlightTrace] background total ${Date.now() - highlightTraceStart}ms`,
- );
return {
success: true,
- data: {
- highlight_snapshot_id: storedSnapshot.snapshotId,
- elements: storedSnapshot.elements,
- totalElements: filteredElements.length,
- totalPages: totalPages,
- page: currentPage,
- pageState,
- readinessReasons,
- screenshot: compressedScreenshot,
- ...(screenshotResult?.dialog_auto_accepted
- ? {
- dialog_auto_accepted: screenshotResult.dialog_auto_accepted,
- }
- : {}),
- ...(screenshotResult?.dialog_auto_accepted_list
- ? {
- dialog_auto_accepted_list:
- screenshotResult.dialog_auto_accepted_list,
- }
- : {}),
- },
+ data: highlightedPage,
+ timestamp: Date.now(),
+ };
+ } catch (error) {
+ return {
+ success: false,
+ error: error instanceof Error ? error.message : String(error),
timestamp: Date.now(),
};
}
-
- return {
- success: false,
- error: 'Failed to produce a stable highlight screenshot',
- timestamp: Date.now(),
- };
}
case 'click_element': {
@@ -1883,7 +1898,6 @@ async function handleCommand(command: Command): Promise {
const clickResult = await performElementClick(
command.conversation_id,
- command.highlight_snapshot_id,
command.element_id,
clickTabId,
);
@@ -1910,35 +1924,17 @@ async function handleCommand(command: Command): Promise {
);
}
- const clickScreenshotResult = await captureScreenshot(
- screenshotTabId,
- command.conversation_id,
- true,
- 90,
- false,
- 0,
- );
- const compressedClickScreenshotResult = await compressScreenshotResult(
- clickScreenshotResult,
- );
+ const clickPageState = await captureDefaultHighlightedPageState({
+ tabId: screenshotTabId,
+ conversationId: command.conversation_id,
+ logLabel: 'ClickElement',
+ });
return {
success: clickResult.success,
data: {
...clickResult,
- screenshot: compressedClickScreenshotResult?.imageData,
- ...(compressedClickScreenshotResult?.dialog_auto_accepted
- ? {
- dialog_auto_accepted:
- compressedClickScreenshotResult.dialog_auto_accepted,
- }
- : {}),
- ...(compressedClickScreenshotResult?.dialog_auto_accepted_list
- ? {
- dialog_auto_accepted_list:
- compressedClickScreenshotResult.dialog_auto_accepted_list,
- }
- : {}),
+ ...clickPageState,
},
error: clickResult.error,
timestamp: Date.now(),
@@ -1954,39 +1950,20 @@ async function handleCommand(command: Command): Promise {
const hoverResult = await performElementHover(
command.conversation_id,
- command.highlight_snapshot_id,
command.element_id,
hoverTabId,
);
- const hoverScreenshotResult = await captureScreenshot(
- hoverTabId,
- command.conversation_id,
- true,
- 90,
- false,
- 0,
- );
- const compressedHoverScreenshotResult = await compressScreenshotResult(
- hoverScreenshotResult,
- );
+ const hoverPageState = await captureDefaultHighlightedPageState({
+ tabId: hoverTabId,
+ conversationId: command.conversation_id,
+ logLabel: 'HoverElement',
+ });
return {
success: hoverResult.success,
data: {
...hoverResult,
- screenshot: compressedHoverScreenshotResult?.imageData,
- ...(compressedHoverScreenshotResult?.dialog_auto_accepted
- ? {
- dialog_auto_accepted:
- compressedHoverScreenshotResult.dialog_auto_accepted,
- }
- : {}),
- ...(compressedHoverScreenshotResult?.dialog_auto_accepted_list
- ? {
- dialog_auto_accepted_list:
- compressedHoverScreenshotResult.dialog_auto_accepted_list,
- }
- : {}),
+ ...hoverPageState,
},
error: hoverResult.error,
timestamp: Date.now(),
@@ -2003,41 +1980,22 @@ async function handleCommand(command: Command): Promise {
// element_id is optional - if not provided, scrolls the entire page
const scrollResult = await performElementScroll(
command.conversation_id,
- command.highlight_snapshot_id,
command.element_id,
command.direction || 'down',
scrollTabId,
command.scroll_amount || 0.5,
);
- const scrollScreenshotResult = await captureScreenshot(
- scrollTabId,
- command.conversation_id,
- true,
- 90,
- false,
- 0,
- );
- const compressedScrollScreenshotResult = await compressScreenshotResult(
- scrollScreenshotResult,
- );
+ const scrollPageState = await captureDefaultHighlightedPageState({
+ tabId: scrollTabId,
+ conversationId: command.conversation_id,
+ logLabel: 'ScrollElement',
+ });
return {
success: scrollResult.success,
data: {
...scrollResult,
- screenshot: compressedScrollScreenshotResult?.imageData,
- ...(compressedScrollScreenshotResult?.dialog_auto_accepted
- ? {
- dialog_auto_accepted:
- compressedScrollScreenshotResult.dialog_auto_accepted,
- }
- : {}),
- ...(compressedScrollScreenshotResult?.dialog_auto_accepted_list
- ? {
- dialog_auto_accepted_list:
- compressedScrollScreenshotResult.dialog_auto_accepted_list,
- }
- : {}),
+ ...scrollPageState,
},
error: scrollResult.error,
timestamp: Date.now(),
@@ -2053,42 +2011,24 @@ async function handleCommand(command: Command): Promise {
const swipeResult = await performElementSwipe(
command.conversation_id,
- command.highlight_snapshot_id,
command.element_id,
command.direction || 'next',
swipeTabId,
command.swipe_count || 1,
);
- const swipeScreenshotResult = await captureScreenshot(
- swipeTabId,
- command.conversation_id,
- true,
- 90,
- false,
- 900,
- TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS,
- );
- const compressedSwipeScreenshotResult = await compressScreenshotResult(
- swipeScreenshotResult,
- );
+ const swipePageState = await captureDefaultHighlightedPageState({
+ tabId: swipeTabId,
+ conversationId: command.conversation_id,
+ logLabel: 'SwipeElement',
+ preconditionWaitForRender: 900,
+ preconditionCaptureOptions: TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS,
+ });
return {
success: swipeResult.success,
data: {
...swipeResult,
- screenshot: compressedSwipeScreenshotResult?.imageData,
- ...(compressedSwipeScreenshotResult?.dialog_auto_accepted
- ? {
- dialog_auto_accepted:
- compressedSwipeScreenshotResult.dialog_auto_accepted,
- }
- : {}),
- ...(compressedSwipeScreenshotResult?.dialog_auto_accepted_list
- ? {
- dialog_auto_accepted_list:
- compressedSwipeScreenshotResult.dialog_auto_accepted_list,
- }
- : {}),
+ ...swipePageState,
},
error: swipeResult.error,
timestamp: Date.now(),
@@ -2104,40 +2044,21 @@ async function handleCommand(command: Command): Promise {
const inputResult = await performKeyboardInput(
command.conversation_id,
- command.highlight_snapshot_id,
command.element_id,
command.text,
inputTabId,
);
- const inputScreenshotResult = await captureScreenshot(
- inputTabId,
- command.conversation_id,
- true,
- 90,
- false,
- 0,
- );
- const compressedInputScreenshotResult = await compressScreenshotResult(
- inputScreenshotResult,
- );
+ const inputPageState = await captureDefaultHighlightedPageState({
+ tabId: inputTabId,
+ conversationId: command.conversation_id,
+ logLabel: 'KeyboardInput',
+ });
return {
success: inputResult.success,
data: {
...inputResult,
- screenshot: compressedInputScreenshotResult?.imageData,
- ...(compressedInputScreenshotResult?.dialog_auto_accepted
- ? {
- dialog_auto_accepted:
- compressedInputScreenshotResult.dialog_auto_accepted,
- }
- : {}),
- ...(compressedInputScreenshotResult?.dialog_auto_accepted_list
- ? {
- dialog_auto_accepted_list:
- compressedInputScreenshotResult.dialog_auto_accepted_list,
- }
- : {}),
+ ...inputPageState,
},
error: inputResult.error,
timestamp: Date.now(),
@@ -2153,40 +2074,21 @@ async function handleCommand(command: Command): Promise {
const selectResult = await performElementSelect(
command.conversation_id,
- command.highlight_snapshot_id,
command.element_id,
selectTabId,
command.value,
);
- const selectScreenshotResult = await captureScreenshot(
- selectTabId,
- command.conversation_id,
- true,
- 90,
- false,
- 0,
- );
- const compressedSelectScreenshotResult = await compressScreenshotResult(
- selectScreenshotResult,
- );
+ const selectPageState = await captureDefaultHighlightedPageState({
+ tabId: selectTabId,
+ conversationId: command.conversation_id,
+ logLabel: 'SelectElement',
+ });
return {
success: selectResult.success,
data: {
...selectResult,
- screenshot: compressedSelectScreenshotResult?.imageData,
- ...(compressedSelectScreenshotResult?.dialog_auto_accepted
- ? {
- dialog_auto_accepted:
- compressedSelectScreenshotResult.dialog_auto_accepted,
- }
- : {}),
- ...(compressedSelectScreenshotResult?.dialog_auto_accepted_list
- ? {
- dialog_auto_accepted_list:
- compressedSelectScreenshotResult.dialog_auto_accepted_list,
- }
- : {}),
+ ...selectPageState,
},
error: selectResult.error,
timestamp: Date.now(),
@@ -2198,16 +2100,10 @@ async function handleCommand(command: Command): Promise {
throw new Error('conversation_id required for get_element_html');
const conversationId = command.conversation_id;
const elementId = command.element_id;
- const highlightSnapshotId = command.highlight_snapshot_id;
if (!elementId) {
throw new Error('element_id is required for get_element_html');
}
- if (highlightSnapshotId === undefined || highlightSnapshotId === null) {
- throw new Error(
- 'highlight_snapshot_id is required for get_element_html',
- );
- }
// Get current active tab for this conversation
const activeTabId = tabManager.getCurrentActiveTabId(conversationId);
@@ -2221,20 +2117,24 @@ async function handleCommand(command: Command): Promise {
const element = elementCache.getElementById(
conversationId,
activeTabId,
- highlightSnapshotId,
elementId,
);
if (!element) {
console.warn(
- `⚠️ [GetElementHtml] Element ${elementId} not found in cache for conversation ${conversationId}, tab ${activeTabId}, snapshot ${highlightSnapshotId}`,
+ `⚠️ [GetElementHtml] Element ${elementId} not found in cache for conversation ${conversationId}, tab ${activeTabId}`,
);
return {
success: false,
- error: `Element ${elementId} not found in cache for highlight snapshot ${highlightSnapshotId}. The snapshot may have expired or the page may have changed. Try highlight_elements again.`,
+ error: buildElementCacheMissMessage({
+ conversationId,
+ tabId: activeTabId,
+ elementId,
+ refreshHint:
+ 'The highlight cache may have expired or the page may have changed. Try highlight_elements again.',
+ }),
data: {
element_id: elementId,
- highlight_snapshot_id: highlightSnapshotId,
html: null,
},
timestamp: Date.now(),
@@ -2249,10 +2149,16 @@ async function handleCommand(command: Command): Promise {
return {
success: true,
- message: `Retrieved HTML for element ${elementId}`,
+ message:
+ element.elementIdCorrected &&
+ element.resolvedElementId !== elementId
+ ? `Retrieved HTML for element ${element.resolvedElementId} (matched from requested ${elementId})`
+ : `Retrieved HTML for element ${element.resolvedElementId}`,
data: {
- element_id: elementId,
- highlight_snapshot_id: highlightSnapshotId,
+ element_id: element.resolvedElementId,
+ requested_element_id: elementId,
+ resolved_element_id: element.resolvedElementId,
+ element_id_corrected: element.elementIdCorrected,
html: html,
tagName: element.element.tagName,
type: element.element.type,
@@ -2269,27 +2175,24 @@ async function handleCommand(command: Command): Promise {
}
const conversationId = command.conversation_id;
const activeTabId = tabManager.getCurrentActiveTabId(conversationId);
- const highlightSnapshotId = command.highlight_snapshot_id;
if (!activeTabId) {
throw new Error(`No active tab for conversation ${conversationId}`);
}
- if (highlightSnapshotId === undefined || highlightSnapshotId === null) {
- throw new Error(
- 'highlight_snapshot_id is required for highlight_single_element command',
- );
- }
// Get element from cache
const element = elementCache.getElementById(
conversationId,
activeTabId,
- highlightSnapshotId,
command.element_id,
);
if (!element) {
return {
success: false,
- error: `Element ${command.element_id} not found in cache for highlight snapshot ${highlightSnapshotId}. Call highlight_elements() again.`,
+ error: buildElementCacheMissMessage({
+ conversationId,
+ tabId: activeTabId,
+ elementId: command.element_id,
+ }),
timestamp: Date.now(),
};
}
@@ -2424,7 +2327,7 @@ async function handleCommand(command: Command): Promise {
ok: false,
stale: true,
error:
- "Highlight snapshot ${highlightSnapshotId} is stale because the document changed. Call highlight_elements() again."
+ "The highlighted element is stale because the document changed. Call highlight_elements() again."
};
}
if (!el) {
@@ -2432,7 +2335,7 @@ async function handleCommand(command: Command): Promise {
ok: false,
stale: true,
error:
- "Element not found in DOM for this highlight snapshot. Call highlight_elements() again."
+ "Element not found in DOM for the cached highlight result. Call highlight_elements() again."
};
}
const currentFingerprint = getElementFingerprint(el);
@@ -2441,7 +2344,7 @@ async function handleCommand(command: Command): Promise {
ok: false,
stale: true,
error:
- "Highlight snapshot is stale because the target element identity changed. Call highlight_elements() again."
+ "The cached highlight result is stale because the target element identity changed. Call highlight_elements() again."
};
}
const rect = el.getBoundingClientRect();
@@ -2488,7 +2391,7 @@ async function handleCommand(command: Command): Promise {
success: false,
error:
bboxResult.result.value.error ||
- `Element ${command.element_id} is stale for highlight snapshot ${highlightSnapshotId}. Call highlight_elements() again.`,
+ `Element ${command.element_id} is stale. Call highlight_elements() again.`,
timestamp: Date.now(),
};
} else {
@@ -2554,7 +2457,6 @@ async function handleCommand(command: Command): Promise {
`Element ${element.element.id} is not visible in the current viewport. ${scrollHint}`.trim(),
data: {
elementId: element.element.id,
- highlight_snapshot_id: highlightSnapshotId,
bbox: freshBbox,
viewportWidth,
viewportHeight,
@@ -2574,6 +2476,7 @@ async function handleCommand(command: Command): Promise {
screenshotResult.imageData,
elementWithFreshBbox,
{
+ intendedAction: command.intended_action,
scale:
screenshotResult.metadata?.imageScale ||
screenshotResult.metadata?.devicePixelRatio ||
@@ -2591,8 +2494,10 @@ async function handleCommand(command: Command): Promise {
highlightedScreenshot,
getCompressionThreshold(),
),
- elementId: command.element_id,
- highlight_snapshot_id: highlightSnapshotId,
+ elementId: element.resolvedElementId,
+ requestedElementId: command.element_id,
+ resolvedElementId: element.resolvedElementId,
+ elementIdCorrected: element.elementIdCorrected,
...(screenshotResult?.dialog_auto_accepted
? { dialog_auto_accepted: screenshotResult.dialog_auto_accepted }
: {}),
diff --git a/extension/src/commands/__tests__/element-id.test.ts b/extension/src/commands/__tests__/element-id.test.ts
index 12ad0cb..edcaeed 100644
--- a/extension/src/commands/__tests__/element-id.test.ts
+++ b/extension/src/commands/__tests__/element-id.test.ts
@@ -1,8 +1,21 @@
import { describe, expect, test } from 'bun:test';
import type { InteractiveElement } from '../../types';
-import { elementCache } from '../element-cache';
-import { assignSequentialElementIds } from '../element-id';
+import { buildElementCacheMissMessage, elementCache } from '../element-cache';
+import {
+ ELEMENT_ID_CHARSET,
+ ELEMENT_ID_LENGTH,
+ assignHashedElementIds,
+ generateShortHash,
+ normalizeVisualElementIdInput,
+} from '../element-id';
+
+function usesAllowedElementIdChars(id: string): boolean {
+ return (
+ id.length === ELEMENT_ID_LENGTH &&
+ [...id].every((char) => ELEMENT_ID_CHARSET.includes(char))
+ );
+}
function createElement(id: string, selector: string): InteractiveElement {
return {
@@ -10,6 +23,7 @@ function createElement(id: string, selector: string): InteractiveElement {
type: 'clickable',
tagName: 'button',
selector,
+ html: ``,
bbox: { x: 0, y: 0, width: 10, height: 10 },
isVisible: true,
isInViewport: true,
@@ -17,59 +31,175 @@ function createElement(id: string, selector: string): InteractiveElement {
}
describe('element-id', () => {
- test('assigns page-local numeric ids in order', () => {
- const result = assignSequentialElementIds([
- createElement('old-a', '#a'),
- createElement('old-b', '#b'),
- createElement('old-c', '#c'),
+ test('generates fixed-length visual-safe hashes', () => {
+ const hash = generateShortHash('#checkout', '');
+
+ expect(hash).toHaveLength(ELEMENT_ID_LENGTH);
+ expect(usesAllowedElementIdChars(hash)).toBe(true);
+ });
+
+ test('assigns stable hash ids based on selector and html', () => {
+ const result = assignHashedElementIds([
+ createElement('old-a', '#checkout'),
+ createElement('old-b', '#email'),
+ createElement('old-c', '#submit'),
]);
- expect(result.map((element) => element.id)).toEqual(['1', '2', '3']);
- expect(result.map((element) => element.selector)).toEqual([
- '#a',
- '#b',
- '#c',
+ expect(
+ result.every((element) => usesAllowedElementIdChars(element.id)),
+ ).toBe(true);
+ expect(new Set(result.map((element) => element.id)).size).toBe(
+ result.length,
+ );
+
+ const secondPass = assignHashedElementIds([
+ createElement('different-a', '#checkout'),
+ createElement('different-b', '#email'),
+ createElement('different-c', '#submit'),
]);
+
+ expect(secondPass.map((element) => element.id)).toEqual(
+ result.map((element) => element.id),
+ );
+ });
+
+ test('keeps ids stable regardless of input array order', () => {
+ const original = [
+ createElement('first', '#alpha'),
+ createElement('second', '#beta'),
+ createElement('third', '#gamma'),
+ ];
+ const reversed = [...original].reverse();
+
+ const originalAssigned = assignHashedElementIds(original);
+ const reversedAssigned = assignHashedElementIds(reversed);
+
+ expect(
+ Object.fromEntries(
+ originalAssigned.map((element) => [element.selector, element.id]),
+ ),
+ ).toEqual(
+ Object.fromEntries(
+ reversedAssigned.map((element) => [element.selector, element.id]),
+ ),
+ );
});
test('does not mutate the caller-owned element objects', () => {
const original = [createElement('keep-me', '#a')];
- const result = assignSequentialElementIds(original);
+ const result = assignHashedElementIds(original);
expect(original[0].id).toBe('keep-me');
- expect(result[0].id).toBe('1');
+ expect(usesAllowedElementIdChars(result[0].id)).toBe(true);
expect(result[0]).not.toBe(original[0]);
});
+
+ test('normalizes visually ambiguous 3-character element IDs only', () => {
+ expect(normalizeVisualElementIdInput('D02')).toBe('DO2');
+ expect(normalizeVisualElementIdInput(' d o 2 ')).toBe('DO2');
+ expect(normalizeVisualElementIdInput('id-10')).toBe('id-10');
+ });
});
-describe('element-cache highlight snapshots', () => {
- test('stores a page-scoped snapshot and resolves element IDs within that snapshot', () => {
+describe('element-cache document cache', () => {
+ test('stores highlight pages and resolves element IDs from the current document cache', () => {
elementCache.clearAll();
+ const assignedPages = [
+ assignHashedElementIds([createElement('', '#page-1')]),
+ assignHashedElementIds([createElement('', '#page-2')]),
+ ];
- const snapshot = elementCache.storeSnapshot({
+ const storedPage = elementCache.storeHighlightResult({
conversationId: 'conv-1',
tabId: 101,
documentId: 'doc-1',
elementType: 'any',
totalElements: 2,
- pages: [[createElement('1', '#page-1')], [createElement('1', '#page-2')]],
+ totalPages: 2,
+ pages: assignedPages,
page: 1,
});
- expect(snapshot.snapshotId).toBe(1);
- expect(snapshot.page).toBe(1);
- expect(snapshot.elements.map((element) => element.selector)).toEqual([
+ expect(storedPage.documentId).toBe('doc-1');
+ expect(storedPage.page).toBe(1);
+ expect(storedPage.elements.map((element) => element.selector)).toEqual([
'#page-1',
]);
+ expect(storedPage.elements[0]?.id).toBe(assignedPages[0]?.[0]?.id);
+ const storedElementId = storedPage.elements[0]?.id;
+ expect(storedElementId).toBeDefined();
- const lookup = elementCache.getElementById(
- 'conv-1',
- 101,
- snapshot.snapshotId,
- '1',
- );
+ const lookup = elementCache.getElementById('conv-1', 101, storedElementId!);
expect(lookup?.element.selector).toBe('#page-1');
expect(lookup?.documentId).toBe('doc-1');
});
+
+ test('resolves visually ambiguous requested IDs to the cached visual-safe ID', () => {
+ elementCache.clearAll();
+ const page = [createElement('DO2', '#page-corrected')];
+
+ elementCache.storeHighlightResult({
+ conversationId: 'conv-visual-safe',
+ tabId: 202,
+ documentId: 'doc-visual-safe',
+ elementType: 'any',
+ totalElements: 1,
+ totalPages: 1,
+ pages: [page],
+ page: 1,
+ });
+
+ const lookup = elementCache.getElementById('conv-visual-safe', 202, 'D02');
+
+ expect(lookup?.requestedElementId).toBe('D02');
+ expect(lookup?.resolvedElementId).toBe('DO2');
+ expect(lookup?.elementIdCorrected).toBe(true);
+ expect(lookup?.element.selector).toBe('#page-corrected');
+ });
+
+ test('suggests close visual-safe ids when the requested id is missing', () => {
+ elementCache.clearAll();
+ const page = [
+ createElement('MQK', '#heart-button'),
+ createElement('M9H', '#share-button'),
+ createElement('AB1', '#other-button'),
+ ];
+
+ elementCache.storeHighlightResult({
+ conversationId: 'conv-suggestions',
+ tabId: 303,
+ documentId: 'doc-suggestions',
+ elementType: 'any',
+ totalElements: page.length,
+ totalPages: 1,
+ pages: [page],
+ page: 1,
+ });
+
+ const suggestions = elementCache.getElementIdSuggestions(
+ 'conv-suggestions',
+ 303,
+ 'MQH',
+ );
+
+ expect(suggestions.map((suggestion) => suggestion.elementId)).toEqual([
+ 'MQK',
+ 'M9H',
+ ]);
+
+ const missMessage = buildElementCacheMissMessage({
+ conversationId: 'conv-suggestions',
+ tabId: 303,
+ elementId: 'MQH',
+ });
+
+ expect(missMessage).toContain("Maybe try 'MQK', 'M9H'");
+ expect(missMessage).toContain(
+ 'MQK: ',
+ );
+ expect(missMessage).toContain(
+ 'M9H: ',
+ );
+ });
});
diff --git a/extension/src/commands/__tests__/single-highlight.test.ts b/extension/src/commands/__tests__/single-highlight.test.ts
index 5e9aa11..2439aef 100644
--- a/extension/src/commands/__tests__/single-highlight.test.ts
+++ b/extension/src/commands/__tests__/single-highlight.test.ts
@@ -1,7 +1,11 @@
import { describe, expect, test } from 'bun:test';
import type { InteractiveElement } from '../../types';
-import { calculateConfirmationPreviewLayout } from '../single-highlight';
+import {
+ calculateConfirmationBannerLayout,
+ calculateConfirmationPreviewLayout,
+ getConfirmationPromptText,
+} from '../single-highlight';
function createElement(bbox: InteractiveElement['bbox']): InteractiveElement {
return {
@@ -16,6 +20,40 @@ function createElement(bbox: InteractiveElement['bbox']): InteractiveElement {
}
describe('single-highlight confirmation preview', () => {
+ test('formats confirmation reminder text for click and keyboard input', () => {
+ expect(getConfirmationPromptText('click')).toBe(
+ 'Is this the element you wanted to click?',
+ );
+ expect(getConfirmationPromptText('keyboard_input')).toBe(
+ 'Is this the element you wanted to type into?',
+ );
+ });
+
+ test('places the confirmation reminder above the highlight when space is available', () => {
+ const banner = calculateConfirmationBannerLayout({
+ canvasWidth: 720,
+ canvasHeight: 420,
+ elementRect: { x: 220, y: 180, width: 120, height: 40 },
+ message: getConfirmationPromptText('click'),
+ scale: 1,
+ });
+
+ expect(banner.y + banner.height).toBeLessThanOrEqual(180 - 8);
+ expect(banner.x).toBeGreaterThanOrEqual(10);
+ });
+
+ test('falls back below the highlight when there is no room above', () => {
+ const banner = calculateConfirmationBannerLayout({
+ canvasWidth: 720,
+ canvasHeight: 420,
+ elementRect: { x: 220, y: 18, width: 120, height: 40 },
+ message: getConfirmationPromptText('click'),
+ scale: 1,
+ });
+
+ expect(banner.y).toBeGreaterThanOrEqual(18 + 40 + 8);
+ });
+
test('uses a bounded close-up crop around the selected element', () => {
const layout = calculateConfirmationPreviewLayout(
1280,
diff --git a/extension/src/commands/element-actions.ts b/extension/src/commands/element-actions.ts
index fae90a0..002d3b8 100644
--- a/extension/src/commands/element-actions.ts
+++ b/extension/src/commands/element-actions.ts
@@ -11,7 +11,7 @@ import type { ElementActionResult } from '../types';
* - Handles dialog events using the same pattern as javascript.ts
*/
-import { ELEMENT_CACHE_TTL_DESCRIPTION, elementCache } from './element-cache';
+import { buildElementCacheMissMessage, elementCache } from './element-cache';
import { executeJavaScript, type JavaScriptResult } from './javascript';
import { buildHitTestVisibilityHelpersScript } from '../utils/hit-test-visibility';
@@ -19,14 +19,25 @@ function escapeForDoubleQuotedJavaScriptString(value: string): string {
return value.replace(/\\/g, '\\\\').replace(/"/g, '\\"');
}
-function buildElementCacheMissMessage(
- elementId: string,
- refreshHint: string = 'Call highlight_elements() again to get a fresh highlight_snapshot_id.',
-): string {
- return `Element '${elementId}' not found in cache for the referenced highlight snapshot. Highlight snapshots expire after ${ELEMENT_CACHE_TTL_DESCRIPTION}. ${refreshHint}`;
+function buildResolvedElementResultFields(
+ requestedElementId: string,
+ resolvedElementId: string,
+): Pick<
+ ElementActionResult,
+ | 'elementId'
+ | 'requestedElementId'
+ | 'resolvedElementId'
+ | 'elementIdCorrected'
+> {
+ return {
+ elementId: resolvedElementId,
+ requestedElementId,
+ resolvedElementId,
+ elementIdCorrected: requestedElementId !== resolvedElementId,
+ };
}
-function buildSnapshotIdentityHelpersScript(): string {
+function buildCachedElementIdentityHelpersScript(): string {
return `
function normalizeIdentityWhitespace(value, maxLength = 240) {
const normalized = String(value ?? '')
@@ -145,14 +156,14 @@ function buildSnapshotIdentityHelpersScript(): string {
return overlap >= Math.max(2, Math.min(4, Math.ceil(expectedTokens.length * 0.5)));
}
- function validateSnapshotElement(expectedHighlightSnapshotId, expectedDocumentId, expectedFingerprint, el) {
+ function validateCachedElement(expectedDocumentId, expectedFingerprint, el) {
const currentDocumentId = getCurrentDocumentId();
if (expectedDocumentId && currentDocumentId !== expectedDocumentId) {
return {
ok: false,
stale: true,
error:
- \`Highlight snapshot \${expectedHighlightSnapshotId} is stale because the document changed. Call highlight_elements() again.\`,
+ 'The cached element is stale because the document changed. Call highlight_elements() again.',
};
}
@@ -162,7 +173,7 @@ function buildSnapshotIdentityHelpersScript(): string {
ok: false,
stale: true,
error:
- 'Highlight snapshot is stale because the target element no longer matches the cached identity. Call highlight_elements() again.',
+ 'The cached element is stale because the target no longer matches the cached identity. Call highlight_elements() again.',
};
}
@@ -177,7 +188,7 @@ function buildSnapshotIdentityHelpersScript(): string {
function buildEditableActivationHelpersScript(): string {
return `
- ${buildSnapshotIdentityHelpersScript()}
+ ${buildCachedElementIdentityHelpersScript()}
${buildHitTestVisibilityHelpersScript()}
function getInteractiveActivationTarget(target) {
@@ -386,21 +397,19 @@ export interface SelectResult extends ElementActionResult {
* 4. Return result with dialog info if applicable
*
* @param conversationId Session ID for element cache lookup
- * @param highlightSnapshotId Highlight snapshot ID returned by highlight_elements
- * @param elementId Cached element ID from the referenced highlight snapshot (for example, "1")
+ * @param elementId Cached element ID from the latest highlight cache (for example, "A1H")
* @param tabId Target tab ID
* @param timeout Maximum execution time in milliseconds (default: 30000)
* @returns Click result with success status and dialog info
*/
export async function performElementClick(
conversationId: string,
- highlightSnapshotId: number,
elementId: string,
tabId: number,
timeout: number = 30000,
): Promise {
console.log(
- `👆 [ElementClick] Clicking element ${elementId} from snapshot ${highlightSnapshotId} in conversation ${conversationId} on tab ${tabId}`,
+ `👆 [ElementClick] Clicking element ${elementId} in conversation ${conversationId} on tab ${tabId}`,
);
// ============================================================
@@ -409,23 +418,27 @@ export async function performElementClick(
const cachedElement = elementCache.getElementById(
conversationId,
tabId,
- highlightSnapshotId,
elementId,
);
if (!cachedElement) {
console.log(`❌ [ElementClick] Element ${elementId} not found in cache`);
return {
success: false,
- elementId,
+ ...buildResolvedElementResultFields(elementId, elementId),
clicked: false,
staleElement: false,
- error: buildElementCacheMissMessage(
+ error: buildElementCacheMissMessage({
+ conversationId,
+ tabId,
elementId,
- 'Call highlight_elements() again to get a fresh highlight_snapshot_id and element IDs.',
- ),
+ }),
};
}
const element = cachedElement.element;
+ const resolvedElementFields = buildResolvedElementResultFields(
+ cachedElement.requestedElementId,
+ cachedElement.resolvedElementId,
+ );
console.log(
`✅ [ElementClick] Found element: selector="${element.selector}"`,
@@ -448,7 +461,6 @@ export async function performElementClick(
const script = `
(async function() {
const selector = "${escapedSelector}";
- const expectedHighlightSnapshotId = ${highlightSnapshotId};
const expectedDocumentId = "${escapedDocumentId}";
const expectedFingerprint = "${escapedFingerprint}";
${buildEditableActivationHelpersScript()}
@@ -458,8 +470,7 @@ export async function performElementClick(
return { clicked: false, error: "Element not found in DOM", stale: true };
}
- const snapshotValidation = validateSnapshotElement(
- expectedHighlightSnapshotId,
+ const snapshotValidation = validateCachedElement(
expectedDocumentId,
expectedFingerprint,
el,
@@ -548,7 +559,7 @@ export async function performElementClick(
console.error(`❌ [ElementClick] JavaScript execution error:`, error);
return {
success: false,
- elementId,
+ ...resolvedElementFields,
clicked: false,
staleElement: false,
error: error instanceof Error ? error.message : String(error),
@@ -567,7 +578,7 @@ export async function performElementClick(
console.log(`❌ [ElementClick] Click execution failed: ${jsResult.error}`);
return {
success: false,
- elementId,
+ ...resolvedElementFields,
clicked: false,
staleElement: false,
error: jsResult.error || 'Click JavaScript execution failed',
@@ -588,7 +599,7 @@ export async function performElementClick(
);
const result: ClickResult = {
success: true,
- elementId,
+ ...resolvedElementFields,
clicked: true,
new_tabs_created: jsResult.new_tabs_created,
};
@@ -623,7 +634,7 @@ export async function performElementClick(
);
return {
success: false,
- elementId,
+ ...resolvedElementFields,
clicked: false,
staleElement: false,
error: invalidResultError,
@@ -638,7 +649,7 @@ export async function performElementClick(
return {
success: false,
- elementId,
+ ...resolvedElementFields,
clicked: false,
staleElement: isStale,
error: clickResult?.error,
@@ -650,7 +661,7 @@ export async function performElementClick(
// If dialog opened during click, propagate dialog info
const result: ClickResult = {
success: true,
- elementId,
+ ...resolvedElementFields,
clicked: true,
new_tabs_created: jsResult.new_tabs_created,
};
@@ -683,21 +694,19 @@ export async function performElementClick(
* 4. Return result
*
* @param conversationId Session ID for element cache lookup
- * @param highlightSnapshotId Highlight snapshot ID returned by highlight_elements
- * @param elementId Cached element ID from the referenced highlight snapshot (for example, "1")
+ * @param elementId Cached element ID from the latest highlight cache (for example, "A1H")
* @param tabId Target tab ID
* @param timeout Maximum execution time in milliseconds (default: 30000)
* @returns Hover result with success status
*/
export async function performElementHover(
conversationId: string,
- highlightSnapshotId: number,
elementId: string,
tabId: number,
timeout: number = 30000,
): Promise {
console.log(
- `🖱️ [ElementHover] Hovering element ${elementId} from snapshot ${highlightSnapshotId} in conversation ${conversationId} on tab ${tabId}`,
+ `🖱️ [ElementHover] Hovering element ${elementId} in conversation ${conversationId} on tab ${tabId}`,
);
// ============================================================
@@ -706,20 +715,27 @@ export async function performElementHover(
const cachedElement = elementCache.getElementById(
conversationId,
tabId,
- highlightSnapshotId,
elementId,
);
if (!cachedElement) {
console.log(`❌ [ElementHover] Element ${elementId} not found in cache`);
return {
success: false,
- elementId,
+ ...buildResolvedElementResultFields(elementId, elementId),
hovered: false,
staleElement: false,
- error: buildElementCacheMissMessage(elementId),
+ error: buildElementCacheMissMessage({
+ conversationId,
+ tabId,
+ elementId,
+ }),
};
}
const element = cachedElement.element;
+ const resolvedElementFields = buildResolvedElementResultFields(
+ cachedElement.requestedElementId,
+ cachedElement.resolvedElementId,
+ );
console.log(
`✅ [ElementHover] Found element: selector="${element.selector}"`,
@@ -741,18 +757,16 @@ export async function performElementHover(
const script = `
(function() {
const selector = "${escapedSelector}";
- const expectedHighlightSnapshotId = ${highlightSnapshotId};
const expectedDocumentId = "${escapedDocumentId}";
const expectedFingerprint = "${escapedFingerprint}";
- ${buildSnapshotIdentityHelpersScript()}
+ ${buildCachedElementIdentityHelpersScript()}
const el = document.querySelector(selector);
if (!el) {
return { hovered: false, error: "Element not found in DOM", stale: true };
}
- const snapshotValidation = validateSnapshotElement(
- expectedHighlightSnapshotId,
+ const snapshotValidation = validateCachedElement(
expectedDocumentId,
expectedFingerprint,
el,
@@ -823,7 +837,7 @@ export async function performElementHover(
console.error(`❌ [ElementHover] JavaScript execution error:`, error);
return {
success: false,
- elementId,
+ ...resolvedElementFields,
hovered: false,
staleElement: false,
};
@@ -836,7 +850,7 @@ export async function performElementHover(
console.log(`❌ [ElementHover] Hover execution failed: ${jsResult.error}`);
return {
success: false,
- elementId,
+ ...resolvedElementFields,
hovered: false,
staleElement: false,
};
@@ -856,7 +870,7 @@ export async function performElementHover(
);
const result: HoverResult = {
success: true,
- elementId,
+ ...resolvedElementFields,
hovered: true,
new_tabs_created: jsResult.new_tabs_created,
};
@@ -887,7 +901,7 @@ export async function performElementHover(
return {
success: false,
- elementId,
+ ...resolvedElementFields,
hovered: false,
staleElement: isStale,
};
@@ -898,7 +912,7 @@ export async function performElementHover(
// If dialog opened during hover, propagate dialog info
const result: HoverResult = {
success: true,
- elementId,
+ ...resolvedElementFields,
hovered: true,
};
@@ -960,8 +974,7 @@ export interface SwipeResult extends ElementActionResult {
* 3. Execute and return result
*
* @param conversationId Session ID for element cache lookup
- * @param highlightSnapshotId Highlight snapshot ID returned by highlight_elements. Required when elementId is provided
- * @param elementId Cached element ID from the referenced highlight snapshot. Optional - if not provided, scrolls the entire page
+ * @param elementId Element ID from the current document's element cache for this conversation/tab (not limited to the most recent highlight call). Optional - if not provided, scrolls the entire page
* @param direction Swipe direction ('next' or 'prev')
* @param tabId Target tab ID
* @param timeout Maximum execution time in milliseconds (default: 30000)
@@ -969,7 +982,6 @@ export interface SwipeResult extends ElementActionResult {
*/
export async function performElementScroll(
conversationId: string,
- highlightSnapshotId: number | undefined,
elementId: string | undefined,
direction: ScrollDirection,
tabId: number,
@@ -977,7 +989,7 @@ export async function performElementScroll(
timeout: number = 30000,
): Promise {
console.log(
- `📜 [ElementScroll] Scrolling ${elementId ? `element ${elementId}` : 'entire page'} ${direction} (amount: ${scrollAmount}x viewport) in conversation ${conversationId} on tab ${tabId}`,
+ `📜 [ElementScroll] Scrolling ${elementId ? `element ${elementId}` : 'entire page'} ${direction} (amount factor: ${scrollAmount}x current scroll target) in conversation ${conversationId} on tab ${tabId}`,
);
// ============================================================
@@ -985,8 +997,8 @@ export async function performElementScroll(
// ============================================================
// Calculate scroll multipliers based on direction
- // scrollAmount is relative to viewport height (0.5 = half page, 1.0 = full page)
- // For horizontal scroll, we use viewport width
+ // scrollAmount is relative to the current scroll target's visible size.
+ // For vertical scroll we use clientHeight; for horizontal scroll we use clientWidth.
const scrollMultipliers: Record = {
up: { x: 0, y: -scrollAmount },
down: { x: 0, y: scrollAmount },
@@ -997,35 +1009,35 @@ export async function performElementScroll(
const { x: xMultiplier, y: yMultiplier } = scrollMultipliers[direction];
let script: string;
+ let resolvedElementFields:
+ | ReturnType
+ | undefined;
if (elementId) {
// Scroll a specific element
- if (highlightSnapshotId === undefined || highlightSnapshotId === null) {
- return {
- success: false,
- elementId,
- scrolled: false,
- error:
- 'highlight_snapshot_id is required when scrolling a highlighted element.',
- };
- }
-
const cachedElement = elementCache.getElementById(
conversationId,
tabId,
- highlightSnapshotId,
elementId,
);
if (!cachedElement) {
console.log(`❌ [ElementScroll] Element ${elementId} not found in cache`);
return {
success: false,
- elementId,
+ ...buildResolvedElementResultFields(elementId, elementId),
scrolled: false,
- error: buildElementCacheMissMessage(elementId),
+ error: buildElementCacheMissMessage({
+ conversationId,
+ tabId,
+ elementId,
+ }),
};
}
const element = cachedElement.element;
+ resolvedElementFields = buildResolvedElementResultFields(
+ cachedElement.requestedElementId,
+ cachedElement.resolvedElementId,
+ );
console.log(
`✅ [ElementScroll] Found element: selector="${element.selector}"`,
@@ -1043,20 +1055,18 @@ export async function performElementScroll(
script = `
(function() {
const selector = "${escapedSelector}";
- const expectedHighlightSnapshotId = ${highlightSnapshotId};
const expectedDocumentId = "${escapedDocumentId}";
const expectedFingerprint = "${escapedFingerprint}";
const el = document.querySelector(selector);
const xMultiplier = ${xMultiplier};
const yMultiplier = ${yMultiplier};
- ${buildSnapshotIdentityHelpersScript()}
+ ${buildCachedElementIdentityHelpersScript()}
if (!el) {
return { scrolled: false, error: "Element not found in DOM", stale: true };
}
- const snapshotValidation = validateSnapshotElement(
- expectedHighlightSnapshotId,
+ const snapshotValidation = validateCachedElement(
expectedDocumentId,
expectedFingerprint,
el,
@@ -1212,7 +1222,10 @@ export async function performElementScroll(
console.error(`❌ [ElementScroll] JavaScript execution error:`, error);
return {
success: false,
- elementId,
+ ...(resolvedElementFields ??
+ (elementId
+ ? buildResolvedElementResultFields(elementId, elementId)
+ : {})),
scrolled: false,
};
}
@@ -1224,7 +1237,10 @@ export async function performElementScroll(
);
return {
success: false,
- elementId,
+ ...(resolvedElementFields ??
+ (elementId
+ ? buildResolvedElementResultFields(elementId, elementId)
+ : {})),
scrolled: false,
};
}
@@ -1243,7 +1259,10 @@ export async function performElementScroll(
);
const result: ScrollResult = {
success: true,
- elementId,
+ ...(resolvedElementFields ??
+ (elementId
+ ? buildResolvedElementResultFields(elementId, elementId)
+ : {})),
scrolled: true,
new_tabs_created: jsResult.new_tabs_created,
};
@@ -1282,7 +1301,10 @@ export async function performElementScroll(
return {
success: false,
- elementId,
+ ...(resolvedElementFields ??
+ (elementId
+ ? buildResolvedElementResultFields(elementId, elementId)
+ : {})),
scrolled: false,
staleElement: isStale,
};
@@ -1301,7 +1323,10 @@ export async function performElementScroll(
const result: ScrollResult = {
success: true,
- elementId,
+ ...(resolvedElementFields ??
+ (elementId
+ ? buildResolvedElementResultFields(elementId, elementId)
+ : {})),
scrolled: true,
scrollEffective,
...(warning ? { warning } : {}),
@@ -1336,7 +1361,6 @@ export async function performElementScroll(
*/
export async function performElementSwipe(
conversationId: string,
- highlightSnapshotId: number,
elementId: string,
direction: SwipeDirection,
tabId: number,
@@ -1344,25 +1368,32 @@ export async function performElementSwipe(
timeout: number = 30000,
): Promise {
console.log(
- `🫳 [ElementSwipe] Swiping element ${elementId} from snapshot ${highlightSnapshotId} ${direction} (count: ${swipeCount}) in conversation ${conversationId} on tab ${tabId}`,
+ `🫳 [ElementSwipe] Swiping element ${elementId} ${direction} (count: ${swipeCount}) in conversation ${conversationId} on tab ${tabId}`,
);
const cachedElement = elementCache.getElementById(
conversationId,
tabId,
- highlightSnapshotId,
elementId,
);
if (!cachedElement) {
console.log(`❌ [ElementSwipe] Element ${elementId} not found in cache`);
return {
success: false,
- elementId,
+ ...buildResolvedElementResultFields(elementId, elementId),
swiped: false,
- error: buildElementCacheMissMessage(elementId),
+ error: buildElementCacheMissMessage({
+ conversationId,
+ tabId,
+ elementId,
+ }),
};
}
const element = cachedElement.element;
+ const resolvedElementFields = buildResolvedElementResultFields(
+ cachedElement.requestedElementId,
+ cachedElement.resolvedElementId,
+ );
console.log(
`✅ [ElementSwipe] Found element: selector="${element.selector}"`,
@@ -1381,20 +1412,18 @@ export async function performElementSwipe(
const script = `
(async function() {
const selector = "${escapedSelector}";
- const expectedHighlightSnapshotId = ${highlightSnapshotId};
const expectedDocumentId = "${escapedDocumentId}";
const expectedFingerprint = "${escapedFingerprint}";
const direction = "${direction}";
const swipeCount = ${swipeCount};
- ${buildSnapshotIdentityHelpersScript()}
+ ${buildCachedElementIdentityHelpersScript()}
const el = document.querySelector(selector);
if (!el) {
return { swiped: false, error: "Element not found in DOM", stale: true };
}
- const snapshotValidation = validateSnapshotElement(
- expectedHighlightSnapshotId,
+ const snapshotValidation = validateCachedElement(
expectedDocumentId,
expectedFingerprint,
el,
@@ -2392,7 +2421,7 @@ export async function performElementSwipe(
console.error(`❌ [ElementSwipe] JavaScript execution error:`, error);
return {
success: false,
- elementId,
+ ...resolvedElementFields,
swiped: false,
error: error instanceof Error ? error.message : String(error),
};
@@ -2402,7 +2431,7 @@ export async function performElementSwipe(
console.log(`❌ [ElementSwipe] Swipe execution failed: ${jsResult.error}`);
return {
success: false,
- elementId,
+ ...resolvedElementFields,
swiped: false,
error: jsResult.error || 'Swipe JavaScript execution failed',
};
@@ -2414,7 +2443,7 @@ export async function performElementSwipe(
);
const result: SwipeResult = {
success: true,
- elementId,
+ ...resolvedElementFields,
swiped: true,
new_tabs_created: jsResult.new_tabs_created,
};
@@ -2449,7 +2478,7 @@ export async function performElementSwipe(
console.log(`❌ [ElementSwipe] Swipe failed: ${error}, stale=${isStale}`);
return {
success: false,
- elementId,
+ ...resolvedElementFields,
swiped: false,
staleElement: isStale,
error,
@@ -2471,7 +2500,7 @@ export async function performElementSwipe(
return {
success: true,
- elementId,
+ ...resolvedElementFields,
swiped: true,
swipeEffective,
...(warning ? { warning } : {}),
@@ -2499,8 +2528,7 @@ export interface InputResult extends ElementActionResult {
* 4. Return result with input value
*
* @param conversationId Session ID for element cache lookup
- * @param highlightSnapshotId Highlight snapshot ID returned by highlight_elements
- * @param elementId Cached element ID from the referenced highlight snapshot
+ * @param elementId Element ID from the current document's element cache for this conversation/tab (not limited to the most recent highlight call)
* @param text Text to input into the element
* @param tabId Target tab ID
* @param timeout Maximum execution time in milliseconds (default: 30000)
@@ -2508,14 +2536,13 @@ export interface InputResult extends ElementActionResult {
*/
export async function performKeyboardInput(
conversationId: string,
- highlightSnapshotId: number,
elementId: string,
text: string,
tabId: number,
timeout: number = 30000,
): Promise {
console.log(
- `⌨️ [KeyboardInput] Inputting text to element ${elementId} from snapshot ${highlightSnapshotId} in conversation ${conversationId} on tab ${tabId}`,
+ `⌨️ [KeyboardInput] Inputting text to element ${elementId} in conversation ${conversationId} on tab ${tabId}`,
);
// ============================================================
@@ -2524,20 +2551,27 @@ export async function performKeyboardInput(
const cachedElement = elementCache.getElementById(
conversationId,
tabId,
- highlightSnapshotId,
elementId,
);
if (!cachedElement) {
console.log(`❌ [KeyboardInput] Element ${elementId} not found in cache`);
return {
success: false,
- elementId,
+ ...buildResolvedElementResultFields(elementId, elementId),
input: false,
staleElement: false,
- error: buildElementCacheMissMessage(elementId),
+ error: buildElementCacheMissMessage({
+ conversationId,
+ tabId,
+ elementId,
+ }),
};
}
const element = cachedElement.element;
+ const resolvedElementFields = buildResolvedElementResultFields(
+ cachedElement.requestedElementId,
+ cachedElement.resolvedElementId,
+ );
console.log(
`✅ [KeyboardInput] Found element: selector="${element.selector}"`,
@@ -2561,7 +2595,6 @@ export async function performKeyboardInput(
const script = `
(function() {
const selector = "${escapedSelector}";
- const expectedHighlightSnapshotId = ${highlightSnapshotId};
const expectedDocumentId = "${escapedDocumentId}";
const expectedFingerprint = "${escapedFingerprint}";
const text = "${escapedText}";
@@ -2572,8 +2605,7 @@ export async function performKeyboardInput(
return { input: false, error: "Element not found in DOM", stale: true };
}
- const snapshotValidation = validateSnapshotElement(
- expectedHighlightSnapshotId,
+ const snapshotValidation = validateCachedElement(
expectedDocumentId,
expectedFingerprint,
el,
@@ -2689,7 +2721,7 @@ export async function performKeyboardInput(
console.error(`❌ [KeyboardInput] JavaScript execution error:`, error);
return {
success: false,
- elementId,
+ ...resolvedElementFields,
input: false,
staleElement: false,
};
@@ -2707,7 +2739,7 @@ export async function performKeyboardInput(
console.log(`❌ [KeyboardInput] Input execution failed: ${jsResult.error}`);
return {
success: false,
- elementId,
+ ...resolvedElementFields,
input: false,
staleElement: false,
};
@@ -2727,7 +2759,7 @@ export async function performKeyboardInput(
);
const result: InputResult = {
success: true,
- elementId,
+ ...resolvedElementFields,
input: true,
value: undefined,
new_tabs_created: jsResult.new_tabs_created,
@@ -2759,7 +2791,7 @@ export async function performKeyboardInput(
return {
success: false,
- elementId,
+ ...resolvedElementFields,
input: false,
staleElement: isStale,
};
@@ -2772,7 +2804,7 @@ export async function performKeyboardInput(
// If dialog opened during input, propagate dialog info
const result: InputResult = {
success: true,
- elementId,
+ ...resolvedElementFields,
input: true,
value: inputResult.value,
};
@@ -2805,8 +2837,7 @@ export async function performKeyboardInput(
* 4. Return result with selected values/labels/indices
*
* @param conversationId Session ID for element cache lookup
- * @param highlightSnapshotId Highlight snapshot ID returned by highlight_elements
- * @param elementId Cached element ID from the referenced highlight snapshot (for example, "1")
+ * @param elementId Element ID from the current document's element cache for this conversation/tab; IDs from earlier highlight calls on the same document still resolve (for example, "A1H")
* @param tabId Target tab ID
* @param value Option value(s) to select. Use string for single select, array for multi-select
* @param timeout Maximum execution time in milliseconds (default: 30000)
@@ -2814,14 +2845,13 @@ export async function performKeyboardInput(
*/
export async function performElementSelect(
conversationId: string,
- highlightSnapshotId: number,
elementId: string,
tabId: number,
value: string | string[],
timeout: number = 30000,
): Promise {
console.log(
- `📋 [ElementSelect] Selecting element ${elementId} from snapshot ${highlightSnapshotId} in conversation ${conversationId} on tab ${tabId}`,
+ `📋 [ElementSelect] Selecting element ${elementId} in conversation ${conversationId} on tab ${tabId}`,
);
// ============================================================
@@ -2830,20 +2860,27 @@ export async function performElementSelect(
const cachedElement = elementCache.getElementById(
conversationId,
tabId,
- highlightSnapshotId,
elementId,
);
if (!cachedElement) {
console.log(`❌ [ElementSelect] Element ${elementId} not found in cache`);
return {
success: false,
- elementId,
+ ...buildResolvedElementResultFields(elementId, elementId),
selected: false,
staleElement: false,
- error: buildElementCacheMissMessage(elementId),
+ error: buildElementCacheMissMessage({
+ conversationId,
+ tabId,
+ elementId,
+ }),
};
}
const element = cachedElement.element;
+ const resolvedElementFields = buildResolvedElementResultFields(
+ cachedElement.requestedElementId,
+ cachedElement.resolvedElementId,
+ );
console.log(
`✅ [ElementSelect] Found element: selector="${element.selector}"`,
@@ -2869,11 +2906,10 @@ export async function performElementSelect(
const script = `
(function() {
const selector = "${escapedSelector}";
- const expectedHighlightSnapshotId = ${highlightSnapshotId};
const expectedDocumentId = "${escapedDocumentId}";
const expectedFingerprint = "${escapedFingerprint}";
const value = ${valueJson};
- ${buildSnapshotIdentityHelpersScript()}
+ ${buildCachedElementIdentityHelpersScript()}
const el = document.querySelector(selector);
@@ -2881,8 +2917,7 @@ export async function performElementSelect(
return { selected: false, error: "Element not found in DOM", stale: true };
}
- const snapshotValidation = validateSnapshotElement(
- expectedHighlightSnapshotId,
+ const snapshotValidation = validateCachedElement(
expectedDocumentId,
expectedFingerprint,
el,
@@ -2999,7 +3034,7 @@ export async function performElementSelect(
console.error(`❌ [ElementSelect] JavaScript execution error:`, error);
return {
success: false,
- elementId,
+ ...resolvedElementFields,
selected: false,
staleElement: false,
};
@@ -3016,7 +3051,7 @@ export async function performElementSelect(
);
return {
success: false,
- elementId,
+ ...resolvedElementFields,
selected: false,
staleElement: false,
};
@@ -3036,7 +3071,7 @@ export async function performElementSelect(
);
const result: SelectResult = {
success: true,
- elementId,
+ ...resolvedElementFields,
selected: true,
new_tabs_created: jsResult.new_tabs_created,
};
@@ -3075,7 +3110,7 @@ export async function performElementSelect(
return {
success: false,
- elementId,
+ ...resolvedElementFields,
selected: false,
staleElement: isStale,
error: selectResult?.error,
@@ -3089,7 +3124,7 @@ export async function performElementSelect(
// Build result with selected values
const result: SelectResult = {
success: true,
- elementId,
+ ...resolvedElementFields,
selected: true,
selectedValues: selectResult.selectedValues,
selectedLabels: selectResult.selectedLabels,
diff --git a/extension/src/commands/element-cache.ts b/extension/src/commands/element-cache.ts
index 61babb5..a3f56b6 100644
--- a/extension/src/commands/element-cache.ts
+++ b/extension/src/commands/element-cache.ts
@@ -1,16 +1,21 @@
/**
- * Highlight snapshot cache manager.
+ * Document-scoped element cache manager.
*
- * Two cache layers are maintained:
- * 1. Per-call highlight inventories used to serve requested pages and keep
- * page-local element IDs stable within one highlight response.
- * 2. Page-scoped highlight snapshots returned to callers and used for
- * element interactions together with page-local element IDs.
+ * Each conversation/tab keeps one active cache for the current highlighted
+ * document:
+ * 1. Persistent element-id assignments for the current document
+ * 2. A merged element lookup table keyed only by element_id
+ * 3. Latest highlight metadata for the current document
*/
import type { ElementType, InteractiveElement } from '../types';
+import {
+ buildElementIdentityKey,
+ generateUniqueHash,
+ normalizeVisualElementIdInput,
+} from './element-id';
-interface HighlightInventoryEntry {
+interface DocumentElementCacheEntry {
tabId: number;
createdAt: number;
lastAccessedAt: number;
@@ -18,19 +23,13 @@ interface HighlightInventoryEntry {
elementType: ElementType;
keywords: string[];
totalElements: number;
- pages: InteractiveElement[][];
-}
-
-interface HighlightSnapshotViewEntry {
- tabId: number;
- inventoryId: number;
- createdAt: number;
- page: number;
+ totalPages: number;
+ idByIdentityKey: Map;
+ usedIds: Set;
+ elementsById: Map;
}
-export interface HighlightSnapshotPage {
- snapshotId: number;
- inventoryId: number;
+export interface StoredHighlightPage {
page: number;
totalPages: number;
totalElements: number;
@@ -41,150 +40,172 @@ export interface HighlightSnapshotPage {
}
export interface CachedElementLookup {
- snapshotId: number;
- inventoryId: number;
- page: number;
- totalPages: number;
- totalElements: number;
documentId: string;
elementType: ElementType;
keywords: string[];
+ totalElements: number;
+ totalPages: number;
+ requestedElementId: string;
+ resolvedElementId: string;
+ normalizedRequestedElementId: string;
+ elementIdCorrected: boolean;
element: InteractiveElement;
}
+export interface ElementIdSuggestion {
+ elementId: string;
+ html: string;
+ matchedPositions: number;
+}
+
export const ELEMENT_CACHE_TTL_MS = 1_200_000; // 20 minutes
export const ELEMENT_CACHE_TTL_DESCRIPTION = `${ELEMENT_CACHE_TTL_MS / 60_000} minutes`;
-const MAX_HIGHLIGHT_INVENTORIES_PER_TAB = 12;
class ElementCacheImpl {
- private inventories = new Map();
-
- private snapshotViews = new Map();
+ private documents = new Map();
- private nextInventoryId = 1;
-
- private nextSnapshotId = 1;
-
- private buildInventoryKey(
- conversationId: string,
- tabId: number,
- inventoryId: number,
- ): string {
- return `${conversationId}:${tabId}:inventory:${inventoryId}`;
- }
-
- private buildSnapshotKey(
- conversationId: string,
- tabId: number,
- snapshotId: number,
- ): string {
- return `${conversationId}:${tabId}:snapshot:${snapshotId}`;
- }
-
- private touchInventory(entry: HighlightInventoryEntry): void {
- entry.lastAccessedAt = Date.now();
+ private buildDocumentKey(conversationId: string, tabId: number): string {
+ return `${conversationId}:${tabId}`;
}
private isExpired(timestamp: number): boolean {
return Date.now() - timestamp > ELEMENT_CACHE_TTL_MS;
}
- private removeInventoryByKey(key: string): void {
- const inventory = this.inventories.get(key);
- if (!inventory) {
- return;
- }
+ private cloneElement(
+ element: InteractiveElement,
+ id: string,
+ ): InteractiveElement {
+ return {
+ ...element,
+ bbox: { ...element.bbox },
+ id,
+ };
+ }
- this.inventories.delete(key);
+ private touchEntry(entry: DocumentElementCacheEntry): void {
+ entry.lastAccessedAt = Date.now();
+ }
- const snapshotKeysToDelete: string[] = [];
- for (const [snapshotKey, snapshot] of this.snapshotViews.entries()) {
- if (snapshot.inventoryId === this.parseInventoryIdFromKey(key)) {
- snapshotKeysToDelete.push(snapshotKey);
+ private cleanupExpired(): void {
+ for (const [key, entry] of this.documents.entries()) {
+ if (this.isExpired(entry.lastAccessedAt)) {
+ this.documents.delete(key);
+ console.log(`⏰ [ElementCache] Document cache expired for key ${key}`);
}
}
- for (const snapshotKey of snapshotKeysToDelete) {
- this.snapshotViews.delete(snapshotKey);
- }
-
- console.log(
- `🗑️ [ElementCache] Removed highlight inventory ${key} (${inventory.pages.length} pages, ${snapshotKeysToDelete.length} snapshots)`,
- );
}
- private parseInventoryIdFromKey(key: string): number {
- const maybeId = Number.parseInt(key.split(':').at(-1) ?? '', 10);
- return Number.isFinite(maybeId) ? maybeId : -1;
- }
+ private getOrCreateEntry(options: {
+ conversationId: string;
+ tabId: number;
+ documentId: string;
+ elementType: ElementType;
+ keywords: string[];
+ totalElements: number;
+ totalPages: number;
+ }): DocumentElementCacheEntry {
+ const {
+ conversationId,
+ tabId,
+ documentId,
+ elementType,
+ keywords,
+ totalElements,
+ totalPages,
+ } = options;
- private cleanupExpired(): void {
- const activeInventoryKeys = new Set();
+ this.cleanupExpired();
- for (const [snapshotKey, snapshot] of this.snapshotViews.entries()) {
- if (this.isExpired(snapshot.createdAt)) {
- this.snapshotViews.delete(snapshotKey);
- console.log(
- `⏰ [ElementCache] Snapshot expired for key ${snapshotKey}`,
- );
- continue;
- }
+ const key = this.buildDocumentKey(conversationId, tabId);
+ const existing = this.documents.get(key);
+ const now = Date.now();
- const inventoryKey = snapshotKey.replace(
- /:snapshot:\d+$/,
- `:inventory:${snapshot.inventoryId}`,
- );
- activeInventoryKeys.add(inventoryKey);
+ if (existing && existing.documentId === documentId) {
+ existing.lastAccessedAt = now;
+ existing.elementType = elementType;
+ existing.keywords = [...keywords];
+ existing.totalElements = totalElements;
+ existing.totalPages = totalPages;
+ return existing;
}
- const inventoryKeysToDelete: string[] = [];
- for (const [inventoryKey, inventory] of this.inventories.entries()) {
- if (this.isExpired(inventory.lastAccessedAt)) {
- inventoryKeysToDelete.push(inventoryKey);
- continue;
- }
-
- if (
- !activeInventoryKeys.has(inventoryKey) &&
- this.isExpired(inventory.createdAt)
- ) {
- inventoryKeysToDelete.push(inventoryKey);
- }
- }
+ const created: DocumentElementCacheEntry = {
+ tabId,
+ createdAt: now,
+ lastAccessedAt: now,
+ documentId,
+ elementType,
+ keywords: [...keywords],
+ totalElements,
+ totalPages,
+ idByIdentityKey: new Map(),
+ usedIds: new Set(),
+ elementsById: new Map(),
+ };
- for (const inventoryKey of inventoryKeysToDelete) {
- this.removeInventoryByKey(inventoryKey);
- }
+ this.documents.set(key, created);
+ console.log(
+ `📁 [ElementCache] Started new document cache for conversation ${conversationId}, tab ${tabId}, document ${documentId}`,
+ );
+ return created;
}
- private pruneInventoriesForTab(conversationId: string, tabId: number): void {
- const prefix = `${conversationId}:${tabId}:inventory:`;
- const matchingInventories = Array.from(this.inventories.entries())
- .filter(([key]) => key.startsWith(prefix))
- .sort((a, b) => a[1].createdAt - b[1].createdAt);
-
- if (matchingInventories.length <= MAX_HIGHLIGHT_INVENTORIES_PER_TAB) {
- return;
+ private assignIdsForEntry(
+ entry: DocumentElementCacheEntry,
+ elements: InteractiveElement[],
+ ): InteractiveElement[] {
+ const assignedIds = new Array(elements.length);
+
+ const elementsByStableKey = elements
+ .map((element, index) => ({
+ element,
+ index,
+ identityKey: buildElementIdentityKey(element),
+ }))
+ .sort((left, right) => {
+ const keyOrder = left.identityKey.localeCompare(right.identityKey);
+ if (keyOrder !== 0) {
+ return keyOrder;
+ }
+ return left.index - right.index;
+ });
+
+ for (const { element, index, identityKey } of elementsByStableKey) {
+ let elementId = entry.idByIdentityKey.get(identityKey);
+ if (!elementId) {
+ if (element.id && !entry.usedIds.has(element.id)) {
+ elementId = element.id;
+ } else {
+ const { hash } = generateUniqueHash(
+ element.selector,
+ entry.usedIds,
+ element.html,
+ );
+ elementId = hash;
+ }
+ entry.idByIdentityKey.set(identityKey, elementId);
+ entry.usedIds.add(elementId);
+ }
+ assignedIds[index] = elementId;
}
- const toDelete = matchingInventories.slice(
- 0,
- matchingInventories.length - MAX_HIGHLIGHT_INVENTORIES_PER_TAB,
+ return elements.map((element, index) =>
+ this.cloneElement(element, assignedIds[index] || element.id),
);
- for (const [inventoryKey] of toDelete) {
- this.removeInventoryByKey(inventoryKey);
- }
}
- storeSnapshot(options: {
+ storeHighlightResult(options: {
conversationId: string;
tabId: number;
documentId: string;
elementType: ElementType;
keywords?: string[];
totalElements: number;
- pages: InteractiveElement[][];
+ totalPages: number;
page: number;
- }): HighlightSnapshotPage {
+ pages: InteractiveElement[][];
+ }): StoredHighlightPage {
const {
conversationId,
tabId,
@@ -192,243 +213,255 @@ class ElementCacheImpl {
elementType,
keywords = [],
totalElements,
- pages,
+ totalPages,
page,
+ pages,
} = options;
- this.cleanupExpired();
-
- const inventoryId = this.nextInventoryId++;
- const snapshotId = this.nextSnapshotId++;
- const now = Date.now();
- const inventoryKey = this.buildInventoryKey(
- conversationId,
- tabId,
- inventoryId,
- );
- const snapshotKey = this.buildSnapshotKey(
+ const entry = this.getOrCreateEntry({
conversationId,
tabId,
- snapshotId,
- );
-
- this.inventories.set(inventoryKey, {
- tabId,
- createdAt: now,
- lastAccessedAt: now,
documentId,
elementType,
- keywords: [...keywords],
+ keywords,
totalElements,
- pages: pages.map((snapshotPage) =>
- snapshotPage.map((element) => ({
- ...element,
- bbox: { ...element.bbox },
- })),
- ),
+ totalPages,
});
- this.snapshotViews.set(snapshotKey, {
- tabId,
- inventoryId,
- createdAt: now,
- page,
- });
-
- this.pruneInventoriesForTab(conversationId, tabId);
-
- const snapshotPage = this.getSnapshotPage(
- conversationId,
- tabId,
- snapshotId,
- );
- if (!snapshotPage) {
- throw new Error(
- `Failed to retrieve newly stored highlight snapshot ${snapshotId}`,
- );
- }
-
- console.log(
- `📁 [ElementCache] Stored highlight inventory ${inventoryId} and snapshot ${snapshotId} for conversation ${conversationId}, tab ${tabId} (${pages.length} pages, ${totalElements} total elements)`,
+ const assignedPages = pages.map((pageElements) =>
+ this.assignIdsForEntry(entry, pageElements),
);
- return snapshotPage;
- }
-
- getSnapshotPage(
- conversationId: string,
- tabId: number,
- snapshotId: number,
- ): HighlightSnapshotPage | undefined {
- this.cleanupExpired();
- const snapshot = this.getSnapshotView(conversationId, tabId, snapshotId);
- if (!snapshot) {
- return undefined;
- }
-
- const inventory = this.getInventory(
- conversationId,
- tabId,
- snapshot.inventoryId,
- );
- if (!inventory) {
- return undefined;
+ for (const pageElements of assignedPages) {
+ for (const element of pageElements) {
+ entry.elementsById.set(
+ element.id,
+ this.cloneElement(element, element.id),
+ );
+ }
}
- this.touchInventory(inventory);
+ this.touchEntry(entry);
- const pageIndex = Math.max(0, snapshot.page - 1);
- const elements = inventory.pages[pageIndex] ?? [];
+ console.log(
+ `📁 [ElementCache] Stored ${assignedPages.length} highlight pages for conversation ${conversationId}, tab ${tabId} (${totalElements} total elements on document ${documentId})`,
+ );
return {
- snapshotId,
- inventoryId: snapshot.inventoryId,
- page: snapshot.page,
- totalPages: inventory.pages.length,
- totalElements: inventory.totalElements,
- elementType: inventory.elementType,
- keywords: [...inventory.keywords],
- documentId: inventory.documentId,
- elements: elements.map((element) => ({
- ...element,
- bbox: { ...element.bbox },
- })),
+ page,
+ totalPages,
+ totalElements,
+ elementType,
+ keywords: [...keywords],
+ documentId,
+ elements: (assignedPages[Math.max(0, page - 1)] ?? []).map((element) =>
+ this.cloneElement(element, element.id),
+ ),
};
}
getElementById(
conversationId: string,
tabId: number,
- snapshotId: number,
elementId: string,
): CachedElementLookup | undefined {
- const snapshotPage = this.getSnapshotPage(
- conversationId,
- tabId,
- snapshotId,
- );
- if (!snapshotPage) {
+ this.cleanupExpired();
+
+ const key = this.buildDocumentKey(conversationId, tabId);
+ const entry = this.documents.get(key);
+ if (!entry || entry.tabId !== tabId) {
return undefined;
}
- const element = snapshotPage.elements.find(
- (candidate) => candidate.id === elementId,
- );
+ this.touchEntry(entry);
+ const requestedElementId = elementId;
+ const normalizedRequestedElementId =
+ normalizeVisualElementIdInput(requestedElementId);
+ let resolvedElementId = requestedElementId;
+ let element = entry.elementsById.get(requestedElementId);
+
+ if (!element && normalizedRequestedElementId !== requestedElementId) {
+ element = entry.elementsById.get(normalizedRequestedElementId);
+ if (element) {
+ resolvedElementId = normalizedRequestedElementId;
+ }
+ }
+
if (!element) {
return undefined;
}
return {
- snapshotId,
- inventoryId: snapshotPage.inventoryId,
- page: snapshotPage.page,
- totalPages: snapshotPage.totalPages,
- totalElements: snapshotPage.totalElements,
- documentId: snapshotPage.documentId,
- elementType: snapshotPage.elementType,
- keywords: snapshotPage.keywords,
- element,
+ documentId: entry.documentId,
+ elementType: entry.elementType,
+ keywords: [...entry.keywords],
+ totalElements: entry.totalElements,
+ totalPages: entry.totalPages,
+ requestedElementId,
+ resolvedElementId,
+ normalizedRequestedElementId,
+ elementIdCorrected: requestedElementId !== resolvedElementId,
+ element: this.cloneElement(element, element.id),
};
}
- getSnapshotView(
+ getElementIdSuggestions(
conversationId: string,
tabId: number,
- snapshotId: number,
- ): HighlightSnapshotViewEntry | undefined {
- if (!conversationId) {
- return undefined;
- }
-
- const snapshotKey = this.buildSnapshotKey(
- conversationId,
- tabId,
- snapshotId,
- );
- const snapshot = this.snapshotViews.get(snapshotKey);
- if (!snapshot) {
- return undefined;
- }
+ elementId: string,
+ limit: number = 3,
+ ): ElementIdSuggestion[] {
+ this.cleanupExpired();
- if (snapshot.tabId !== tabId || this.isExpired(snapshot.createdAt)) {
- this.snapshotViews.delete(snapshotKey);
- console.log(
- `⏰ [ElementCache] Snapshot expired or mismatched for key ${snapshotKey}`,
- );
- return undefined;
+ const key = this.buildDocumentKey(conversationId, tabId);
+ const entry = this.documents.get(key);
+ if (!entry || entry.tabId !== tabId) {
+ return [];
}
- return snapshot;
- }
+ this.touchEntry(entry);
- getInventory(
- conversationId: string,
- tabId: number,
- inventoryId: number,
- ): HighlightInventoryEntry | undefined {
- if (!conversationId) {
- return undefined;
+ const normalizedRequestedElementId =
+ normalizeVisualElementIdInput(elementId).toUpperCase();
+ if (!normalizedRequestedElementId) {
+ return [];
}
- const inventoryKey = this.buildInventoryKey(
- conversationId,
- tabId,
- inventoryId,
+ const minimumMatchedPositions = Math.max(
+ 2,
+ normalizedRequestedElementId.length - 1,
);
- const inventory = this.inventories.get(inventoryKey);
- if (!inventory) {
- return undefined;
- }
- if (inventory.tabId !== tabId || this.isExpired(inventory.lastAccessedAt)) {
- this.removeInventoryByKey(inventoryKey);
- return undefined;
- }
+ return Array.from(entry.elementsById.values())
+ .map((element) => {
+ const candidateId = element.id.toUpperCase();
+ if (
+ candidateId === normalizedRequestedElementId ||
+ candidateId.length !== normalizedRequestedElementId.length
+ ) {
+ return null;
+ }
+
+ const matchedPositions = Array.from(candidateId).reduce(
+ (count, char, index) =>
+ count +
+ (char === normalizedRequestedElementId.charAt(index) ? 1 : 0),
+ 0,
+ );
+ const weightedPositionScore = Array.from(candidateId).reduce(
+ (score, char, index) =>
+ score +
+ (char === normalizedRequestedElementId.charAt(index)
+ ? normalizedRequestedElementId.length - index
+ : 0),
+ 0,
+ );
- return inventory;
+ if (matchedPositions < minimumMatchedPositions) {
+ return null;
+ }
+
+ return {
+ elementId: element.id,
+ html: compactHtmlSnippet(element.html || `<${element.tagName}>`),
+ matchedPositions,
+ weightedPositionScore,
+ };
+ })
+ .filter(
+ (
+ suggestion,
+ ): suggestion is ElementIdSuggestion & {
+ weightedPositionScore: number;
+ } => suggestion !== null,
+ )
+ .sort((left, right) => {
+ if (right.matchedPositions !== left.matchedPositions) {
+ return right.matchedPositions - left.matchedPositions;
+ }
+ if (right.weightedPositionScore !== left.weightedPositionScore) {
+ return right.weightedPositionScore - left.weightedPositionScore;
+ }
+ return left.elementId.localeCompare(right.elementId);
+ })
+ .map(
+ ({ weightedPositionScore: _weightedPositionScore, ...suggestion }) =>
+ suggestion,
+ )
+ .slice(0, Math.max(0, limit));
}
invalidate(conversationId: string, tabId?: number): void {
- const inventoryPrefix =
- tabId !== undefined
- ? `${conversationId}:${tabId}:inventory:`
- : `${conversationId}:`;
- const snapshotPrefix =
- tabId !== undefined
- ? `${conversationId}:${tabId}:snapshot:`
- : `${conversationId}:`;
-
- const inventoryKeysToDelete = Array.from(this.inventories.keys()).filter(
- (key) => key.startsWith(inventoryPrefix),
- );
- const snapshotKeysToDelete = Array.from(this.snapshotViews.keys()).filter(
- (key) => key.startsWith(snapshotPrefix),
- );
+ const keysToDelete = Array.from(this.documents.keys()).filter((key) => {
+ if (tabId === undefined) {
+ return key.startsWith(`${conversationId}:`);
+ }
+ return key === this.buildDocumentKey(conversationId, tabId);
+ });
- for (const key of inventoryKeysToDelete) {
- this.inventories.delete(key);
- }
- for (const key of snapshotKeysToDelete) {
- this.snapshotViews.delete(key);
+ for (const key of keysToDelete) {
+ this.documents.delete(key);
}
- if (inventoryKeysToDelete.length > 0 || snapshotKeysToDelete.length > 0) {
+ if (keysToDelete.length > 0) {
const scope = tabId !== undefined ? `tab ${tabId}` : 'all tabs';
console.log(
- `🗑️ [ElementCache] Invalidated ${inventoryKeysToDelete.length} inventories and ${snapshotKeysToDelete.length} snapshots for conversation ${conversationId} (${scope})`,
+ `🗑️ [ElementCache] Invalidated ${keysToDelete.length} document caches for conversation ${conversationId} (${scope})`,
);
}
}
clearAll(): void {
- this.inventories.clear();
- this.snapshotViews.clear();
+ this.documents.clear();
console.log('🧹 [ElementCache] Cleared all caches');
}
get size(): number {
- return this.snapshotViews.size;
+ return this.documents.size;
}
}
export const elementCache = new ElementCacheImpl();
+
+function compactHtmlSnippet(html: string): string {
+ return html.replace(/\s+/g, ' ').trim().slice(0, 180);
+}
+
+export function buildElementCacheMissMessage(options: {
+ conversationId: string;
+ tabId: number;
+ elementId: string;
+ refreshHint?: string;
+}): string {
+ const {
+ conversationId,
+ tabId,
+ elementId,
+ refreshHint = 'Call highlight_elements() again to refresh the element cache.',
+ } = options;
+ const normalizedElementId = normalizeVisualElementIdInput(elementId);
+ const suggestions = elementCache.getElementIdSuggestions(
+ conversationId,
+ tabId,
+ normalizedElementId,
+ );
+
+ const baseMessage =
+ normalizedElementId && normalizedElementId !== elementId
+ ? `Element '${elementId}' was interpreted as '${normalizedElementId}' for visual-safe ID matching, but no cached element matched.`
+ : `Element '${elementId}' not found in cache.`;
+ const ttlMessage = `Highlight caches expire after ${ELEMENT_CACHE_TTL_DESCRIPTION}. ${refreshHint}`;
+
+ if (suggestions.length === 0) {
+ return `${baseMessage} ${ttlMessage}`;
+ }
+
+ const suggestedIds = suggestions
+ .map((suggestion) => `'${suggestion.elementId}'`)
+ .join(', ');
+ const suggestedHtml = suggestions
+ .map((suggestion) => `${suggestion.elementId}: ${suggestion.html}`)
+ .join(' | ');
+
+ return `${baseMessage} ${ttlMessage} Maybe try ${suggestedIds}. Candidate HTML: ${suggestedHtml}`;
+}
diff --git a/extension/src/commands/element-id.ts b/extension/src/commands/element-id.ts
index 57513be..f312eea 100644
--- a/extension/src/commands/element-id.ts
+++ b/extension/src/commands/element-id.ts
@@ -1,17 +1,167 @@
import type { InteractiveElement } from '../types';
+export const ELEMENT_ID_CHARSET = '123456789ACDEFHJKMNOPQRTUVWXY';
+export const ELEMENT_ID_LENGTH = 3;
+const ELEMENT_ID_SPACE = ELEMENT_ID_CHARSET.length ** ELEMENT_ID_LENGTH;
+const NORMALIZABLE_ELEMENT_ID_PATTERN = /^[0-9A-Za-z]{3}$/;
+const AMBIGUOUS_ELEMENT_ID_CHAR_MAP: Record = {
+ '0': 'O',
+ o: 'O',
+ O: 'O',
+ i: '1',
+ I: '1',
+ l: '1',
+ L: '1',
+ z: '2',
+ Z: '2',
+ s: '5',
+ S: '5',
+ g: '6',
+ G: '6',
+ b: '8',
+ B: '8',
+};
+
+function encodeFixedVisualId(value: number): string {
+ let remaining = value;
+ const chars = Array.from(
+ { length: ELEMENT_ID_LENGTH },
+ () => ELEMENT_ID_CHARSET[0],
+ );
+
+ for (let index = ELEMENT_ID_LENGTH - 1; index >= 0; index -= 1) {
+ chars[index] = ELEMENT_ID_CHARSET[remaining % ELEMENT_ID_CHARSET.length];
+ remaining = Math.floor(remaining / ELEMENT_ID_CHARSET.length);
+ }
+
+ return chars.join('');
+}
+
+/**
+ * Generate a short stable hash from a selector and optional HTML content.
+ *
+ * Uses FNV-1a for speed and reasonable distribution, then projects into the
+ * fixed 3-character visual-safe ID space used by highlight labels.
+ */
+export function generateShortHash(
+ cssPath: string,
+ html?: string,
+ salt: number = 0,
+): string {
+ const FNV_PRIME = 0x01000193;
+ const FNV_OFFSET = 0x811c9dc5;
+
+ let input = html ? `${cssPath}:${html}` : cssPath;
+ if (salt > 0) {
+ input = `${input}:${salt}`;
+ }
+
+ let hash = FNV_OFFSET;
+ for (let index = 0; index < input.length; index += 1) {
+ hash ^= input.charCodeAt(index);
+ hash = Math.imul(hash, FNV_PRIME);
+ }
+
+ return encodeFixedVisualId((hash >>> 0) % ELEMENT_ID_SPACE);
+}
+
+export function generateUniqueHash(
+ cssPath: string,
+ existingHashes: Set,
+ html?: string,
+ maxAttempts: number = 512,
+): { hash: string; salt: number } {
+ let salt = 0;
+
+ while (salt < maxAttempts) {
+ const hash = generateShortHash(cssPath, html, salt);
+ if (!existingHashes.has(hash)) {
+ return { hash, salt };
+ }
+ salt += 1;
+ }
+
+ const fallbackSalt = Date.now();
+ return {
+ hash: generateShortHash(cssPath, html, fallbackSalt),
+ salt: fallbackSalt,
+ };
+}
+
+export function normalizeVisualElementIdInput(value: string): string {
+ const compact = value.trim().replace(/\s+/g, '');
+ if (!compact) {
+ return '';
+ }
+
+ if (!NORMALIZABLE_ELEMENT_ID_PATTERN.test(compact)) {
+ return compact;
+ }
+
+ return compact
+ .split('')
+ .map((char) => {
+ const mapped = AMBIGUOUS_ELEMENT_ID_CHAR_MAP[char];
+ if (mapped) {
+ return mapped;
+ }
+
+ return char.toUpperCase();
+ })
+ .join('');
+}
+
+export function buildElementIdentityKey(element: InteractiveElement): string {
+ return `${element.selector}\u0000${element.html ?? ''}`;
+}
+
/**
- * Reassign element IDs to page-local sequential numbers.
+ * Assign short hash IDs that stay stable for the same selector/content.
*
- * IDs are intentionally page-local: each highlight snapshot page starts at 1
- * again, and the page-local IDs must be paired with highlight_snapshot_id.
+ * IDs are opaque references, not sequence numbers.
*/
-export function assignSequentialElementIds(
+export function assignHashedElementIds(
elements: InteractiveElement[],
): InteractiveElement[] {
- return elements.map((element, index) => ({
- ...element,
- bbox: { ...element.bbox },
- id: String(index + 1),
- }));
+ const existingHashes = new Set();
+ const assignedIds = new Array(elements.length);
+
+ const elementsByStableKey = elements
+ .map((element, index) => ({
+ element,
+ index,
+ identityKey: buildElementIdentityKey(element),
+ }))
+ .sort((left, right) => {
+ const keyOrder = left.identityKey.localeCompare(right.identityKey);
+ if (keyOrder !== 0) {
+ return keyOrder;
+ }
+ return left.index - right.index;
+ });
+
+ for (const { element, index } of elementsByStableKey) {
+ const { hash } = generateUniqueHash(
+ element.selector,
+ existingHashes,
+ element.html,
+ );
+ existingHashes.add(hash);
+ assignedIds[index] = hash;
+ }
+
+ return elements.map((element, index) => {
+ const assignedId = assignedIds[index];
+ if (!assignedId) {
+ throw new Error(
+ `Failed to assign an element ID for selector "${element.selector}"`,
+ );
+ }
+
+ return {
+ ...element,
+ bbox: { ...element.bbox },
+ id: assignedId,
+ };
+ });
}
diff --git a/extension/src/commands/label-constants.ts b/extension/src/commands/label-constants.ts
index 9679b11..36862b3 100644
--- a/extension/src/commands/label-constants.ts
+++ b/extension/src/commands/label-constants.ts
@@ -4,6 +4,7 @@
*/
export const LABEL_FONT_SIZE = 16;
-export const LABEL_PADDING = 5;
-export const LABEL_HEIGHT = LABEL_FONT_SIZE + LABEL_PADDING * 2; // 26px
+export const LABEL_PADDING = 3;
+export const LABEL_HEIGHT = LABEL_FONT_SIZE + LABEL_PADDING * 2; // 22px
export const MAX_LABEL_WIDTH = 120; // Maximum label width for collision detection
+export const LABEL_FONT_FAMILY = 'Arial';
diff --git a/extension/src/commands/single-highlight.ts b/extension/src/commands/single-highlight.ts
index 98c3a54..2b1237f 100644
--- a/extension/src/commands/single-highlight.ts
+++ b/extension/src/commands/single-highlight.ts
@@ -6,15 +6,23 @@
import type { InteractiveElement } from '../types';
// Visual style for single-element confirmation
-const CONFIRMATION_COLOR = '#FF6600'; // Orange border
+const CONFIRMATION_COLOR = '#FFD400'; // Yellow border
+const CONFIRMATION_TEXT_COLOR = '#111111';
+const CONFIRMATION_BANNER_COLOR = 'rgba(255, 212, 0, 0.5)';
+const CONFIRMATION_BANNER_BORDER_COLOR = 'rgba(17, 17, 17, 0.18)';
const BASE_BOX_PADDING = 2;
-const BASE_LINE_WIDTH = 3;
+const BASE_LINE_WIDTH = 4;
const BASE_CONTEXT_PADDING_X = 96;
const BASE_CONTEXT_PADDING_Y = 112;
const BASE_MIN_CROP_WIDTH = 520;
const BASE_MIN_CROP_HEIGHT = 320;
const MIN_CROP_WIDTH_RATIO = 0.58;
const MIN_CROP_HEIGHT_RATIO = 0.58;
+const BASE_BANNER_FONT_SIZE = 22;
+const BASE_BANNER_PADDING_X = 12;
+const BASE_BANNER_PADDING_Y = 12;
+const BASE_BANNER_MARGIN = 14;
+const BASE_BANNER_GAP = 12;
interface DeviceRect {
x: number;
@@ -39,7 +47,12 @@ interface ConfirmationPreviewLayout {
export async function highlightSingleElement(
screenshotDataUrl: string,
element: InteractiveElement,
- options?: { scale?: number; viewportWidth?: number; viewportHeight?: number },
+ options?: {
+ intendedAction?: 'click' | 'keyboard_input';
+ scale?: number;
+ viewportWidth?: number;
+ viewportHeight?: number;
+ },
): Promise {
console.log(
`🎨 [SingleHighlight] Drawing highlight for element ${element.id}...`,
@@ -199,6 +212,12 @@ export async function highlightSingleElement(
// Draw the single element bounding box
drawSingleBoundingBox(ctx, previewLayout.element, scale);
+ drawConfirmationBanner(
+ ctx,
+ previewLayout.element,
+ options?.intendedAction,
+ scale,
+ );
const resultBlob = await canvas.convertToBlob({ type: 'image/png' });
@@ -224,6 +243,102 @@ export async function highlightSingleElement(
}
}
+export function formatConfirmationOperationLabel(
+ intendedAction?: 'click' | 'keyboard_input',
+): string {
+ switch (intendedAction) {
+ case 'click':
+ return 'click';
+ case 'keyboard_input':
+ return 'type into';
+ default:
+ return 'interact with';
+ }
+}
+
+export function getConfirmationPromptText(
+ intendedAction?: 'click' | 'keyboard_input',
+): string {
+ return `Is this the element you wanted to ${formatConfirmationOperationLabel(intendedAction)}?`;
+}
+
+export function calculateConfirmationBannerLayout(options: {
+ canvasWidth: number;
+ canvasHeight: number;
+ elementRect: DeviceRect;
+ message: string;
+ scale: number;
+ textWidth?: number;
+}): DeviceRect {
+ const { canvasWidth, canvasHeight, elementRect, message, scale, textWidth } =
+ options;
+ const fontSize = Math.max(16, Math.round(BASE_BANNER_FONT_SIZE * scale));
+ const paddingX = Math.max(12, Math.round(BASE_BANNER_PADDING_X * scale));
+ const paddingY = Math.max(8, Math.round(BASE_BANNER_PADDING_Y * scale));
+ const margin = Math.max(10, Math.round(BASE_BANNER_MARGIN * scale));
+ const gap = Math.max(8, Math.round(BASE_BANNER_GAP * scale));
+ const estimatedTextWidth = Math.ceil(message.length * fontSize * 0.6);
+ const resolvedTextWidth = Math.ceil(textWidth ?? estimatedTextWidth);
+ const width = Math.min(
+ canvasWidth - margin * 2,
+ resolvedTextWidth + paddingX * 2,
+ );
+ const height = fontSize + paddingY * 2;
+
+ const clampX = (value: number): number =>
+ clamp(value, margin, Math.max(margin, canvasWidth - width - margin));
+ const clampY = (value: number): number =>
+ clamp(value, margin, Math.max(margin, canvasHeight - height - margin));
+ const centeredX = clampX(elementRect.x + elementRect.width / 2 - width / 2);
+ const centeredY = clampY(elementRect.y + elementRect.height / 2 - height / 2);
+
+ if (elementRect.y - gap - height >= margin) {
+ return {
+ x: centeredX,
+ y: elementRect.y - gap - height,
+ width,
+ height,
+ };
+ }
+
+ if (
+ elementRect.y + elementRect.height + gap + height <=
+ canvasHeight - margin
+ ) {
+ return {
+ x: centeredX,
+ y: elementRect.y + elementRect.height + gap,
+ width,
+ height,
+ };
+ }
+
+ if (elementRect.x + elementRect.width + gap + width <= canvasWidth - margin) {
+ return {
+ x: elementRect.x + elementRect.width + gap,
+ y: centeredY,
+ width,
+ height,
+ };
+ }
+
+ if (elementRect.x - gap - width >= margin) {
+ return {
+ x: elementRect.x - gap - width,
+ y: centeredY,
+ width,
+ height,
+ };
+ }
+
+ return {
+ x: centeredX,
+ y: clampY(elementRect.y + elementRect.height + gap),
+ width,
+ height,
+ };
+}
+
/**
* Calculate a focused preview crop around the target element.
*/
@@ -311,10 +426,58 @@ function drawSingleBoundingBox(
`[SingleHighlight] Drawing confirmation bbox at (${x}, ${y}, ${width}, ${height}) scale=${scale}`,
);
- // Draw bounding box with orange color
+ // Draw bounding box with a bright yellow confirmation color.
+ ctx.save();
ctx.strokeStyle = CONFIRMATION_COLOR;
ctx.lineWidth = lineWidth;
+ ctx.shadowColor = 'rgba(255, 212, 0, 0.7)';
+ ctx.shadowBlur = 12 * scale;
ctx.strokeRect(x, y, width, height);
+ ctx.restore();
+}
+
+function drawConfirmationBanner(
+ ctx: OffscreenCanvasRenderingContext2D,
+ elementRect: DeviceRect,
+ intendedAction: 'click' | 'keyboard_input' | undefined,
+ scale: number,
+): void {
+ const message = getConfirmationPromptText(intendedAction);
+ const fontSize = Math.max(16, Math.round(BASE_BANNER_FONT_SIZE * scale));
+ const paddingX = Math.max(12, Math.round(BASE_BANNER_PADDING_X * scale));
+
+ ctx.save();
+ ctx.font = `700 ${fontSize}px sans-serif`;
+ ctx.textBaseline = 'middle';
+ const measuredTextWidth = ctx.measureText(message).width;
+ const bannerRect = calculateConfirmationBannerLayout({
+ canvasWidth: ctx.canvas.width,
+ canvasHeight: ctx.canvas.height,
+ elementRect,
+ message,
+ scale,
+ textWidth: measuredTextWidth,
+ });
+
+ ctx.fillStyle = CONFIRMATION_BANNER_COLOR;
+ ctx.fillRect(bannerRect.x, bannerRect.y, bannerRect.width, bannerRect.height);
+ ctx.strokeStyle = CONFIRMATION_BANNER_BORDER_COLOR;
+ ctx.lineWidth = Math.max(1, scale);
+ ctx.strokeRect(
+ bannerRect.x,
+ bannerRect.y,
+ bannerRect.width,
+ bannerRect.height,
+ );
+
+ ctx.fillStyle = CONFIRMATION_TEXT_COLOR;
+ ctx.fillText(
+ message,
+ bannerRect.x + paddingX,
+ bannerRect.y + bannerRect.height / 2,
+ bannerRect.width - paddingX * 2,
+ );
+ ctx.restore();
}
function clamp(value: number, min: number, max: number): number {
diff --git a/extension/src/commands/visual-highlight.ts b/extension/src/commands/visual-highlight.ts
index 574e111..eba80fd 100644
--- a/extension/src/commands/visual-highlight.ts
+++ b/extension/src/commands/visual-highlight.ts
@@ -10,7 +10,7 @@ import type {
HighlightOptions,
} from '../types';
import { LABEL_FONT_SIZE, LABEL_PADDING } from './label-constants';
-import { getLabelDimensions } from '../utils/label-geometry';
+import { getLabelDimensions, getLabelFont } from '../utils/label-geometry';
/**
* Color mapping for different element types (with transparency for label backgrounds)
@@ -497,8 +497,7 @@ function drawLabel(
const fontSize = Math.round(LABEL_FONT_SIZE * scale);
const labelPadding = Math.round(LABEL_PADDING * scale);
- // Set font before measuring text
- ctx.font = `bold ${fontSize}px Arial`;
+ ctx.font = getLabelFont(fontSize);
// Measure text width
let labelX: number;
diff --git a/extension/src/types.ts b/extension/src/types.ts
index 0685fd4..56526e7 100644
--- a/extension/src/types.ts
+++ b/extension/src/types.ts
@@ -130,10 +130,8 @@ export interface HighlightElementsCommand extends BaseCommand {
export interface ClickElementCommand extends BaseCommand {
type: 'click_element';
- /** Element ID from the referenced highlight snapshot response (page-local numeric string) */
+ /** Element ID from highlight response (short opaque string) */
element_id: string;
- /** Highlight snapshot ID returned by highlight_elements */
- highlight_snapshot_id: number;
/**
* Target tab ID (optional - auto-resolved from conversation if not provided)
* Note: Required in Python models, but optional here as extension auto-resolves it
@@ -143,10 +141,8 @@ export interface ClickElementCommand extends BaseCommand {
export interface HoverElementCommand extends BaseCommand {
type: 'hover_element';
- /** Element ID from the referenced highlight snapshot response (page-local numeric string) */
+ /** Element ID from highlight response (short opaque string) */
element_id: string;
- /** Highlight snapshot ID returned by highlight_elements */
- highlight_snapshot_id: number;
/**
* Target tab ID (optional - auto-resolved from conversation if not provided)
* Note: Required in Python models, but optional here as extension auto-resolves it
@@ -156,12 +152,10 @@ export interface HoverElementCommand extends BaseCommand {
export interface ScrollElementCommand extends BaseCommand {
type: 'scroll_element';
- /** Element ID from the referenced highlight snapshot response (page-local numeric string). If not provided, scrolls the entire page */
+ /** Element ID from highlight response (short opaque string). If not provided, scrolls the entire page */
element_id?: string;
- /** Highlight snapshot ID returned by highlight_elements. Required when element_id is provided */
- highlight_snapshot_id?: number;
direction?: ScrollDirection;
- /** Scroll amount relative to page/element height (0.5 = half page, 1.0 = full page) */
+ /** Scroll amount relative to the current scroll target's visible size (0.5 = half target, 1.0 = one full visible span) */
scroll_amount?: number;
/**
* Target tab ID (optional - auto-resolved from conversation if not provided)
@@ -174,10 +168,8 @@ export type SwipeDirection = 'next' | 'prev';
export interface SwipeElementCommand extends BaseCommand {
type: 'swipe_element';
- /** Element ID from the referenced highlight snapshot response (page-local numeric string) */
+ /** Element ID from highlight response (short opaque string) */
element_id: string;
- /** Highlight snapshot ID returned by highlight_elements */
- highlight_snapshot_id: number;
direction?: SwipeDirection;
/** Number of swipe steps for carousel/swiper interactions */
swipe_count?: number;
@@ -190,10 +182,8 @@ export interface SwipeElementCommand extends BaseCommand {
export interface KeyboardInputCommand extends BaseCommand {
type: 'keyboard_input';
- /** Element ID from the referenced highlight snapshot response (page-local numeric string) */
+ /** Element ID from highlight response (short opaque string) */
element_id: string;
- /** Highlight snapshot ID returned by highlight_elements */
- highlight_snapshot_id: number;
text: string;
/**
* Target tab ID (optional - auto-resolved from conversation if not provided)
@@ -204,10 +194,8 @@ export interface KeyboardInputCommand extends BaseCommand {
export interface SelectElementCommand extends BaseCommand {
type: 'select_element';
- /** Element ID from the referenced highlight snapshot response (page-local numeric string) */
+ /** Element ID from highlight response (short opaque string) */
element_id: string;
- /** Highlight snapshot ID returned by highlight_elements */
- highlight_snapshot_id: number;
/** Option value(s) to select. Use string for single select, array for multi-select (