diff --git a/AGENTS.md b/AGENTS.md index 93244e4..0feb22c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -165,6 +165,13 @@ OpenBrowser uses Jinja2 templates for agent prompts, enabling dynamic content in - **Clean output**: `trim_blocks=True` and `lstrip_blocks=True` remove extra whitespace - **Caching**: Templates are cached after first load for performance +### Model Profile Differences +- Model profile is resolved from session metadata and exposed to prompt rendering as `model_profile` / `small_model`; see `server/agent/manager.py` and `server/agent/tools/prompt_context.py` +- Tool prompt variants are split by model profile under `server/agent/prompts/small_model/` and `server/agent/prompts/big_model/` +- Small-model browser guidance intentionally avoids `keywords` fallback and leans harder on same-mode highlight pagination when dense UI may be split across collision-aware pages +- Observation rendering also differs by model profile: large models keep clickable highlights compact (`... and N clickable elements`), while small models include clickable element HTML in the LLM-visible observation text for extra semantic grounding +- The small-model clickable-observation branch is implemented in `server/agent/tools/base.py`; the per-conversation `small_model` flag is attached in `server/agent/tools/browser_executor.py` + ### Keyword Discipline - Highlight pagination remains the default discovery flow for controls and dense UI - After any significant page-state change, restart discovery with `highlight_elements(element_type="any")` before choosing the next element @@ -206,10 +213,13 @@ Elements are paginated to ensure **no visual overlap** in each screenshot: - Reason: OpenBrowser intentionally keeps automated tabs in the browser background, and Chrome may heavily throttle hidden-tab timers. A page-side `setTimeout` stability loop can therefore take far longer than its nominal budget and become the main cause of highlight timeouts. - In practice, the main cause of unstable first-highlight screenshots is often **missing warmup**, not a bad readiness classifier. A background tab may answer lightweight `Runtime.evaluate` probes while still sitting in a partially painted / partially decoded state. - A screenshot-style warmup is therefore the default precondition for `highlight_elements`. It helps force hidden-tab paint/compositor/image-decode work before interactive-element detection runs. +- All highlight warmup and highlight screenshot captures now reuse the same screenshot wake-up profile as `tab view` (`TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS`) instead of a weaker highlight-only profile. The goal is consistency: if a screenshot is needed to wake the page, the highlight path should not use a different, less effective capture mode. +- For navigation-driven default observations such as `tab init`, `tab open`, `tab switch`, `tab refresh`, `tab back`, and `tab forward`, the extension now performs an **internal raw screenshot prime** first, then runs the normal highlight warmup + detection + highlighted screenshot flow. That raw prime screenshot is only for waking the background page and is **not** returned to the agent. - If `highlight_elements` keeps returning `not_ready` but `tab view` immediately makes the next highlight succeed, treat that as a warmup issue first. - The extension samples viewport readiness signals once per attempt: document readiness, viewport text/media density, pending images, and loading placeholders such as skeleton/shimmer/spinner indicators. - Readiness is graded as `ready`, `provisionally_ready`, or `not_ready`. - If readiness is `not_ready`, the extension performs only a couple of short **background-side** retries before proceeding or returning the latest result. +- The screenshot-side wake-up itself also runs a bounded pre-capture warmup loop. It touches visible viewport media, samples readiness, and retries only a couple of times when the snapshot still looks `not_ready`. - After screenshot capture, highlight still runs a **consistency check**. This is a drift detector, not a loading detector: it verifies whether sampled highlighted elements moved or disappeared between detection and screenshot. - Design rule: prefer snapshot classification plus bounded retries; avoid depending on repeated timers inside the target page for highlight stability. @@ -321,10 +331,10 @@ OpenBrowser has explicit screenshot control for maximum flexibility: | Command | Auto-Screenshot | Notes | |---------|------------------|-------| -| `tab init` | Yes | Verify page load | -| `tab open` | Yes | Verify new tab | -| `tab switch` | Yes | Verify tab switch | -| `tab refresh` | Yes | Verify refresh result | +| `tab init` | Yes | Returns default `highlight any page 1`; first does an internal raw screenshot prime to wake the page | +| `tab open` | Yes | Returns default `highlight any page 1`; first does an internal raw screenshot prime to wake the page | +| `tab switch` | Yes | Returns default `highlight any page 1`; first does an internal raw screenshot prime to wake the page | +| `tab refresh` | Yes | Returns default `highlight any page 1`; first does an internal raw screenshot prime to wake the page | |---------|------------------|-------| | `highlight_elements` | Yes | Visual overlay for element selection | | `click_element` | Yes | Verify interaction result | @@ -367,7 +377,9 @@ Automated testing framework for evaluating AI agent performance on browser autom ``` OpenBrowser/eval/ ├── evaluate_browser_agent.py # Main evaluation entry point -├── dataset/ # YAML test case definitions (9 tests) +├── dataset/ # YAML test case definitions (12 tests) +│ ├── bluebook_simple.yaml # BlueBook search and like test +│ ├── bluebook_complex.yaml # BlueBook multi-image reply test │ ├── gbr.yaml # GBR search test │ ├── gbr_detailed.yaml # GBR detailed search test │ ├── techforum.yaml # TechForum upvote test @@ -376,10 +388,11 @@ OpenBrowser/eval/ │ ├── cloudstack_interactive.yaml # CloudStack DAS interactive test │ ├── finviz_simple.yaml # Finviz simple screener test │ ├── finviz_complex.yaml # Finviz multi-filter test -│ └── dataflow.yaml # DataFlow visual challenge test +│ ├── dataflow.yaml # DataFlow visual challenge test +│ └── northstar_add_bag.yaml # Combined fit-guide and add-to-bag geometry test ├── output/ # Generated results and images ├── server.py # Mock websites server with tracking API -└── (mock websites: gbr/, techforum/, cloudstack/, dataflow/, finviz/) +└── (mock websites: gbr/, techforum/, cloudstack/, dataflow/, finviz/, bluebook/, northstar/) ``` ### Key Features @@ -550,17 +563,19 @@ Tests are defined in YAML format with: | `gbr` | GBR Search Test | easy | 400s (~6.7min) | 0.8 RMB | Search for "fed" related news | | `finviz_simple` | Finviz Simple Screener Test | easy | 300s (5min) | 0.8 RMB | Filter stocks by market cap over 10 billion | | `techforum` | TechForum Upvote Test | medium | 300s (5min) | 0.5 RMB | Upvote the first AI-related post | +| `bluebook_simple` | BlueBook Search And Like Test | medium | 300s (5min) | 0.6 RMB | Search for the target note and like it | | `gbr_detailed` | GBR Detailed Search & Read Test | medium | 600s (10min) | 1.5 RMB | Search for "fed", click into each article (3 articles), and summarize content | | `finviz_complex` | Finviz Multi-Filter Screener Test | medium | 400s (~6.7min) | 1.0 RMB | Multi-filter stock screener: market cap, P/E, volume | | `dataflow` | DataFlow Visual Challenge Test | medium | 300s (5min) | 0.5 RMB | Dashboard interactions: settings, reports, navigation | +| `northstar_add_bag` | Northstar Fit Guide + Add To Bag Test | medium | 540s (9min) | 1.2 RMB | Save the Care & Wash fit guide section, then choose size M and add the shell to bag | #### Advanced Tests | ID | Name | Difficulty | Time Limit | Cost Limit | Description | |----|------|------------|------------|------------|-------------| +| `bluebook_complex` | BlueBook Multi-Image Reply Test | hard | 500s (~8.3min) | 1.2 RMB | Search for the OpenClaw note, view all images, and leave a quick comment | | `cloudstack` | CloudStack DAS Agent Test | hard | 500s (~8.3min) | 1.2 RMB | Find DAS console and greet DAS agent | | `techforum_reply` | TechForum Comment Reply Test | hard | 500s (~8.3min) | 1.0 RMB | Open comments, find "Graduate Student" comment, reply with paper name | | `cloudstack_interactive` | CloudStack DAS Interactive Test | very hard | 700s (~11.7min) | 2.0 RMB | Multi-turn conversation with DAS agent: greeting, system status, storage check | - #### Event Matching Notes - **Standard events**: `page_view`, `click`, `input`, `submit`, `hover`, `scroll`, `answer_action` - **Special event types**: @@ -586,6 +601,10 @@ Criteria match tracked events using flexible pattern matching: - Page URLs, input values, custom fields - Alternative conditions for flexible scoring +### Deferred Prompt And Observation Follow-Ups +- Observation design: add structured geometry hints such as `partly_visible`, `near_viewport_edge`, `occluded_by_sticky_ui`, explicit scroll-container identity, and structured stale-element causes before expanding prompt text again. +- Prompt compaction: after geometry-focused eval results stabilize, reduce duplicated rules between the SDK system prompt and tool prompts so tool templates keep only tool-local contracts and recovery guidance. + ## NOTES - **Git dependencies:** `openhands-sdk` and `openhands-tools` from git subdirectories diff --git a/eval/cloudstack/js/das-agent.js b/eval/cloudstack/js/das-agent.js index f2a66d4..101a9cc 100644 --- a/eval/cloudstack/js/das-agent.js +++ b/eval/cloudstack/js/das-agent.js @@ -42,6 +42,75 @@ document.addEventListener('DOMContentLoaded', function() { this.style.height = (this.scrollHeight) + 'px'; }); } + + function normalizeMessage(message) { + return message.toLowerCase().replace(/\s+/g, ' ').trim(); + } + + function containsAny(text, keywords) { + return keywords.some(keyword => text.includes(keyword)); + } + + function buildAgentReply(message) { + const normalizedMessage = normalizeMessage(message); + + const greetingKeywords = [ + 'hello', + 'hi', + 'hey', + 'greetings', + 'good morning', + 'good afternoon', + 'good evening' + ]; + const statusKeywords = [ + 'status', + 'system', + 'health', + 'report', + 'running', + 'current state', + 'how are you' + ]; + const storageKeywords = [ + 'storage', + 'disk', + 'space', + 'capacity', + 'usage', + 'utilization', + 'volume' + ]; + const cpuKeywords = ['cpu', 'load']; + const memoryKeywords = ['memory', 'ram']; + const alertKeywords = ['alert', 'warning', 'alarm', 'incident', 'issue']; + + if (containsAny(normalizedMessage, storageKeywords)) { + return 'Storage usage check complete: primary cluster is at 68% used, log volume is at 42%, and free capacity is enough for current workload. No immediate storage risk detected.'; + } + + if (containsAny(normalizedMessage, statusKeywords)) { + return 'Current system status is stable. Core database services are online, replication delay is within threshold, and there are no critical incidents at the moment.'; + } + + if (containsAny(normalizedMessage, cpuKeywords)) { + return 'CPU load is moderate right now, averaging around 34% across the main database nodes. No hot node is currently flagged.'; + } + + if (containsAny(normalizedMessage, memoryKeywords)) { + return 'Memory usage is healthy. Working set pressure is low and cache hit rate remains within the expected range.'; + } + + if (containsAny(normalizedMessage, alertKeywords)) { + return 'There are no active P1 alerts. I only see a few low-priority optimization suggestions related to slow-query tuning and index review.'; + } + + if (containsAny(normalizedMessage, greetingKeywords)) { + return 'Hello. I am DAS Agent. I can help with system status, storage usage, alerts, and database operations checks.'; + } + + return 'I can help with database operations. You can ask me for current system status, storage usage, performance health, or active alerts.'; + } // Send message function function sendMessage() { @@ -74,7 +143,7 @@ document.addEventListener('DOMContentLoaded', function() { // Simulate agent response delay setTimeout(function() { - addAgentMessage('Hello, I am DAS Agent'); + addAgentMessage(buildAgentReply(message)); sendBtn.disabled = false; sendBtn.textContent = 'Send Message'; }, 800); diff --git a/eval/dataset/northstar_add_bag.yaml b/eval/dataset/northstar_add_bag.yaml new file mode 100644 index 0000000..7da7ca2 --- /dev/null +++ b/eval/dataset/northstar_add_bag.yaml @@ -0,0 +1,43 @@ +id: northstar_add_bag +name: "Northstar Fit Guide + Add To Bag Test" +difficulty: medium +description: "Open the fit guide, save the Care & Wash section, then reposition the purchase rail, choose size M, and add the shell to bag." +start_url: "http://localhost:16605/northstar/" +instruction: "On the Northstar Outfitters Commuter Shell page, open the fit guide, scroll inside it until the Care & Wash section is centered, save the guide from that section, then choose size Medium and add the jacket to your bag." +time_limit: 540.0 +cost_limit: 1.2 + +criteria: + - type: open_fit_guide + description: "Open the fit guide drawer" + points: 0.5 + expected: + event_type: fit_guide_open + page: "/northstar/" + drawer: "fit-guide" + + - type: save_fit_guide + description: "Save the fit guide from the Care & Wash section" + points: 2.0 + expected: + event_type: fit_guide_save + page: "/northstar/" + section: "care-wash" + + - type: select_medium_size + description: "Select size Medium" + points: 1.5 + expected: + event_type: product_size_select + page: "/northstar/" + productId: "commuter-shell" + size: "M" + + - type: add_to_bag + description: "Add the selected shell to bag" + points: 2.0 + expected: + event_type: product_add_to_bag + page: "/northstar/" + productId: "commuter-shell" + size: "M" diff --git a/eval/evaluate_browser_agent.py b/eval/evaluate_browser_agent.py index d23d99c..047743a 100644 --- a/eval/evaluate_browser_agent.py +++ b/eval/evaluate_browser_agent.py @@ -7,26 +7,27 @@ """ import argparse -import asyncio +import atexit import base64 +import datetime +import fcntl import json +import logging import os -import sys -import time -import yaml -import requests -from pathlib import Path -from typing import Dict, List, Any, Optional, Tuple -from dataclasses import dataclass -import subprocess import shutil import signal -import atexit -import logging -import datetime +import sys import threading -import fcntl +import time +from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait from contextlib import AbstractContextManager +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple +from urllib.parse import urlparse + +import requests +import yaml logger = logging.getLogger(__name__) @@ -108,6 +109,18 @@ class MessageRunResult: error: Optional[str] = None +@dataclass(frozen=True) +class ScheduledJob: + """One scheduled automated evaluation job.""" + + target_index: int + test_index: int + target: LLMTarget + test_case: TestCase + model_key: str + site_bucket: str + + class OpenBrowserClient: """Client for OpenBrowser server API""" @@ -563,18 +576,24 @@ def health_check(self) -> bool: except requests.exceptions.RequestException: return False - def clear_events(self) -> bool: - """Clear all tracked events""" + def clear_events(self, site: Optional[str] = None) -> bool: + """Clear tracked events, optionally scoped to one mock site.""" try: - response = self.session.get(f"{self.base_url}/api/events/clear", timeout=2) + params = {"site": site} if site else None + response = self.session.get( + f"{self.base_url}/api/events/clear", params=params, timeout=2 + ) return response.status_code == 200 except Exception: return False - def get_events(self) -> List[Dict[str, Any]]: - """Get all tracked events""" + def get_events(self, site: Optional[str] = None) -> List[Dict[str, Any]]: + """Get tracked events, optionally scoped to one mock site.""" try: - response = self.session.get(f"{self.base_url}/api/events", timeout=5) + params = {"site": site} if site else None + response = self.session.get( + f"{self.base_url}/api/events", params=params, timeout=5 + ) if response.status_code == 200: data = response.json() return data.get("events", []) @@ -744,6 +763,69 @@ def __init__(self, chrome_uuid: Optional[str] = None): self.current_model: Optional[str] = None # Current model being tested self.current_target: Optional[LLMTarget] = None # Current CLI target + @staticmethod + def _sanitize_model_name(model_name: str) -> str: + """Make a model name safe for filesystem paths.""" + return model_name.replace("/", "_").replace(":", "_") + + @staticmethod + def _get_model_key(target: LLMTarget) -> str: + """Return the concurrency key for one target.""" + return target.model_name or target.alias or target.name + + @staticmethod + def _get_test_site_bucket(test_case: TestCase) -> str: + """Infer the mock-site bucket from the test start URL.""" + parsed = urlparse(test_case.start_url) + segments = [segment for segment in parsed.path.split("/") if segment] + if segments: + return segments[0] + return test_case.id + + def _ensure_model_output_dir(self, model_name: Optional[str]) -> Path: + """Ensure the per-model output directory exists.""" + if self.output_dir is None: + timestamp = time.strftime("%Y%m%d_%H%M%S") + self.output_dir = OUTPUT_BASE_DIR / timestamp + self.output_dir.mkdir(parents=True, exist_ok=True) + logger.info(f"Created base output directory: {self.output_dir}") + + if not model_name: + return self.output_dir + + model_output_dir = self.output_dir / self._sanitize_model_name(model_name) + model_output_dir.mkdir(exist_ok=True) + return model_output_dir + + def _build_error_result( + self, test_case: TestCase, model_name: Optional[str], error: str + ) -> TestResult: + """Build a failed test result for scheduler/runtime errors.""" + max_score = sum(criterion.get("points", 1) for criterion in test_case.criteria) + return TestResult( + test_case=test_case, + passed=False, + score=0, + max_score=max_score, + events=[], + sse_events=[], + track_events=[], + images=[], + error=error, + duration=0.0, + cost=0.0, + efficiency_score=0.0, + usage_score=0.0, + total_score=0.0, + model=model_name, + ) + + def _create_worker_evaluator(self) -> "Evaluator": + """Create a short-lived evaluator with independent HTTP sessions.""" + worker = Evaluator(chrome_uuid=self.chrome_uuid) + worker.output_dir = self.output_dir + return worker + def resolve_targets(self, targets: List[LLMTarget]) -> List[LLMTarget]: """Resolve configured aliases to raw model names.""" llm_configs = self.openbrowser.get_llm_configs() @@ -856,61 +938,46 @@ def load_test_cases(self) -> List[TestCase]: return test_cases - def run_test(self, test_case: TestCase) -> TestResult: + def run_test( + self, + test_case: TestCase, + target: Optional[LLMTarget] = None, + model_name: Optional[str] = None, + ) -> TestResult: """Run a single test case""" - logger.info(f"Running test: {test_case.name}") + active_target = target or self.current_target + active_model_name = model_name or self.current_model + site_bucket = self._get_test_site_bucket(test_case) - # Ensure output directory exists with model subdirectory - if self.output_dir is None: - timestamp = time.strftime("%Y%m%d_%H%M%S") - self.output_dir = OUTPUT_BASE_DIR / timestamp - self.output_dir.mkdir(parents=True, exist_ok=True) - logger.info(f"Created base output directory: {self.output_dir}") + logger.info( + "Running test: %s [model=%s site=%s]", + test_case.name, + active_model_name, + site_bucket, + ) - # Create model-specific subdirectory if model is set - model_output_dir = self.output_dir - if self.current_model: - # Sanitize model name for filesystem - model_name_safe = self.current_model.replace("/", "_").replace(":", "_") - model_output_dir = self.output_dir / model_name_safe - model_output_dir.mkdir(exist_ok=True) - logger.info(f"Using model output directory: {model_output_dir}") + model_output_dir = self._ensure_model_output_dir(active_model_name) - # Clear previous events - self.eval_server.clear_events() + # Clear only the current mock-site event bucket. + self.eval_server.clear_events(site=site_bucket) # Create new conversation with current model conversation_id = self.openbrowser.create_conversation( - model_alias=self.current_target.alias if self.current_target else None, + model_alias=active_target.alias if active_target else None, ) if conversation_id: logger.debug(f"Created conversation: {conversation_id}") else: logger.warning( - f"Failed to create conversation for model {self.current_model}" - ) - max_score = sum( - criterion.get("points", 1) for criterion in test_case.criteria + f"Failed to create conversation for model {active_model_name}" ) - return TestResult( - test_case=test_case, - passed=False, - score=0, - max_score=max_score, - events=[], - sse_events=[], - track_events=[], - images=[], - error=( - f"Failed to create conversation for target {self.current_model}. " + return self._build_error_result( + test_case, + active_model_name, + ( + f"Failed to create conversation for target {active_model_name}. " "See logs for server response details." ), - duration=0.0, - cost=0.0, - efficiency_score=0.0, - usage_score=0.0, - total_score=0.0, - model=self.current_model, ) start_time = time.time() @@ -979,7 +1046,7 @@ def run_test(self, test_case: TestCase) -> TestResult: time.sleep(min(pending_event_wait, max(0.0, deadline - time.time()))) # Get tracking events - track_events = self.eval_server.get_events() + track_events = self.eval_server.get_events(site=site_bucket) # Save track events to file track_events_file = self._save_track_events( @@ -1033,7 +1100,7 @@ def run_test(self, test_case: TestCase) -> TestResult: total_score=total_score, sse_events_file=sse_events_file, track_events_file=track_events_file, - model=self.current_model, + model=active_model_name, ) finally: self._cleanup_openbrowser_conversation(conversation_id) @@ -1354,7 +1421,7 @@ def _check_criterion( logger.debug(f"Criterion matched by SSE event: {event.get('type')}") return True - logger.debug(f"Criterion not met") + logger.debug("Criterion not met") return False def _check_count_min_condition( @@ -1638,6 +1705,7 @@ def generate_report(self): def run_manual_test(self, test_case: TestCase) -> TestResult: """Run a test case in manual mode with human performing the same task as OpenBrowser""" logger.info(f"Running manual test: {test_case.name}") + site_bucket = self._get_test_site_bucket(test_case) # Ensure output directory exists if self.output_dir is None: @@ -1646,8 +1714,8 @@ def run_manual_test(self, test_case: TestCase) -> TestResult: self.output_dir.mkdir(parents=True, exist_ok=True) logger.info(f"Created output directory: {self.output_dir}") - # Clear previous events - self.eval_server.clear_events() + # Clear previous events for the current mock site only. + self.eval_server.clear_events(site=site_bucket) # Print test information print("\n" + "=" * 60) @@ -1656,14 +1724,14 @@ def run_manual_test(self, test_case: TestCase) -> TestResult: print("=" * 60) if test_case.start_url: - print(f"\n📋 Please open your browser and navigate to:") + print("\n📋 Please open your browser and navigate to:") print(f" {test_case.start_url}") print("Make sure the eval server is running (localhost:16605).") print("The browser should load the test page.") input("\nPress Enter when ready to continue...") # Show the SAME instruction that would be given to OpenBrowser - print(f"\n📝 Task Instruction (same as given to OpenBrowser):") + print("\n📝 Task Instruction (same as given to OpenBrowser):") print(f" {test_case.instruction}") print( "\nPerform this task in the browser. Events will be tracked from this moment." @@ -1692,7 +1760,7 @@ def run_manual_test(self, test_case: TestCase) -> TestResult: time.sleep(2) # Get tracking events - track_events = self.eval_server.get_events() + track_events = self.eval_server.get_events(site=site_bucket) # Save track events to file (no conversation_id for manual mode, use "manual") track_events_file = self._save_track_events( @@ -1738,7 +1806,7 @@ def run_manual_test(self, test_case: TestCase) -> TestResult: # Print completion message print(f"\n{'=' * 60}") - print(f"Manual test completed!") + print("Manual test completed!") print(f"Duration: {duration:.1f}s") print(f"Track events recorded: {len(track_events)}") print(f"Task score: {score:.1f}/{max_score:.1f}") @@ -1751,19 +1819,159 @@ def run_manual_test(self, test_case: TestCase) -> TestResult: return result + def _build_scheduled_jobs( + self, test_cases: List[TestCase], targets: List[LLMTarget] + ) -> Dict[int, List[ScheduledJob]]: + """Build the full job matrix for automated evaluation.""" + jobs_by_target: Dict[int, List[ScheduledJob]] = {} + for target_index, target in enumerate(targets): + model_key = self._get_model_key(target) + jobs_by_target[target_index] = [ + ScheduledJob( + target_index=target_index, + test_index=test_index, + target=target, + test_case=test_case, + model_key=model_key, + site_bucket=self._get_test_site_bucket(test_case), + ) + for test_index, test_case in enumerate(test_cases) + ] + return jobs_by_target + + def _execute_scheduled_job(self, job: ScheduledJob) -> TestResult: + """Run one scheduled job in an isolated worker evaluator.""" + worker = self._create_worker_evaluator() + try: + return worker.run_test( + job.test_case, + target=job.target, + model_name=job.model_key, + ) + except Exception as e: + logger.exception( + "Scheduled job failed: model=%s test=%s site=%s", + job.model_key, + job.test_case.id, + job.site_bucket, + ) + return self._build_error_result( + job.test_case, + job.model_key, + f"Unhandled scheduler worker error: {e}", + ) + + def _run_scheduled_jobs( + self, + test_cases: List[TestCase], + targets: List[LLMTarget], + parallel: int, + single_model_parallel: int, + ) -> Dict[int, List[TestResult]]: + """Run scheduled jobs with global and per-model concurrency limits.""" + max_parallel = max(1, parallel) + per_model_limit = max(1, single_model_parallel) + jobs_by_target = self._build_scheduled_jobs(test_cases, targets) + results_by_target: Dict[int, List[Optional[TestResult]]] = { + target_index: [None] * len(test_cases) + for target_index in range(len(targets)) + } + + logger.info( + "Scheduler limits: parallel=%s, single_model_parallel=%s", + max_parallel, + per_model_limit, + ) + + running_by_model: Dict[str, int] = {} + busy_sites: set[str] = set() + in_flight: Dict[Any, ScheduledJob] = {} + target_order = list(range(len(targets))) + + with ThreadPoolExecutor(max_workers=max_parallel) as executor: + while True: + pending_jobs = any(jobs for jobs in jobs_by_target.values()) + + while len(in_flight) < max_parallel: + scheduled_job: Optional[ScheduledJob] = None + + for target_index in target_order: + pending_for_target = jobs_by_target[target_index] + if not pending_for_target: + continue + + model_key = pending_for_target[0].model_key + if running_by_model.get(model_key, 0) >= per_model_limit: + continue + + for job_index, job in enumerate(pending_for_target): + if job.site_bucket in busy_sites: + continue + scheduled_job = pending_for_target.pop(job_index) + break + + if scheduled_job is not None: + break + + if scheduled_job is None: + break + + future = executor.submit(self._execute_scheduled_job, scheduled_job) + in_flight[future] = scheduled_job + running_by_model[scheduled_job.model_key] = ( + running_by_model.get(scheduled_job.model_key, 0) + 1 + ) + busy_sites.add(scheduled_job.site_bucket) + + logger.info( + "Scheduled test '%s' for model '%s' on site '%s'", + scheduled_job.test_case.id, + scheduled_job.model_key, + scheduled_job.site_bucket, + ) + + if not in_flight and not pending_jobs: + break + + if not in_flight: + logger.warning("Pending jobs remain but none could be scheduled") + break + + done, _ = wait(set(in_flight.keys()), return_when=FIRST_COMPLETED) + for future in done: + job = in_flight.pop(future) + running_by_model[job.model_key] -= 1 + if running_by_model[job.model_key] <= 0: + del running_by_model[job.model_key] + busy_sites.discard(job.site_bucket) + + result = future.result() + results_by_target[job.target_index][job.test_index] = result + + status = "PASSED" if result.passed else "FAILED" + logger.info( + "Completed test '%s' for model '%s': %s %.1f/%.1f", + job.test_case.name, + job.model_key, + status, + result.score, + result.max_score, + ) + + return { + target_index: [result for result in target_results if result is not None] + for target_index, target_results in results_by_target.items() + } + def run_all( self, targets: Optional[List[LLMTarget]] = None, skip_services: bool = False, manual: bool = False, + parallel: int = 1, + single_model_parallel: int = 1, ): - """Run all test cases for specified LLM targets. - - Args: - targets: Explicit LLM targets to test. - skip_services: If True, skip service availability checks - manual: If True, only check eval server (manual mode doesn't need OpenBrowser) - """ + """Run all test cases for specified LLM targets.""" if not self.ensure_services(skip_services=skip_services, manual=manual): logger.error("Cannot run tests: services unavailable") return False @@ -1772,7 +1980,6 @@ def run_all( logger.error("No model aliases provided") return False - # Create timestamped output directory timestamp = time.strftime("%Y%m%d_%H%M%S") self.output_dir = OUTPUT_BASE_DIR / timestamp self.output_dir.mkdir(parents=True, exist_ok=True) @@ -1783,51 +1990,35 @@ def run_all( logger.warning("No test cases found") return False - # Store overall results for summary report - all_results = [] + scheduled_results = self._run_scheduled_jobs( + test_cases=test_cases, + targets=targets, + parallel=parallel, + single_model_parallel=single_model_parallel, + ) - target_names = [target.name for target in targets] + all_results: List[TestResult] = [] + target_names = [self._get_model_key(target) for target in targets] - for target in targets: + for target_index, target in enumerate(targets): logger.info(f"\n{'=' * 60}") logger.info( - f"Testing target alias: {target.alias} -> model: {target.model_name}" + "Finished target alias: %s -> model: %s", + target.alias, + target.model_name, ) logger.info(f"{'=' * 60}") - self.current_target = target - self.current_model = target.model_name or target.name - - # Clear results for this model - self.results = [] - - # Run all test cases for this model - for test_case in test_cases: - result = self.run_test(test_case) - self.results.append(result) - - status = "PASSED" if result.passed else "FAILED" - logger.info( - f"Test '{test_case.name}' {status}: {result.score:.1f}/{result.max_score:.1f}" - ) - - # Generate report for this model + self.results = scheduled_results.get(target_index, []) if self.results: model_report_path = self.generate_report() logger.info(f"Model report saved to: {model_report_path}") - - # Add model information to results and store for summary - for result in self.results: - result.model = target.model_name or target.name all_results.extend(self.results) - # Generate cross-model summary report if we tested multiple models if len(targets) > 1 and all_results: self._generate_cross_model_summary(all_results, target_names) - # Restore results for backward compatibility self.results = all_results - return True def run_all_manual(self, skip_services: bool = False) -> bool: @@ -1852,9 +2043,9 @@ def run_all_manual(self, skip_services: bool = False) -> bool: return False print(f"\n{'=' * 60}") - print(f"MANUAL ALL-TESTS MODE") + print("MANUAL ALL-TESTS MODE") print(f"Found {len(test_cases)} test cases to complete") - print(f"Each test will start when you confirm ready after seeing start URL") + print("Each test will start when you confirm ready after seeing start URL") print(f"{'=' * 60}") # Store overall results for summary report @@ -1945,7 +2136,7 @@ def _generate_manual_summary(self, all_results: List[TestResult]): # Print final summary print(f"\n{'=' * 60}") - print(f"MANUAL TESTING COMPLETE") + print("MANUAL TESTING COMPLETE") print(f"{'=' * 60}") print(f"Total tests: {total_tests}") print(f"Passed tests: {passed_tests} ({pass_rate:.1f}%)") @@ -1956,7 +2147,7 @@ def _generate_manual_summary(self, all_results: List[TestResult]): print(f"{'=' * 60}") # Print per-test summary table - print(f"\nTest Results Summary:") + print("\nTest Results Summary:") print( f"{'Test Name':40} {'Status':10} {'Task Score':12} {'Efficiency':12} {'Duration':10}" ) @@ -2269,6 +2460,18 @@ def main(): action="append", help="Configured LLM alias to evaluate. Can be passed multiple times.", ) + parser.add_argument( + "--parallel", + type=int, + default=1, + help="Maximum number of automated test jobs running at once.", + ) + parser.add_argument( + "--single-model-parallel", + type=int, + default=1, + help="Maximum concurrent test jobs allowed for the same resolved model.", + ) parser.add_argument( "--no-services", action="store_true", help="Don't start services" ) @@ -2316,6 +2519,12 @@ def main(): "(or set OPENBROWSER_CHROME_UUID)" ) + if args.parallel < 1: + parser.error("--parallel must be >= 1") + + if args.single_model_parallel < 1: + parser.error("--single-model-parallel must be >= 1") + evaluator = Evaluator(chrome_uuid=args.chrome_uuid) # Register cleanup @@ -2371,9 +2580,9 @@ def main(): if args.manual: logger.info(f"Running manual test: {test_case.name}") print(f"\n{'=' * 60}") - print(f"MANUAL MODE ENABLED") + print("MANUAL MODE ENABLED") print(f"Test: {test_case.name}") - print(f"Model selection ignored (manual human test)") + print("Model selection ignored (manual human test)") print(f"{'=' * 60}") result = evaluator.run_manual_test(test_case) @@ -2405,21 +2614,22 @@ def main(): # Normal (automated) mode else: all_results = [] + scheduled_results = evaluator._run_scheduled_jobs( + test_cases=[test_case], + targets=llm_targets, + parallel=args.parallel, + single_model_parallel=args.single_model_parallel, + ) target_names = [ - target.model_name or target.name for target in llm_targets + evaluator._get_model_key(target) for target in llm_targets ] - for target in llm_targets: - logger.info(f"\n{'=' * 60}") - logger.info( - f"Testing target alias: {target.alias} -> model: {target.model_name}" - ) - logger.info(f"{'=' * 60}") - evaluator.current_target = target - evaluator.current_model = target.model_name or target.name + for target_index, target in enumerate(llm_targets): + target_results = scheduled_results.get(target_index, []) + if not target_results: + continue - result = evaluator.run_test(test_case) - result.model = target.model_name or target.name + result = target_results[0] all_results.append(result) print( @@ -2430,7 +2640,6 @@ def main(): print(f" Task score: {result.score:.1f}/{result.max_score:.1f}") print(f" Efficiency score: {result.efficiency_score or 0:.2f}/1.0") print(f" Usage score: {result.usage_score or 0:.2f}/1.0") - # Calculate composite score for this test passed_float = 1.0 if result.passed else 0.0 eff_score = result.efficiency_score or 0.0 usage_score_val = result.usage_score or 0.0 @@ -2473,10 +2682,10 @@ def main(): else: # Run all tests for all models (manual mode now supported) if args.manual: - logger.info(f"Running all tests in MANUAL mode") + logger.info("Running all tests in MANUAL mode") print(f"\n{'=' * 60}") - print(f"ALL TESTS MANUAL MODE") - print(f"Model selection ignored (manual human test)") + print("ALL TESTS MANUAL MODE") + print("Model selection ignored (manual human test)") print(f"{'=' * 60}") success = evaluator.run_all_manual(skip_services=args.no_services) @@ -2491,7 +2700,11 @@ def main(): sys.exit(1) success = evaluator.run_all( - targets=llm_targets, skip_services=args.no_services, manual=False + targets=llm_targets, + skip_services=args.no_services, + manual=False, + parallel=args.parallel, + single_model_parallel=args.single_model_parallel, ) if not success: sys.exit(1) diff --git a/eval/evaluation_report.json b/eval/evaluation_report.json index 3f53998..345d6ae 100644 --- a/eval/evaluation_report.json +++ b/eval/evaluation_report.json @@ -1,11 +1,11 @@ { "evaluation": { - "timestamp": "2026-03-28 21:09:34", - "unix_timestamp": 1774703374.492846, + "timestamp": "2026-03-30 11:17:06", + "unix_timestamp": 1774840626.757355, "summary": { - "total_tests": 22, - "passed_tests": 21, - "pass_rate": 95.45, + "total_tests": 24, + "passed_tests": 24, + "pass_rate": 100.0, "models_tested": [ "dashscope/qwen3.5-flash", "dashscope/qwen3.5-plus" @@ -13,28 +13,28 @@ }, "model_performance": { "dashscope/qwen3.5-flash": { - "pass_rate": 90.91, - "task_score": 59.0, - "task_max_score": 62.5, - "efficiency_score": 7.0914, - "usage_score": 8.6369, - "composite_score": 0.8314, - "avg_duration": 165.81, - "avg_cost": 0.2005, - "passed_count": 10, - "total_tests": 11 + "pass_rate": 100.0, + "task_score": 68.5, + "task_max_score": 68.5, + "efficiency_score": 9.0653, + "usage_score": 11.0829, + "composite_score": 0.9358, + "avg_duration": 114.89, + "avg_cost": 0.075442, + "passed_count": 12, + "total_tests": 12 }, "dashscope/qwen3.5-plus": { "pass_rate": 100.0, - "task_score": 62.5, - "task_max_score": 62.5, - "efficiency_score": 7.638, - "usage_score": 4.4691, - "composite_score": 0.8201, - "avg_duration": 144.49, - "avg_cost": 0.620445, - "passed_count": 11, - "total_tests": 11 + "task_score": 67.5, + "task_max_score": 68.5, + "efficiency_score": 8.2484, + "usage_score": 8.3979, + "composite_score": 0.8774, + "avg_duration": 149.63, + "avg_cost": 0.291952, + "passed_count": 12, + "total_tests": 12 } }, "test_results": { @@ -45,23 +45,23 @@ "passed": true, "task_score": 6.0, "task_max_score": 6.0, - "efficiency_score": 0.656, - "usage_score": 0.7975, - "composite_score": 0.8907, - "total_score": 7.45, - "duration": 103.21, - "cost": 0.121497 + "efficiency_score": 0.6992, + "usage_score": 0.8882, + "composite_score": 0.9175, + "total_score": 7.59, + "duration": 90.23, + "cost": 0.067092 }, "dashscope/qwen3.5-plus": { "passed": true, "task_score": 6.0, "task_max_score": 6.0, - "efficiency_score": 0.6513, - "usage_score": 0.2662, - "composite_score": 0.7835, - "total_score": 6.92, - "duration": 104.6, - "cost": 0.440265 + "efficiency_score": 0.6702, + "usage_score": 0.6682, + "composite_score": 0.8677, + "total_score": 7.34, + "duration": 98.94, + "cost": 0.199068 } } }, @@ -72,23 +72,23 @@ "passed": true, "task_score": 3, "task_max_score": 3, - "efficiency_score": 0.817, - "usage_score": 0.9464, - "composite_score": 0.9527, - "total_score": 4.76, - "duration": 54.91, - "cost": 0.042865 + "efficiency_score": 0.7026, + "usage_score": 0.9286, + "composite_score": 0.9263, + "total_score": 4.63, + "duration": 89.21, + "cost": 0.05708 }, "dashscope/qwen3.5-plus": { "passed": true, "task_score": 3, "task_max_score": 3, - "efficiency_score": 0.7766, - "usage_score": 0.777, - "composite_score": 0.9107, - "total_score": 4.55, - "duration": 67.03, - "cost": 0.178378 + "efficiency_score": 0.7458, + "usage_score": 0.8466, + "composite_score": 0.9185, + "total_score": 4.59, + "duration": 76.26, + "cost": 0.122742 } } }, @@ -97,25 +97,25 @@ "results_by_model": { "dashscope/qwen3.5-flash": { "passed": true, - "task_score": 7.5, + "task_score": 9.0, "task_max_score": 9.0, - "efficiency_score": 0.7165, - "usage_score": 0.873, - "composite_score": 0.9179, - "total_score": 9.09, - "duration": 198.43, - "cost": 0.254081 + "efficiency_score": 0.7799, + "usage_score": 0.9379, + "composite_score": 0.9436, + "total_score": 10.72, + "duration": 154.09, + "cost": 0.124176 }, "dashscope/qwen3.5-plus": { "passed": true, "task_score": 9.0, "task_max_score": 9.0, - "efficiency_score": 0.6635, - "usage_score": 0.3651, - "composite_score": 0.8057, - "total_score": 10.03, - "duration": 235.55, - "cost": 1.269709 + "efficiency_score": 0.592, + "usage_score": 0.6905, + "composite_score": 0.8565, + "total_score": 10.28, + "duration": 285.6, + "cost": 0.619067 } } }, @@ -123,26 +123,26 @@ "name": "GBR Search Test", "results_by_model": { "dashscope/qwen3.5-flash": { - "passed": false, - "task_score": 0.5, + "passed": true, + "task_score": 2.5, "task_max_score": 2.5, - "efficiency_score": 0.0806, - "usage_score": 0.4183, - "composite_score": 0.0998, - "total_score": 1.0, - "duration": 367.76, - "cost": 0.465362 + "efficiency_score": 0.8086, + "usage_score": 0.9414, + "composite_score": 0.95, + "total_score": 4.25, + "duration": 76.54, + "cost": 0.046907 }, "dashscope/qwen3.5-plus": { "passed": true, "task_score": 2.5, "task_max_score": 2.5, - "efficiency_score": 0.7707, - "usage_score": 0.5913, - "composite_score": 0.8724, - "total_score": 3.86, - "duration": 91.71, - "cost": 0.326986 + "efficiency_score": 0.7862, + "usage_score": 0.7942, + "composite_score": 0.9161, + "total_score": 4.08, + "duration": 85.52, + "cost": 0.164629 } } }, @@ -153,23 +153,23 @@ "passed": true, "task_score": 9.5, "task_max_score": 9.5, - "efficiency_score": 0.2831, - "usage_score": 0.5356, - "composite_score": 0.7637, - "total_score": 10.32, - "duration": 358.43, - "cost": 0.464415 + "efficiency_score": 0.7537, + "usage_score": 0.9936, + "composite_score": 0.9495, + "total_score": 11.25, + "duration": 123.16, + "cost": 0.006363 }, "dashscope/qwen3.5-plus": { "passed": true, - "task_score": 9.5, + "task_score": 8.5, "task_max_score": 9.5, - "efficiency_score": 0.6498, - "usage_score": 0.0922, - "composite_score": 0.7484, - "total_score": 10.24, - "duration": 175.1, - "cost": 0.907785 + "efficiency_score": 0.7346, + "usage_score": 0.7258, + "composite_score": 0.8921, + "total_score": 9.96, + "duration": 132.72, + "cost": 0.274247 } } }, @@ -180,23 +180,23 @@ "passed": true, "task_score": 12.0, "task_max_score": 12.0, - "efficiency_score": 0.7024, - "usage_score": 0.8476, - "composite_score": 0.91, - "total_score": 13.55, - "duration": 148.82, - "cost": 0.182839 + "efficiency_score": 0.7297, + "usage_score": 0.9161, + "composite_score": 0.9292, + "total_score": 13.65, + "duration": 135.13, + "cost": 0.100653 }, "dashscope/qwen3.5-plus": { "passed": true, "task_score": 12.0, "task_max_score": 12.0, - "efficiency_score": 0.6839, - "usage_score": 0.3953, - "composite_score": 0.8158, - "total_score": 13.08, - "duration": 158.07, - "cost": 0.725694 + "efficiency_score": 0.6813, + "usage_score": 0.7499, + "composite_score": 0.8862, + "total_score": 13.43, + "duration": 159.34, + "cost": 0.300169 } } }, @@ -207,23 +207,23 @@ "passed": true, "task_score": 2, "task_max_score": 2, - "efficiency_score": 0.9001, - "usage_score": 0.9546, - "composite_score": 0.9709, - "total_score": 3.85, - "duration": 29.97, - "cost": 0.022705 + "efficiency_score": 0.8776, + "usage_score": 0.9646, + "composite_score": 0.9684, + "total_score": 3.84, + "duration": 36.73, + "cost": 0.0177 }, "dashscope/qwen3.5-plus": { "passed": true, "task_score": 2, "task_max_score": 2, - "efficiency_score": 0.8533, - "usage_score": 0.7708, - "composite_score": 0.9248, - "total_score": 3.62, - "duration": 44.02, - "cost": 0.114576 + "efficiency_score": 0.7867, + "usage_score": 0.8596, + "composite_score": 0.9293, + "total_score": 3.65, + "duration": 63.98, + "cost": 0.070214 } } }, @@ -234,23 +234,23 @@ "passed": true, "task_score": 5.0, "task_max_score": 5.0, - "efficiency_score": 0.7334, - "usage_score": 0.8726, - "composite_score": 0.9212, - "total_score": 6.61, - "duration": 106.62, - "cost": 0.127404 + "efficiency_score": 0.6489, + "usage_score": 0.902, + "composite_score": 0.9102, + "total_score": 6.55, + "duration": 140.44, + "cost": 0.09796 }, "dashscope/qwen3.5-plus": { "passed": true, "task_score": 5.0, "task_max_score": 5.0, - "efficiency_score": 0.5376, - "usage_score": 0.2214, - "composite_score": 0.7518, - "total_score": 5.76, - "duration": 184.97, - "cost": 0.778617 + "efficiency_score": 0.5245, + "usage_score": 0.6334, + "composite_score": 0.8316, + "total_score": 6.16, + "duration": 190.18, + "cost": 0.366565 } } }, @@ -261,23 +261,23 @@ "passed": true, "task_score": 3.5, "task_max_score": 3.5, - "efficiency_score": 0.7705, - "usage_score": 0.8949, - "composite_score": 0.9331, - "total_score": 5.17, - "duration": 114.74, - "cost": 0.126168 + "efficiency_score": 0.8244, + "usage_score": 0.9518, + "composite_score": 0.9552, + "total_score": 5.28, + "duration": 87.78, + "cost": 0.057875 }, "dashscope/qwen3.5-plus": { "passed": true, "task_score": 3.5, "task_max_score": 3.5, - "efficiency_score": 0.592, - "usage_score": 0.5048, - "composite_score": 0.8194, - "total_score": 4.6, - "duration": 203.98, - "cost": 0.594295 + "efficiency_score": 0.5207, + "usage_score": 0.6071, + "composite_score": 0.8256, + "total_score": 4.63, + "duration": 239.64, + "cost": 0.471474 } } }, @@ -288,23 +288,23 @@ "passed": true, "task_score": 3, "task_max_score": 3, - "efficiency_score": 0.7678, - "usage_score": 0.6428, - "composite_score": 0.8821, - "total_score": 4.41, - "duration": 139.31, - "cost": 0.178579 + "efficiency_score": 0.7973, + "usage_score": 0.8323, + "composite_score": 0.9259, + "total_score": 4.63, + "duration": 121.6, + "cost": 0.08386 }, "dashscope/qwen3.5-plus": { "passed": true, "task_score": 3, "task_max_score": 3, - "efficiency_score": 0.7283, - "usage_score": 0, - "composite_score": 0.7457, - "total_score": 3.73, - "duration": 163.02, - "cost": 0.716118 + "efficiency_score": 0.6867, + "usage_score": 0.2032, + "composite_score": 0.778, + "total_score": 3.89, + "duration": 187.96, + "cost": 0.398401 } } }, @@ -315,26 +315,53 @@ "passed": true, "task_score": 7.0, "task_max_score": 7.0, - "efficiency_score": 0.6639, - "usage_score": 0.8536, - "composite_score": 0.9035, - "total_score": 8.52, - "duration": 201.65, - "cost": 0.219589 + "efficiency_score": 0.6144, + "usage_score": 0.8758, + "composite_score": 0.8981, + "total_score": 8.49, + "duration": 231.34, + "cost": 0.186276 }, "dashscope/qwen3.5-plus": { "passed": true, "task_score": 7.0, "task_max_score": 7.0, - "efficiency_score": 0.7311, - "usage_score": 0.485, - "composite_score": 0.8432, - "total_score": 8.22, - "duration": 161.35, - "cost": 0.772474 + "efficiency_score": 0.7343, + "usage_score": 0.7991, + "composite_score": 0.9067, + "total_score": 8.53, + "duration": 159.39, + "cost": 0.301292 + } + } + }, + "northstar_add_bag": { + "name": "Northstar Fit Guide + Add To Bag Test", + "results_by_model": { + "dashscope/qwen3.5-flash": { + "passed": true, + "task_score": 6.0, + "task_max_score": 6.0, + "efficiency_score": 0.8288, + "usage_score": 0.9505, + "composite_score": 0.9559, + "total_score": 7.78, + "duration": 92.44, + "cost": 0.059363 + }, + "dashscope/qwen3.5-plus": { + "passed": true, + "task_score": 6.0, + "task_max_score": 6.0, + "efficiency_score": 0.7852, + "usage_score": 0.8204, + "composite_score": 0.9211, + "total_score": 7.61, + "duration": 116.0, + "cost": 0.215559 } } } } } -} +} \ No newline at end of file diff --git a/eval/northstar/css/northstar.css b/eval/northstar/css/northstar.css new file mode 100644 index 0000000..b878956 --- /dev/null +++ b/eval/northstar/css/northstar.css @@ -0,0 +1,554 @@ +:root { + --bg: #f3f0ea; + --surface: #fcfbf8; + --ink: #1d1a17; + --muted: #6d655d; + --line: rgba(29, 26, 23, 0.12); + --accent: #2747d8; + --accent-soft: rgba(39, 71, 216, 0.12); + --success: #1c7a4e; + --warning: #d36b25; + --shadow: 0 28px 60px rgba(37, 27, 18, 0.12); + font-family: "Avenir Next", "Segoe UI", sans-serif; +} + +* { + box-sizing: border-box; +} + +body { + margin: 0; + background: linear-gradient(180deg, #f7f4ee 0%, #efe8dd 100%); + color: var(--ink); +} + +button, +input { + font: inherit; +} + +.announcement-bar { + position: sticky; + top: 0; + z-index: 30; + background: #1b1f2c; + color: #f6f7fb; + text-align: center; + padding: 10px 16px; + font-size: 13px; + letter-spacing: 0.04em; + text-transform: uppercase; +} + +.site-header { + position: sticky; + top: 42px; + z-index: 20; + display: grid; + grid-template-columns: auto 1fr auto; + gap: 28px; + align-items: center; + padding: 18px 32px; + background: rgba(252, 251, 248, 0.92); + border-bottom: 1px solid var(--line); + backdrop-filter: blur(12px); +} + +.brand { + display: flex; + gap: 14px; + align-items: center; +} + +.brand-mark { + width: 42px; + height: 42px; + border-radius: 14px; + display: grid; + place-items: center; + background: linear-gradient(135deg, #2f416f, #111827); + color: #f7fbff; + font-weight: 700; + font-size: 18px; +} + +.brand-name { + font-size: 17px; + font-weight: 700; +} + +.brand-tag { + font-size: 12px; + color: var(--muted); +} + +.site-nav { + display: flex; + gap: 24px; + justify-content: center; +} + +.site-link { + color: var(--ink); + text-decoration: none; + font-size: 14px; +} + +.header-actions { + display: flex; + gap: 12px; + align-items: center; +} + +.search-shell input { + width: 280px; + border: 1px solid var(--line); + border-radius: 999px; + padding: 11px 16px; + background: #fff; +} + +.header-btn { + border: 1px solid var(--line); + border-radius: 999px; + background: #fff; + padding: 11px 16px; + color: var(--ink); +} + +.page-shell { + max-width: 1300px; + margin: 0 auto; + padding: 28px 32px 220px; +} + +.breadcrumbs { + display: flex; + gap: 10px; + align-items: center; + font-size: 13px; + color: var(--muted); + margin-bottom: 24px; +} + +.crumb-sep { + opacity: 0.55; +} + +.product-layout { + display: grid; + grid-template-columns: minmax(0, 1.1fr) minmax(360px, 420px); + gap: 28px; + align-items: start; +} + +.gallery-column, +.detail-column { + display: grid; + gap: 24px; +} + +.hero-card, +.panel-card, +.story-card { + border-radius: 28px; + background: var(--surface); + border: 1px solid rgba(29, 26, 23, 0.08); + box-shadow: var(--shadow); +} + +.hero-card { + min-height: 520px; + padding: 30px; + display: flex; + flex-direction: column; + justify-content: space-between; + overflow: hidden; +} + +.hero-1 { + background: + linear-gradient(145deg, rgba(16, 22, 41, 0.95), rgba(43, 70, 122, 0.85)), + radial-gradient(circle at top right, rgba(255, 255, 255, 0.18), transparent 35%); + color: #f7f8fb; +} + +.hero-2 { + min-height: 340px; + background: linear-gradient(160deg, #d4d8df, #f6f8fb 44%, #f0ece6); +} + +.hero-3 { + min-height: 360px; + background: linear-gradient(160deg, #d7d1c8, #f8f3ea 44%, #efe8de); +} + +.media-badge, +.collection-label, +.section-kicker, +.story-label, +.drawer-kicker { + display: inline-flex; + align-items: center; + gap: 8px; + font-size: 12px; + letter-spacing: 0.08em; + text-transform: uppercase; + color: var(--muted); +} + +.hero-1 .media-badge { + color: #dce8ff; +} + +.hero-copy h1, +.hero-copy h2, +.panel-title, +.product-name { + margin: 0; + line-height: 1.06; +} + +.hero-copy h1 { + max-width: 78%; + font-size: 54px; +} + +.hero-copy.slim h2 { + max-width: 72%; + font-size: 34px; +} + +.detail-column .panel-card, +.product-summary { + padding: 28px; +} + +.product-summary { + border-radius: 28px; + background: rgba(252, 251, 248, 0.9); + border: 1px solid var(--line); + box-shadow: var(--shadow); +} + +.price-row { + display: flex; + gap: 14px; + align-items: baseline; + margin: 12px 0 14px; +} + +.price { + font-size: 34px; + font-weight: 700; +} + +.price-note, +.summary-copy, +.fit-note, +.panel-copy, +.shipping-banner, +.drawer-subhead, +.review-topline, +.guide-section p { + color: var(--muted); +} + +.swatch-row, +.feature-pills, +.size-grid, +.purchase-actions { + display: flex; + flex-wrap: wrap; + gap: 12px; +} + +.swatch, +.size-option, +.secondary-btn, +.review-action, +.header-btn, +.sticky-btn { + border: 1px solid var(--line); + background: #fff; + color: var(--ink); + border-radius: 999px; + padding: 12px 16px; +} + +.swatch.active, +.size-option.active { + border-color: var(--accent); + background: var(--accent-soft); + color: var(--accent); +} + +.feature-pill { + border-radius: 999px; + background: rgba(29, 26, 23, 0.05); + padding: 10px 14px; + font-size: 13px; +} + +.add-btn { + border: none; + border-radius: 999px; + padding: 14px 22px; + background: var(--accent); + color: #fff; + font-weight: 700; + box-shadow: 0 16px 30px rgba(39, 71, 216, 0.24); +} + +.add-btn:disabled { + opacity: 0.48; + cursor: not-allowed; + box-shadow: none; +} + +.add-btn.is-complete, +.review-action.is-complete { + border-color: rgba(28, 122, 78, 0.2); + background: #1c7a4e; + color: #fff; + box-shadow: 0 16px 30px rgba(28, 122, 78, 0.22); +} + +.add-btn.is-complete:disabled, +.review-action.is-complete:disabled { + opacity: 1; + cursor: default; +} + +.action-feedback { + margin: 14px 0 0; + min-height: 20px; + font-size: 14px; + font-weight: 600; + color: var(--success); + opacity: 0; + transform: translateY(4px); + transition: opacity 140ms ease, transform 140ms ease; +} + +.action-feedback.visible { + opacity: 1; + transform: translateY(0); +} + +.drawer-feedback { + padding: 0 28px 12px; + margin: 0; +} + +.story-grid { + margin-top: 32px; + display: grid; + grid-template-columns: repeat(3, minmax(0, 1fr)); + gap: 20px; +} + +.story-card { + padding: 24px; +} + +.story-card h3 { + margin: 12px 0 0; + font-size: 20px; + line-height: 1.3; +} + +.sticky-cta { + position: fixed; + left: 24px; + right: 24px; + bottom: 20px; + z-index: 24; + display: flex; + justify-content: space-between; + align-items: center; + padding: 18px 22px; + border-radius: 24px; + background: rgba(24, 27, 36, 0.94); + color: #f6f7fb; + box-shadow: 0 26px 60px rgba(15, 17, 23, 0.32); +} + +.sticky-label { + font-size: 15px; + font-weight: 700; +} + +.sticky-meta { + font-size: 13px; + color: rgba(246, 247, 251, 0.72); +} + +.sticky-btn { + border-color: rgba(255, 255, 255, 0.2); + background: rgba(255, 255, 255, 0.08); + color: #fff; +} + +.drawer-overlay { + position: fixed; + inset: 0; + z-index: 50; + display: flex; + justify-content: flex-end; + background: rgba(22, 18, 15, 0.34); + backdrop-filter: blur(4px); +} + +.drawer-overlay.hidden { + display: none; +} + +.drawer { + width: min(520px, 100%); + height: 100%; + background: var(--surface); + display: flex; + flex-direction: column; + position: relative; + box-shadow: -24px 0 60px rgba(28, 21, 16, 0.18); +} + +.drawer-header { + display: flex; + justify-content: space-between; + align-items: flex-start; + padding: 28px 28px 18px; + border-bottom: 1px solid var(--line); +} + +.drawer-header h2 { + margin: 6px 0 0; + font-size: 30px; +} + +.drawer-close { + border: 1px solid var(--line); + background: #fff; + border-radius: 999px; + padding: 10px 14px; +} + +.drawer-subhead { + padding: 0 28px 18px; + font-size: 14px; +} + +.drawer-scroll { + flex: 1; + overflow-y: auto; + padding: 0 28px 140px; + scroll-behavior: smooth; +} + +.guide-scroll { + padding-bottom: 260px; +} + +.fit-guide-refresh-fab { + position: fixed; + left: var(--fit-guide-refresh-left, calc(100vw - min(520px, 100vw) + 78px)); + top: var(--fit-guide-refresh-top, calc(100vh - 150px)); + z-index: 60; + width: 82px; + height: 82px; + border: 1px solid rgba(29, 26, 23, 0.12); + border-radius: 999px; + display: grid; + place-items: center; + background: rgba(252, 251, 248, 0.96); + color: #1d1a17; + box-shadow: 0 18px 34px rgba(28, 21, 16, 0.18); + backdrop-filter: blur(12px); + transition: opacity 140ms ease, box-shadow 140ms ease; +} + +.fit-guide-refresh-fab svg { + width: 32px; + height: 32px; + fill: currentColor; +} + +.review-card, +.guide-section { + border-radius: 22px; + background: #fff; + border: 1px solid var(--line); + padding: 22px; + margin-bottom: 18px; +} + +.review-card h3, +.guide-section h3 { + margin: 8px 0 10px; + font-size: 24px; +} + +.review-card p, +.guide-section p { + margin: 0 0 18px; + line-height: 1.55; +} + +.target-action { + background: rgba(39, 71, 216, 0.08); + border-color: rgba(39, 71, 216, 0.32); + color: var(--accent); +} + +.review-card.is-complete, +.guide-section.is-complete, +.panel-card.is-complete { + border-color: rgba(28, 122, 78, 0.22); + box-shadow: 0 18px 36px rgba(28, 122, 78, 0.14); +} + +.review-action.is-complete.target-action { + border-color: rgba(28, 122, 78, 0.2); + background: #1c7a4e; + color: #fff; +} + +.target-guide-section { + border-color: rgba(39, 71, 216, 0.24); + background: linear-gradient(180deg, rgba(39, 71, 216, 0.05), rgba(39, 71, 216, 0.01)); +} + +.target-guide-section.is-complete { + border-color: rgba(28, 122, 78, 0.22); + background: linear-gradient(180deg, rgba(28, 122, 78, 0.08), rgba(28, 122, 78, 0.02)); +} + +@media (max-width: 1080px) { + .site-header { + grid-template-columns: 1fr; + justify-items: start; + } + + .product-layout, + .story-grid { + grid-template-columns: 1fr; + } + + .search-shell input { + width: min(100%, 320px); + } + + .sticky-cta { + left: 14px; + right: 14px; + bottom: 14px; + padding: 16px 18px; + gap: 12px; + } + + .fit-guide-refresh-fab { + width: 76px; + height: 76px; + } +} diff --git a/eval/northstar/index.html b/eval/northstar/index.html new file mode 100644 index 0000000..f43b4f7 --- /dev/null +++ b/eval/northstar/index.html @@ -0,0 +1,270 @@ + + + + + + Northstar Outfitters - Commuter Shell + + + +
+ Free two-day shipping on commuting layers and rain shells +
+ + + +
+ + +
+ + + +
+ +
+
+ +

20k / 20k membrane with seam tape on the shoulder line and hood crown.

+
+
+ +

Dropped rear hem and articulated sleeves keep the cuff line steady on the bars.

+
+
+ +

Front chest entry plus inside drop pocket sized for notebook, gloves, and compact charger.

+
+
+
+ +
+
+
Commuter Shell
+
Locker pickup available
+
+ +
+ + + + + + + + + + + diff --git a/eval/northstar/js/northstar.js b/eval/northstar/js/northstar.js new file mode 100644 index 0000000..d13cba6 --- /dev/null +++ b/eval/northstar/js/northstar.js @@ -0,0 +1,354 @@ +window.tracker = new AgentTracker("northstaroutfitters.com", "hard"); + +document.addEventListener("DOMContentLoaded", () => { + const state = { + selectedSize: null, + purchaseRailTracked: false, + reviewsGeometryTracked: false, + fitGuideGeometryTracked: false, + }; + + const purchaseRail = document.getElementById("purchase-rail"); + const addToBagBtn = document.getElementById("add-to-bag"); + const purchaseFeedback = document.getElementById("purchase-feedback"); + const sizeButtons = Array.from(document.querySelectorAll(".size-option")); + const reviewsOverlay = document.getElementById("reviews-overlay"); + const reviewsFeedback = document.getElementById("reviews-feedback"); + const reviewsScroll = document.getElementById("reviews-scroll"); + const fitGuideOverlay = document.getElementById("fit-guide-overlay"); + const fitGuideScroll = document.getElementById("fit-guide-scroll"); + const targetReview = document.querySelector('[data-review-id="review-trail-commute"]')?.closest(".review-card"); + const careSection = document.getElementById("care-wash-section"); + const fitGuideFeedback = document.getElementById("fit-guide-feedback"); + const saveFitGuideBtn = document.getElementById("save-fit-guide"); + const fitGuideRefreshBtn = document.getElementById("fit-guide-refresh"); + + function showFeedback(node, message) { + if (!node) { + return; + } + + node.textContent = message; + node.classList.add("visible"); + } + + function resetAddToBagState() { + addToBagBtn.textContent = "Add to bag"; + addToBagBtn.classList.remove("is-complete"); + addToBagBtn.disabled = state.selectedSize === null; + } + + function clamp(value, min, max) { + return Math.min(Math.max(value, min), max); + } + + function resetFitGuideRefreshPlacement() { + if (!fitGuideRefreshBtn) { + return; + } + + fitGuideRefreshBtn.style.removeProperty("--fit-guide-refresh-left"); + fitGuideRefreshBtn.style.removeProperty("--fit-guide-refresh-top"); + } + + function placeFitGuideRefreshBlocker() { + if (!fitGuideRefreshBtn || !saveFitGuideBtn || fitGuideOverlay.classList.contains("hidden")) { + resetFitGuideRefreshPlacement(); + return; + } + + const drawer = fitGuideOverlay.querySelector(".drawer"); + const saveRect = saveFitGuideBtn.getBoundingClientRect(); + const drawerRect = drawer?.getBoundingClientRect(); + const blockerWidth = fitGuideRefreshBtn.offsetWidth || 82; + const blockerHeight = fitGuideRefreshBtn.offsetHeight || 82; + let left = saveRect.left + saveRect.width / 2 - blockerWidth / 2; + let top = saveRect.top + saveRect.height / 2 - blockerHeight / 2; + + if (drawerRect) { + const minLeft = drawerRect.left + 16; + const maxLeft = drawerRect.right - blockerWidth - 16; + const minTop = drawerRect.top + 96; + const maxTop = drawerRect.bottom - blockerHeight - 16; + + left = clamp(left, minLeft, Math.max(minLeft, maxLeft)); + top = clamp(top, minTop, Math.max(minTop, maxTop)); + } + + fitGuideRefreshBtn.style.setProperty("--fit-guide-refresh-left", `${Math.round(left)}px`); + fitGuideRefreshBtn.style.setProperty("--fit-guide-refresh-top", `${Math.round(top)}px`); + } + + function queueFitGuideRefreshPlacement() { + requestAnimationFrame(() => { + requestAnimationFrame(placeFitGuideRefreshBlocker); + }); + } + + function trackGeometryReadyOnce() { + if (!purchaseRail || state.purchaseRailTracked) { + return; + } + + const rect = purchaseRail.getBoundingClientRect(); + const topSafe = 130; + const bottomSafe = window.innerHeight - 180; + const centeredEnough = rect.top >= topSafe && rect.bottom <= bottomSafe; + + if (!centeredEnough) { + return; + } + + state.purchaseRailTracked = true; + tracker.track("purchase_rail_geometry_ready", { + section: "purchase-rail", + scrollY: Math.round(window.scrollY), + }); + } + + function isCenteredInContainer(target, container) { + if (!target || !container) { + return false; + } + + const targetRect = target.getBoundingClientRect(); + const containerRect = container.getBoundingClientRect(); + const targetCenter = targetRect.top + targetRect.height / 2; + const containerCenter = containerRect.top + containerRect.height / 2; + const tolerance = Math.max(60, containerRect.height * 0.18); + + return ( + targetRect.top >= containerRect.top + 24 && + targetRect.bottom <= containerRect.bottom - 110 && + Math.abs(targetCenter - containerCenter) <= tolerance + ); + } + + function overlapsBlocker(target, blocker) { + if (!target || !blocker) { + return false; + } + + const targetRect = target.getBoundingClientRect(); + const blockerRect = blocker.getBoundingClientRect(); + + const horizontalOverlap = + Math.min(targetRect.right, blockerRect.right) - + Math.max(targetRect.left, blockerRect.left); + const verticalOverlap = + Math.min(targetRect.bottom, blockerRect.bottom) - + Math.max(targetRect.top, blockerRect.top); + + return horizontalOverlap > 8 && verticalOverlap > 8; + } + + function trackReviewsGeometry() { + if (state.reviewsGeometryTracked || reviewsOverlay.classList.contains("hidden")) { + return; + } + + if (!isCenteredInContainer(targetReview, reviewsScroll)) { + return; + } + + state.reviewsGeometryTracked = true; + tracker.track("reviews_drawer_geometry_ready", { + reviewId: "review-trail-commute", + scrollTop: Math.round(reviewsScroll.scrollTop), + }); + } + + function trackFitGuideGeometry() { + if (state.fitGuideGeometryTracked || fitGuideOverlay.classList.contains("hidden")) { + return; + } + + if (!isCenteredInContainer(careSection, fitGuideScroll)) { + return; + } + + if (overlapsBlocker(saveFitGuideBtn, fitGuideRefreshBtn)) { + return; + } + + state.fitGuideGeometryTracked = true; + tracker.track("fit_guide_geometry_ready", { + section: "care-wash", + scrollTop: Math.round(fitGuideScroll.scrollTop), + }); + } + + window.addEventListener("scroll", trackGeometryReadyOnce, { passive: true }); + window.addEventListener("resize", () => { + trackGeometryReadyOnce(); + + if (!fitGuideOverlay.classList.contains("hidden")) { + queueFitGuideRefreshPlacement(); + } + }); + + sizeButtons.forEach((button) => { + button.addEventListener("click", () => { + sizeButtons.forEach((item) => item.classList.remove("active")); + button.classList.add("active"); + state.selectedSize = button.dataset.size || null; + resetAddToBagState(); + showFeedback( + purchaseFeedback, + `Size ${state.selectedSize} selected. Ready to add the shell to bag.`, + ); + tracker.track("product_size_select", { + productId: "commuter-shell", + size: state.selectedSize, + }); + }); + }); + + addToBagBtn.addEventListener("click", () => { + if (!state.selectedSize) { + return; + } + + tracker.track("product_add_to_bag", { + productId: "commuter-shell", + size: state.selectedSize, + }); + + addToBagBtn.textContent = "Added to bag"; + addToBagBtn.classList.add("is-complete"); + addToBagBtn.disabled = true; + purchaseRail.classList.add("is-complete"); + showFeedback( + purchaseFeedback, + `Commuter Shell in size ${state.selectedSize} added to your bag.`, + ); + }); + + document.getElementById("open-reviews")?.addEventListener("click", () => { + state.reviewsGeometryTracked = false; + reviewsOverlay.classList.remove("hidden"); + reviewsScroll.scrollTop = 0; + showFeedback( + reviewsFeedback, + "Reviews drawer opened. Scroll inside this panel to reach the target review.", + ); + tracker.track("reviews_drawer_open", { + drawer: "reviews", + }); + }); + + document.getElementById("open-fit-guide")?.addEventListener("click", () => { + state.fitGuideGeometryTracked = false; + fitGuideOverlay.classList.remove("hidden"); + fitGuideScroll.scrollTop = 0; + queueFitGuideRefreshPlacement(); + showFeedback( + fitGuideFeedback, + "Fit guide opened. Scroll this panel until Care & Wash is centered.", + ); + tracker.track("fit_guide_open", { + drawer: "fit-guide", + }); + }); + + document.querySelectorAll(".drawer-close").forEach((button) => { + button.addEventListener("click", () => { + const overlayId = button.dataset.close; + if (!overlayId) { + return; + } + + document.getElementById(overlayId)?.classList.add("hidden"); + if (overlayId === "fit-guide-overlay") { + resetFitGuideRefreshPlacement(); + } + tracker.track("drawer_close", { + drawer: overlayId, + }); + }); + }); + + reviewsOverlay.addEventListener("click", (event) => { + if (event.target === reviewsOverlay) { + reviewsOverlay.classList.add("hidden"); + tracker.track("drawer_close", { + drawer: "reviews-overlay", + method: "overlay", + }); + } + }); + + fitGuideOverlay.addEventListener("click", (event) => { + if (event.target === fitGuideOverlay) { + fitGuideOverlay.classList.add("hidden"); + resetFitGuideRefreshPlacement(); + tracker.track("drawer_close", { + drawer: "fit-guide-overlay", + method: "overlay", + }); + } + }); + + fitGuideRefreshBtn?.addEventListener("click", () => { + window.location.reload(); + }); + + reviewsScroll.addEventListener( + "scroll", + () => { + tracker.track("reviews_panel_scroll", { + scrollTop: Math.round(reviewsScroll.scrollTop), + }); + trackReviewsGeometry(); + }, + { passive: true }, + ); + + fitGuideScroll.addEventListener( + "scroll", + () => { + tracker.track("fit_guide_scroll", { + scrollTop: Math.round(fitGuideScroll.scrollTop), + }); + trackFitGuideGeometry(); + }, + { passive: true }, + ); + + document.querySelectorAll(".review-action").forEach((button) => { + button.addEventListener("click", () => { + const reviewId = button.dataset.reviewId; + const card = button.closest(".review-card"); + const title = card?.querySelector("h3")?.textContent?.trim() || ""; + + tracker.track("review_helpful", { + reviewId, + title, + helpful: true, + }); + + button.textContent = "Helpful saved"; + button.classList.add("is-complete"); + button.disabled = true; + card?.classList.add("is-complete"); + showFeedback(reviewsFeedback, `Saved "${title}" as Helpful.`); + }); + }); + + saveFitGuideBtn?.addEventListener("click", () => { + tracker.track("fit_guide_save", { + section: "care-wash", + }); + + saveFitGuideBtn.textContent = "Guide saved"; + saveFitGuideBtn.classList.add("is-complete"); + saveFitGuideBtn.disabled = true; + careSection?.classList.add("is-complete"); + showFeedback( + fitGuideFeedback, + "Care & Wash saved. This section is now marked as complete.", + ); + }); + + trackGeometryReadyOnce(); +}); diff --git a/eval/server.py b/eval/server.py index 5eaea3b..9bd38df 100644 --- a/eval/server.py +++ b/eval/server.py @@ -13,20 +13,94 @@ 4. Export events via /api/events endpoint """ -import http.server -import socketserver import html +import http.server import json import os +import socketserver +import threading +from copy import deepcopy from datetime import datetime -from urllib.parse import urlparse, parse_qs +from urllib.parse import parse_qs, urlparse # Configuration PORT = 16605 EVAL_DIR = os.path.dirname(os.path.abspath(__file__)) # In-memory event storage -events_store = {"events": [], "sessions": {}} +events_store = {"events": [], "sessions": {}, "sites": {}} +events_store_lock = threading.Lock() + +SITE_NAME_TO_BUCKET = { + "globalbusinessreview.com": "gbr", + "techforum.com": "techforum", + "cloudstack.com": "cloudstack", + "dataflow.io": "dataflow", + "finviz": "finviz", + "bluebook.life": "bluebook", + "northstaroutfitters.com": "northstar", +} + + +def _normalize_site_bucket(raw_value): + """Normalize a site/path/domain into a mock-site bucket key.""" + if not raw_value or not isinstance(raw_value, str): + return None + + parsed = urlparse(raw_value) + candidate = parsed.path if parsed.scheme or parsed.netloc else raw_value + candidate = candidate.strip() + + if candidate.startswith("/"): + segments = [segment for segment in candidate.split("/") if segment] + if segments: + return segments[0] + + normalized = candidate.strip().lower() + return SITE_NAME_TO_BUCKET.get(normalized, normalized or None) + + +def _get_event_site_bucket(event): + """Infer the mock-site bucket for one tracked event.""" + for key in ("page", "url", "site"): + bucket = _normalize_site_bucket(event.get(key)) + if bucket: + return bucket + return "unknown" + + +def _get_or_create_site_store(site_bucket): + """Return the per-site event store, creating it if needed.""" + return events_store["sites"].setdefault(site_bucket, {"events": [], "sessions": {}}) + + +def _snapshot_events(site_bucket=None): + """Return a JSON-safe snapshot of tracked events.""" + with events_store_lock: + if site_bucket: + site_store = events_store["sites"].get( + site_bucket, {"events": [], "sessions": {}} + ) + return { + "site": site_bucket, + "events": deepcopy(site_store["events"]), + "sessions": deepcopy(site_store["sessions"]), + } + + return deepcopy(events_store) + + +def _clear_events(site_bucket=None): + """Clear tracked events globally or for a specific site.""" + with events_store_lock: + if site_bucket: + events_store["sites"][site_bucket] = {"events": [], "sessions": {}} + return + + events_store["events"] = [] + events_store["sessions"] = {} + events_store["sites"] = {} + # URL mappings URL_MAPPINGS = { @@ -177,6 +251,8 @@ "/finviz/index.html": ("/finviz/index.html", "text/html"), "/bluebook/": ("/bluebook/index.html", "text/html"), "/bluebook/index.html": ("/bluebook/index.html", "text/html"), + "/northstar/": ("/northstar/index.html", "text/html"), + "/northstar/index.html": ("/northstar/index.html", "text/html"), } CSS_MIMETYPE = "text/css" @@ -197,16 +273,25 @@ def do_GET(self): """Handle GET requests""" parsed_path = urlparse(self.path) path = parsed_path.path + query_params = parse_qs(parsed_path.query) + site_bucket = _normalize_site_bucket(query_params.get("site", [None])[0]) # API endpoints if path == "/api/events": - self.send_json_response(events_store) + self.send_json_response(_snapshot_events(site_bucket)) return elif path == "/api/events/clear": - events_store["events"] = [] - events_store["sessions"] = {} + _clear_events(site_bucket) self.send_json_response( - {"status": "cleared", "message": "All events cleared"} + { + "status": "cleared", + "site": site_bucket, + "message": ( + f"Events cleared for site '{site_bucket}'" + if site_bucket + else "All events cleared" + ), + } ) return elif path == "/api/sites": @@ -248,6 +333,12 @@ def do_GET(self): "url": "/bluebook/", "description": "Xiaohongshu-like feed - test search, note modal, comment actions, and dense visual layouts", }, + { + "name": "northstaroutfitters.com", + "difficulty": "hard", + "url": "/northstar/", + "description": "Apparel product page - test geometry-first scrolling, sticky UI, and drawer-scoped scrolling", + }, ] } self.send_json_response(sites) @@ -255,8 +346,8 @@ def do_GET(self): elif path == "/api/help": help_text = { "endpoints": { - "GET /api/events": "Get all tracked events", - "GET /api/events/clear": "Clear all events", + "GET /api/events": "Get tracked events (optional ?site=)", + "GET /api/events/clear": "Clear tracked events (optional ?site=)", "GET /api/sites": "List available mock sites", "GET /api/help": "Show this help", "POST /api/track": "Submit tracking event (from browser)", @@ -268,6 +359,7 @@ def do_GET(self): "/dataflow/": "DataFlow analytics dashboard mock (medium)", "/finviz/": "Finviz stock screener mock (hard)", "/bluebook/": "BlueBook lifestyle feed mock (hard)", + "/northstar/": "Northstar Outfitters product page mock (hard)", }, } self.send_json_response(help_text) @@ -317,6 +409,7 @@ def do_GET(self): "dataflow", "finviz", "bluebook", + "northstar", ]: if path.startswith(f"/{site}/js/") and path.endswith(".js"): self.send_file(path, JS_MIMETYPE) @@ -330,6 +423,7 @@ def do_GET(self): "dataflow", "finviz", "bluebook", + "northstar", ]: if path.startswith(f"/{site}/css/") and path.endswith(".css"): self.send_file(path, CSS_MIMETYPE) @@ -364,22 +458,42 @@ def do_POST(self): try: event = json.loads(post_data.decode("utf-8")) event["received_at"] = datetime.now().isoformat() - events_store["events"].append(event) - # Track sessions + site_bucket = _get_event_site_bucket(event) session_id = event.get("sessionId", "unknown") - if session_id not in events_store["sessions"]: - events_store["sessions"][session_id] = { - "sessionId": session_id, - "site": event.get("site", "unknown"), - "difficulty": event.get("difficulty", "unknown"), - "start_time": event.get("timestamp"), - "events_count": 0, - } - events_store["sessions"][session_id]["events_count"] += 1 - events_store["sessions"][session_id]["last_activity"] = event.get( - "timestamp" - ) + + with events_store_lock: + events_store["events"].append(event) + + if session_id not in events_store["sessions"]: + events_store["sessions"][session_id] = { + "sessionId": session_id, + "site": event.get("site", "unknown"), + "site_bucket": site_bucket, + "difficulty": event.get("difficulty", "unknown"), + "start_time": event.get("timestamp"), + "events_count": 0, + } + events_store["sessions"][session_id]["events_count"] += 1 + events_store["sessions"][session_id]["last_activity"] = event.get( + "timestamp" + ) + + site_store = _get_or_create_site_store(site_bucket) + site_store["events"].append(event) + if session_id not in site_store["sessions"]: + site_store["sessions"][session_id] = { + "sessionId": session_id, + "site": event.get("site", "unknown"), + "site_bucket": site_bucket, + "difficulty": event.get("difficulty", "unknown"), + "start_time": event.get("timestamp"), + "events_count": 0, + } + site_store["sessions"][session_id]["events_count"] += 1 + site_store["sessions"][session_id]["last_activity"] = event.get( + "timestamp" + ) self.send_json_response({"status": "ok", "message": "Event tracked"}) except Exception as e: @@ -726,27 +840,32 @@ def print_startup_info(port): print("Mock Websites Server for AI Agent Evaluation") print("=" * 60) print(f"\nServer started at: http://localhost:{port}") - print(f"\nAvailable Sites:") + print("\nAvailable Sites:") print(f" - GBR (Easy): http://localhost:{port}/gbr/") print(f" - TechForum (Medium): http://localhost:{port}/techforum/") print(f" - CloudStack (Hard): http://localhost:{port}/cloudstack/") print(f" - DataFlow (Medium): http://localhost:{port}/dataflow/") print(f" - Finviz (Hard): http://localhost:{port}/finviz/") print(f" - BlueBook (Hard): http://localhost:{port}/bluebook/") - print(f"\nAPI Endpoints:") - print(f" - GET http://localhost:{port}/api/events - Get all tracked events") - print(f" - GET http://localhost:{port}/api/events/clear - Clear all events") + print("\nAPI Endpoints:") + print( + f" - GET http://localhost:{port}/api/events - Get tracked events (?site=gbr)" + ) + print( + f" - GET http://localhost:{port}/api/events/clear - Clear tracked events (?site=gbr)" + ) print(f" - GET http://localhost:{port}/api/sites - List available sites") print(f" - GET http://localhost:{port}/api/help - API help") print(f" - POST http://localhost:{port}/api/track - Submit tracking event") - print(f"\nPress Ctrl+C to stop the server") + print("\nPress Ctrl+C to stop the server") print("=" * 60 + "\n") def main(): """Main entry point""" - with socketserver.TCPServer(("", PORT), MockWebsiteHandler) as httpd: + with socketserver.ThreadingTCPServer(("", PORT), MockWebsiteHandler) as httpd: httpd.allow_reuse_address = True + httpd.daemon_threads = True print_startup_info(PORT) try: diff --git a/extension/src/__tests__/background-cleanup-regression.test.ts b/extension/src/__tests__/background-cleanup-regression.test.ts index 7938fe4..9583b2c 100644 --- a/extension/src/__tests__/background-cleanup-regression.test.ts +++ b/extension/src/__tests__/background-cleanup-regression.test.ts @@ -26,9 +26,48 @@ describe('Background cleanup regressions', () => { test('swipe screenshots reuse tab-view warmup capture options', () => { expect(backgroundSource).toContain("case 'swipe_element': {"); - expect(backgroundSource).toContain(' 900,'); + expect(backgroundSource).toContain("logLabel: 'SwipeElement'"); + expect(backgroundSource).toContain('preconditionWaitForRender: 900'); expect(backgroundSource).toContain( - ' TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS,', + 'preconditionCaptureOptions: TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS', + ); + }); + + test('highlight warmup defaults to tab-view capture options', () => { + expect(backgroundSource).toContain( + 'captureOptions = TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS', + ); + expect(backgroundSource).toContain( + 'preconditionCaptureOptions ?? TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS', + ); + expect(backgroundSource).toContain( + 'const screenshotResult = await captureScreenshot(', + ); + expect(backgroundSource).not.toContain( + 'HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS', + ); + }); + + test('navigation defaults prime the page with a raw screenshot before highlight', () => { + expect(backgroundSource).toContain('async function runRawScreenshotPrime('); + expect(backgroundSource).toContain('primeWithRawScreenshot: true'); + expect(backgroundSource).toContain("logLabel: 'Tab Init'"); + expect(backgroundSource).toContain("logLabel: 'Tab Open'"); + expect(backgroundSource).toContain("logLabel: 'Tab Refresh'"); + }); + + test('post-action screenshots reuse the default highlighted page-state helper', () => { + expect(backgroundSource).toContain( + 'async function captureDefaultHighlightedPageState(', + ); + expect(backgroundSource).toContain("logLabel: 'ClickElement'"); + expect(backgroundSource).toContain("logLabel: 'Tab Init'"); + }); + + test('tab view still uses the raw screenshot path', () => { + expect(backgroundSource).toContain("case 'view': {"); + expect(backgroundSource).toContain( + 'const viewScreenshotResult = await captureScreenshot(', ); }); }); diff --git a/extension/src/__tests__/highlight-integration.test.ts b/extension/src/__tests__/highlight-integration.test.ts index 2dfdb57..61c683f 100644 --- a/extension/src/__tests__/highlight-integration.test.ts +++ b/extension/src/__tests__/highlight-integration.test.ts @@ -109,8 +109,8 @@ describe('Highlight Integration', () => { // Run selectCollisionFreePage const page1 = selectCollisionFreePage(elements, 1); - expect(page1.map((element) => element.id)).toEqual( - page1.map((_, index) => String(index + 1)), + expect(new Set(page1.map((element) => element.id)).size).toBe( + page1.length, ); // Verify no label collisions on the same page @@ -161,15 +161,19 @@ describe('Highlight Integration', () => { const positions = new Set(page1.map((e) => e.labelPosition)); expect(positions.size).toBe(page1.length); - // Verify elements on different pages while numeric ids reset per page. + // Verify elements on different pages while preserving each element's ID. const page1Selectors = new Set(page1.map((e) => e.selector)); + const expectedIdsBySelector = Object.fromEntries( + elements.map((element) => [element.selector, element.id]), + ); const page2 = selectCollisionFreePage(elements, 2); expect(page2.length).toBeGreaterThan(0); - expect(page2.map((element) => element.id)).toEqual( - page2.map((_, index) => String(index + 1)), - ); for (const elem of page2) { expect(page1Selectors.has(elem.selector)).toBe(false); + expect(expectedIdsBySelector[elem.selector]).toBe(elem.id); + } + for (const elem of page1) { + expect(expectedIdsBySelector[elem.selector]).toBe(elem.id); } }); @@ -259,6 +263,16 @@ describe('Highlight Integration', () => { // Both should fit with appropriate label positions expect(result.length).toBeGreaterThan(0); }); + + test('should separate partially overlapping non-nested boxes across pages', () => { + const elemA = createElement('overlap-a', 'clickable', 100, 100, 120, 40); + const elemB = createElement('overlap-b', 'clickable', 180, 110, 120, 40); + const elements = [elemA, elemB]; + + expect(calculateTotalPages(elements, 1280, 720)).toBe(2); + expect(selectCollisionFreePage(elements, 1, 1280, 720)).toHaveLength(1); + expect(selectCollisionFreePage(elements, 2, 1280, 720)).toHaveLength(1); + }); }); describe('Label placement algorithm', () => { @@ -331,7 +345,7 @@ describe('Highlight Integration', () => { const page1 = selectCollisionFreePage(elements, 1, 1728, 891); - expect(page1.map((e) => e.id)).toEqual(['1', '2', '3']); + expect(page1.map((e) => e.id)).toEqual(['modal', 'like', 'reply']); expect(page1[0].labelPosition).toBeDefined(); expect(page1[1].labelPosition).toBeDefined(); expect(page1[2].labelPosition).toBeDefined(); @@ -356,6 +370,36 @@ describe('Highlight Integration', () => { // Should not use 'left' position (would be outside viewport) expect(leftElem?.labelPosition).not.toBe('left'); }); + + test('should treat one-pixel label-to-element gaps as blocked', () => { + const upper = createElement('upper', 'clickable', 100, 44, 80, 30); + const lower = createElement('lower', 'clickable', 100, 101, 80, 30); + + const result = selectCollisionFreePage([upper, lower], 1, 1280, 720); + + expect(findBySelector(result, '#upper')?.labelPosition).toBe('above'); + expect(findBySelector(result, '#lower')?.labelPosition).toBe('below'); + }); + + test('should treat one-pixel label-to-label gaps as blocked', () => { + const left = createElement('AAAAAA', 'clickable', 100, 100, 24, 14); + const leftLabel = getLabelBBox(left.bbox, 'above', left.id); + const right = createElement( + 'CCCCCC', + 'clickable', + leftLabel.x + leftLabel.width + 1, + 100, + 24, + 14, + ); + + const result = selectCollisionFreePage([left, right], 1, 1280, 720); + + expect(findBySelector(result, '#AAAAAA')?.labelPosition).not.toBe( + 'above', + ); + expect(findBySelector(result, '#CCCCCC')?.labelPosition).toBe('above'); + }); }); describe('Edge cases', () => { diff --git a/extension/src/__tests__/highlight-padding.test.ts b/extension/src/__tests__/highlight-padding.test.ts index 836756a..86543ca 100644 --- a/extension/src/__tests__/highlight-padding.test.ts +++ b/extension/src/__tests__/highlight-padding.test.ts @@ -13,18 +13,17 @@ import { /** * TDD Tests for Padding Consistency - * Bug: BASE_LABEL_PADDING is 4, should be 5 to match LABEL_PADDING - * Label height = 16 (font) + 5 (padding) * 2 = 26px + * Label height = 16 (font) + 3 (padding) * 2 = 22px */ describe('Padding Consistency', () => { describe('Collision Detection Constants (background/index.ts)', () => { - test('should have LABEL_HEIGHT of 26px', () => { - expect(LABEL_HEIGHT).toBe(26); + test('should have LABEL_HEIGHT of 22px', () => { + expect(LABEL_HEIGHT).toBe(22); }); - test('should have LABEL_PADDING of 5px', () => { - expect(LABEL_PADDING).toBe(5); + test('should have LABEL_PADDING of 3px', () => { + expect(LABEL_PADDING).toBe(3); }); test('should have LABEL_FONT_SIZE of 16px', () => { @@ -33,8 +32,8 @@ describe('Padding Consistency', () => { }); describe('Visual Drawing Constants (visual-highlight.ts)', () => { - test('should have BASE_LABEL_PADDING of 5px', () => { - expect(BASE_LABEL_PADDING).toBe(5); + test('should have BASE_LABEL_PADDING of 3px', () => { + expect(BASE_LABEL_PADDING).toBe(3); }); test('should have BASE_FONT_SIZE of 16px', () => { @@ -43,7 +42,7 @@ describe('Padding Consistency', () => { test('should calculate label height consistently', () => { const expectedLabelHeight = BASE_FONT_SIZE + BASE_LABEL_PADDING * 2; - expect(expectedLabelHeight).toBe(26); + expect(expectedLabelHeight).toBe(22); }); }); @@ -57,7 +56,7 @@ describe('Padding Consistency', () => { const visualLabelHeight = BASE_FONT_SIZE + BASE_LABEL_PADDING * 2; expect(collisionLabelHeight).toBe(visualLabelHeight); - expect(collisionLabelHeight).toBe(26); + expect(collisionLabelHeight).toBe(22); }); }); }); diff --git a/extension/src/__tests__/highlight-placement.test.ts b/extension/src/__tests__/highlight-placement.test.ts index 7514a74..cf43175 100644 --- a/extension/src/__tests__/highlight-placement.test.ts +++ b/extension/src/__tests__/highlight-placement.test.ts @@ -9,6 +9,7 @@ import { selectCollisionFreePage, } from '../utils/collision-detection'; import type { InteractiveElement } from '../types'; +import { generateShortHash } from '../commands/element-id'; import { getLabelDimensions } from '../utils/label-geometry'; /** @@ -23,18 +24,19 @@ import { getLabelDimensions } from '../utils/label-geometry'; // Helper to create a minimal InteractiveElement function createElement( - id: string, + selectorName: string, x: number, y: number, width: number, height: number, labelPosition?: 'above' | 'below' | 'left' | 'right', ): InteractiveElement { + const selector = `#${selectorName}`; return { - id, + id: generateShortHash(selector), type: 'clickable', tagName: 'button', - selector: `#${id}`, + selector, bbox: { x, y, width, height }, isVisible: true, isInViewport: true, @@ -158,9 +160,9 @@ describe('Smart Label Placement', () => { expect(result).toHaveLength(2); expect(result[0]?.selector).toBe('#constrained'); - expect(result[0]?.id).toBe('1'); + expect(result[0]?.id).toMatch(/^[0-9A-Z]{3}$/); expect(result[1]?.selector).toBe('#flexible'); - expect(result[1]?.id).toBe('2'); + expect(result[1]?.id).toMatch(/^[0-9A-Z]{3}$/); }); test('should place label above when space available (default)', () => { @@ -172,20 +174,22 @@ describe('Smart Label Placement', () => { expect(result[0].labelPosition).toBe('above'); }); - test('should place label below when above collides with another element', () => { + test('should place one label below when two identical elements would both prefer above', () => { // Element A at (100, 100) - label above at y=74-100 // Element B at (100, 100) - same position as A, label above would collide - // Element B should try below instead + // The layout should split them across above/below instead of dropping one. const elemA = createElement('a', 100, 100, 50, 30); const elemB = createElement('b', 100, 100, 50, 30); const elements = [elemA, elemB]; const result = selectCollisionFreePage(elements, 1); - // Both elements should be on page 1 with different label positions + // Both elements should be on page 1 with different label positions. expect(result).toHaveLength(2); - const resultB = findBySelector(result, '#b'); - expect(resultB?.labelPosition).toBe('below'); + expect(result.map((element) => element.labelPosition).sort()).toEqual([ + 'above', + 'below', + ]); }); test('should place label left when above and below collide', () => { @@ -239,8 +243,8 @@ describe('Smart Label Placement', () => { // Element completely surrounded in input order. The constraint-aware // heuristic should reorder placements so the center element still fits. const center = createElement('center', 200, 100, 50, 30); - const above = createElement('above', 200, 74, 50, 30); - const below = createElement('below', 200, 130, 50, 30); + const above = createElement('above', 200, 64, 50, 30); + const below = createElement('below', 200, 140, 50, 30); const left = createElement('left', 80, 100, 50, 30); const right = createElement('right', 320, 100, 50, 30); @@ -367,12 +371,14 @@ describe('Smart Label Placement', () => { // All should fit without collision expect(result).toHaveLength(3); - expect(result.map((element) => element.id)).toEqual(['1', '2', '3']); expect(result.map((element) => element.selector)).toEqual([ '#a', '#b', '#c', ]); + expect(result.every((element) => /^[0-9A-Z]{3}$/.test(element.id))).toBe( + true, + ); }); }); }); diff --git a/extension/src/__tests__/label-geometry.test.ts b/extension/src/__tests__/label-geometry.test.ts new file mode 100644 index 0000000..b79ae2e --- /dev/null +++ b/extension/src/__tests__/label-geometry.test.ts @@ -0,0 +1,18 @@ +import { describe, expect, test } from 'bun:test'; + +import { getLabelDimensions, getLabelTextWidth } from '../utils/label-geometry'; + +describe('label-geometry', () => { + test('measures wide glyphs wider than narrow glyphs', () => { + expect(getLabelTextWidth('III')).toBeLessThan(getLabelTextWidth('WWW')); + expect(getLabelTextWidth('111')).toBeLessThan(getLabelTextWidth('MQH')); + }); + + test('allocates enough width for measured label text plus padding', () => { + const dimensions = getLabelDimensions('MQH'); + + expect(dimensions.width).toBe( + dimensions.textWidth + dimensions.padding * 2, + ); + }); +}); diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts index 992a006..c725b93 100644 --- a/extension/src/background/index.ts +++ b/extension/src/background/index.ts @@ -23,7 +23,8 @@ import { clearScreenshotCache } from '../commands/computer'; import { drawHighlights } from '../commands/visual-highlight'; import { highlightSingleElement } from '../commands/single-highlight'; import { elementCache } from '../commands/element-cache'; -import { assignSequentialElementIds } from '../commands/element-id'; +import { assignHashedElementIds } from '../commands/element-id'; +import { buildElementCacheMissMessage } from '../commands/element-cache'; import { buildHighlightDetectionScript, filterHighlightElementsByKeywords, @@ -59,11 +60,15 @@ import { type HighlightPageState, } from '../utils/layout-stability'; import { - HIGHLIGHT_PRECONDITION_CAPTURE_OPTIONS, - HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS, TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS, + type ScreenshotCaptureOptions, } from '../utils/highlight-screenshot'; -import type { Command, CommandResponse, InteractiveElement } from '../types'; +import type { + Command, + CommandResponse, + ElementType, + InteractiveElement, +} from '../types'; console.log('🚀 OpenBrowser extension starting (Strict Mode)...'); const SERVER_HTTP_URL = 'http://127.0.0.1:8765'; @@ -87,13 +92,24 @@ async function compressScreenshotResult( async function runHighlightPreconditionWarmup(options: { tabId: number; conversationId: string; - elementType: string; + elementType: ElementType; page: number; + waitForRender?: number; + captureOptions?: ScreenshotCaptureOptions; + logLabel?: string; }): Promise { - const { tabId, conversationId, elementType, page } = options; + const { + tabId, + conversationId, + elementType, + page, + waitForRender = 350, + captureOptions = TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS, + logLabel = 'HighlightElements', + } = options; const warmupStart = Date.now(); console.log( - `🔥 [HighlightElements] Starting screenshot warmup precondition for elementType=${elementType}, page=${page}`, + `🔥 [${logLabel}] Starting screenshot warmup precondition for elementType=${elementType}, page=${page}`, ); await captureScreenshot( @@ -102,12 +118,44 @@ async function runHighlightPreconditionWarmup(options: { true, 90, false, - 350, - HIGHLIGHT_PRECONDITION_CAPTURE_OPTIONS, + waitForRender, + captureOptions, ); console.log( - `🔥 [HighlightElements] Screenshot warmup precondition completed in ${Date.now() - warmupStart}ms`, + `🔥 [${logLabel}] Screenshot warmup precondition completed in ${Date.now() - warmupStart}ms`, + ); +} + +async function runRawScreenshotPrime(options: { + tabId: number; + conversationId: string; + waitForRender?: number; + captureOptions?: ScreenshotCaptureOptions; + logLabel?: string; +}): Promise { + const { + tabId, + conversationId, + waitForRender = 350, + captureOptions = TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS, + logLabel = 'HighlightPrime', + } = options; + const primeStart = Date.now(); + console.log(`🔥 [${logLabel}] Starting raw screenshot wake-up prime`); + + await captureScreenshot( + tabId, + conversationId, + true, + 90, + false, + waitForRender, + captureOptions, + ); + + console.log( + `🔥 [${logLabel}] Raw screenshot wake-up prime completed in ${Date.now() - primeStart}ms`, ); } @@ -127,11 +175,7 @@ function buildStoredHighlightPages(options: { } = options; if (keywordMode) { - return [ - assignSequentialElementIds( - sortElementsByVisualOrder(assignSequentialElementIds(filteredElements)), - ), - ]; + return [sortElementsByVisualOrder(filteredElements)]; } const pages: InteractiveElement[][] = []; @@ -142,9 +186,7 @@ function buildStoredHighlightPages(options: { viewportWidth, viewportHeight, ); - pages.push( - assignSequentialElementIds(sortElementsByVisualOrder(pageElements)), - ); + pages.push(sortElementsByVisualOrder(pageElements)); } return pages; @@ -196,6 +238,420 @@ function buildHighlightConsistencyScript( `; } +interface ScreenshotPayload { + screenshot?: string; + dialog_auto_accepted?: unknown; + dialog_auto_accepted_list?: unknown; +} + +interface HighlightedPageStateData extends ScreenshotPayload { + elements: InteractiveElement[]; + totalElements: number; + totalPages: number; + page: number; + pageState: HighlightPageState; + readinessReasons: string[]; +} + +interface HighlightedPageCaptureOptions { + tabId: number; + conversationId: string; + elementType?: ElementType; + page?: number; + keywords?: string[]; + logLabel?: string; + preconditionWaitForRender?: number; + preconditionCaptureOptions?: ScreenshotCaptureOptions; +} + +function buildScreenshotPayload( + screenshotResult: + | { + imageData?: string; + dialog_auto_accepted?: unknown; + dialog_auto_accepted_list?: unknown; + } + | null + | undefined, +): ScreenshotPayload { + return { + screenshot: screenshotResult?.imageData, + ...(screenshotResult?.dialog_auto_accepted + ? { + dialog_auto_accepted: screenshotResult.dialog_auto_accepted, + } + : {}), + ...(screenshotResult?.dialog_auto_accepted_list + ? { + dialog_auto_accepted_list: screenshotResult.dialog_auto_accepted_list, + } + : {}), + }; +} + +async function captureHighlightedPageState( + options: HighlightedPageCaptureOptions, +): Promise { + const { + tabId, + conversationId, + elementType = 'any', + page = 1, + keywords, + logLabel = 'HighlightElements', + preconditionWaitForRender, + preconditionCaptureOptions, + } = options; + + await tabManager.ensureTabManaged(tabId, conversationId); + tabManager.updateTabActivity(tabId, conversationId); + + const detectionScript = buildHighlightDetectionScript({ + elementType, + }); + + await runHighlightPreconditionWarmup({ + tabId, + conversationId, + elementType, + page, + waitForRender: preconditionWaitForRender, + captureOptions: preconditionCaptureOptions, + logLabel, + }); + + const maxHighlightAttempts = 3; + const highlightDetectionTimeoutMs = 18000; + let previousConsistency: HighlightConsistencyResult | null = null; + + for (let attempt = 1; attempt <= maxHighlightAttempts; attempt++) { + console.log(`🔁 [${logLabel}] Attempt ${attempt}/${maxHighlightAttempts}`); + + const detectionResult = await javascript.executeJavaScript( + tabId, + conversationId, + detectionScript, + true, + true, + highlightDetectionTimeoutMs, + ); + + if (!detectionResult.success || !detectionResult.result?.value) { + throw new Error(detectionResult.error || 'Failed to detect elements'); + } + + const allElements = detectionResult.result.value.elements || []; + const detectedDocumentId = + typeof detectionResult.result.value.documentId === 'string' + ? detectionResult.result.value.documentId + : ''; + const detectedViewport = detectionResult.result.value.viewport || {}; + const layoutStability = detectionResult.result.value.layoutStability; + const highlightTraceStart = Date.now(); + const detectedViewportWidth = + typeof detectedViewport.width === 'number' ? detectedViewport.width : 0; + const detectedViewportHeight = + typeof detectedViewport.height === 'number' ? detectedViewport.height : 0; + if (layoutStability) { + console.log( + `⏳ [${logLabel}] Readiness snapshot: ${JSON.stringify(layoutStability)}`, + ); + } + + const pageState: HighlightPageState = layoutStability?.state || 'ready'; + const readinessReasons = Array.isArray(layoutStability?.reasons) + ? layoutStability.reasons + : []; + + if (pageState === 'not_ready' && attempt < maxHighlightAttempts) { + const retryDelayMs = getHighlightReadinessRetryDelay(attempt); + console.warn( + `⚠️ [${logLabel}] Readiness state is not_ready (${readinessReasons.join(', ') || 'no reasons'}), retrying in ${retryDelayMs}ms (attempt ${attempt}/${maxHighlightAttempts})`, + ); + await new Promise((resolve) => setTimeout(resolve, retryDelayMs)); + continue; + } + + const keywordFilterStart = Date.now(); + const keywordFiltering = filterHighlightElementsByKeywords( + allElements, + keywords, + ); + const keywordList = keywordFiltering.keywords; + const filteredElements = assignHashedElementIds(keywordFiltering.elements); + + if (keywordList.length > 0) { + console.log( + `🔍 [${logLabel}] Keywords [${keywordList.join(', ')}] matched ${filteredElements.length} of ${allElements.length} elements`, + ); + } + console.log( + `⏱️ [HighlightTrace] background keyword-filter ${Date.now() - keywordFilterStart}ms (keywords=${keywordList.length}, kept=${filteredElements.length}/${allElements.length})`, + ); + + let paginatedElements: InteractiveElement[]; + let totalPages: number; + let currentPage = page; + + if (keywordList.length > 0) { + paginatedElements = filteredElements; + totalPages = 1; + currentPage = 1; + console.log( + `🔍 [${logLabel}] Keywords [${keywordList.join(', ')}] matched ${paginatedElements.length} elements (no pagination)`, + ); + } else { + const paginationSelectionStart = Date.now(); + paginatedElements = selectCollisionFreePage( + filteredElements, + page, + detectedViewportWidth, + detectedViewportHeight, + ); + const paginationSelectionMs = Date.now() - paginationSelectionStart; + const totalPagesStart = Date.now(); + totalPages = calculateTotalPages( + filteredElements, + detectedViewportWidth, + detectedViewportHeight, + ); + const totalPagesMs = Date.now() - totalPagesStart; + console.log( + `📄 [${logLabel}] Page ${page}/${totalPages}, showing ${paginatedElements.length} of ${filteredElements.length} elements`, + ); + console.log( + `⏱️ [HighlightTrace] background pagination select=${paginationSelectionMs}ms totalPages=${totalPagesMs}ms (page=${page}, viewport=${detectedViewportWidth}x${detectedViewportHeight})`, + ); + } + + const screenshotStart = Date.now(); + const screenshotResult = await captureScreenshot( + tabId, + conversationId, + true, + 90, + false, + 0, + TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS, + ); + + if (!screenshotResult?.success || !screenshotResult?.imageData) { + throw new Error( + `Failed to capture screenshot: ${screenshotResult?.success === false ? 'Screenshot command failed' : 'No image data returned'}`, + ); + } + console.log( + `📸 [${logLabel}] Screenshot captured, size: ${screenshotResult.imageData.length} bytes`, + ); + console.log( + `⏱️ [HighlightTrace] background screenshot ${Date.now() - screenshotStart}ms`, + ); + + const imageScale = + screenshotResult.metadata?.imageScale || + screenshotResult.metadata?.devicePixelRatio || + 1; + const viewportWidth = screenshotResult.metadata?.viewportWidth || 0; + const viewportHeight = screenshotResult.metadata?.viewportHeight || 0; + console.log(`📐 [${logLabel}] Image scale: ${imageScale}`); + console.log( + `📐 [${logLabel}] Viewport: ${viewportWidth}x${viewportHeight} CSS pixels`, + ); + console.log( + `📐 [${logLabel}] Expected image size: ${Math.round(viewportWidth * imageScale)}x${Math.round(viewportHeight * imageScale)} device pixels`, + ); + + const consistencyCheckStart = Date.now(); + const consistencyScript = + buildHighlightConsistencyScript(paginatedElements); + const consistencyResult = await javascript.executeJavaScript( + tabId, + conversationId, + consistencyScript, + true, + false, + 2000, + ); + const currentConsistencySamples = + consistencyResult.success && + consistencyResult.result?.value?.samples && + Array.isArray(consistencyResult.result.value.samples) + ? consistencyResult.result.value.samples + : []; + const highlightConsistency = evaluateHighlightConsistency( + paginatedElements + .slice(0, HIGHLIGHT_CONSISTENCY_CONFIG.maxSampleSize) + .map((element) => ({ + id: element.id, + bbox: element.bbox, + })), + currentConsistencySamples, + ); + console.log( + `⏱️ [HighlightTrace] background consistency-check ${Date.now() - consistencyCheckStart}ms (checked=${highlightConsistency.checkedCount}, matched=${highlightConsistency.matchedCount}, missing=${highlightConsistency.missingCount}, shifted=${highlightConsistency.shiftedCount}, maxCenterShift=${highlightConsistency.maxCenterShift}, maxSizeDelta=${highlightConsistency.maxSizeDelta}, retry=${highlightConsistency.shouldRetry})`, + ); + const repeatedDrift = isRepeatedHighlightDrift( + highlightConsistency, + previousConsistency, + ); + + if ( + highlightConsistency.shouldRetry && + attempt < maxHighlightAttempts && + !repeatedDrift + ) { + previousConsistency = highlightConsistency; + console.warn( + `⚠️ [${logLabel}] Layout drift detected after screenshot, retrying (attempt ${attempt}/${maxHighlightAttempts})`, + ); + continue; + } + + if (highlightConsistency.shouldRetry) { + console.warn( + repeatedDrift + ? `⚠️ [${logLabel}] Layout drift repeated with near-identical metrics, returning latest screenshot` + : `⚠️ [${logLabel}] Layout drift still detected on final attempt, returning latest screenshot`, + ); + } + + const storedPages = buildStoredHighlightPages({ + filteredElements, + totalPages, + viewportWidth: detectedViewportWidth, + viewportHeight: detectedViewportHeight, + keywordMode: keywordList.length > 0, + }); + const displayOrderedElements = storedPages[currentPage - 1] ?? []; + + const cacheStoreStart = Date.now(); + const storedPage = elementCache.storeHighlightResult({ + conversationId, + tabId, + documentId: detectedDocumentId, + elementType, + keywords: keywordList, + totalElements: filteredElements.length, + totalPages, + pages: storedPages, + page: currentPage, + }); + console.log( + `⏱️ [HighlightTrace] background cache-store ${Date.now() - cacheStoreStart}ms (page=${storedPage.page}, count=${displayOrderedElements.length})`, + ); + + if (displayOrderedElements.length > 0) { + console.log( + `📍 [${logLabel}] First element bbox:`, + JSON.stringify(displayOrderedElements[0].bbox), + ); + } + + const drawHighlightsStart = Date.now(); + const highlightedScreenshot = await drawHighlights( + screenshotResult.imageData, + storedPage.elements, + { + scale: imageScale, + viewportWidth, + viewportHeight, + }, + ); + console.log( + `⏱️ [HighlightTrace] background draw-highlights ${Date.now() - drawHighlightsStart}ms (elements=${storedPage.elements.length})`, + ); + + const compressStart = Date.now(); + const compressedScreenshotResult = await compressScreenshotResult({ + imageData: highlightedScreenshot, + dialog_auto_accepted: screenshotResult.dialog_auto_accepted, + dialog_auto_accepted_list: screenshotResult.dialog_auto_accepted_list, + }); + console.log( + `⏱️ [HighlightTrace] background compress ${Date.now() - compressStart}ms`, + ); + console.log( + `⏱️ [HighlightTrace] background total ${Date.now() - highlightTraceStart}ms`, + ); + + return { + elements: storedPage.elements, + totalElements: filteredElements.length, + totalPages, + page: currentPage, + pageState, + readinessReasons, + ...buildScreenshotPayload(compressedScreenshotResult), + }; + } + + throw new Error('Failed to produce a stable highlight screenshot'); +} + +async function captureDefaultHighlightedPageState(options: { + tabId: number; + conversationId: string; + logLabel: string; + preconditionWaitForRender?: number; + preconditionCaptureOptions?: ScreenshotCaptureOptions; + primeWithRawScreenshot?: boolean; + primeWaitForRender?: number; + primeCaptureOptions?: ScreenshotCaptureOptions; +}): Promise { + const { + tabId, + conversationId, + logLabel, + preconditionWaitForRender, + preconditionCaptureOptions, + primeWithRawScreenshot = false, + primeWaitForRender, + primeCaptureOptions, + } = options; + const effectivePreconditionWaitForRender = preconditionWaitForRender ?? 350; + const effectivePreconditionCaptureOptions = + preconditionCaptureOptions ?? TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS; + + if (primeWithRawScreenshot) { + await runRawScreenshotPrime({ + tabId, + conversationId, + waitForRender: primeWaitForRender ?? effectivePreconditionWaitForRender, + captureOptions: + primeCaptureOptions ?? effectivePreconditionCaptureOptions, + logLabel: `${logLabel} Prime`, + }); + } + + try { + return await captureHighlightedPageState({ + tabId, + conversationId, + elementType: 'any', + page: 1, + logLabel, + preconditionWaitForRender: effectivePreconditionWaitForRender, + preconditionCaptureOptions: effectivePreconditionCaptureOptions, + }); + } catch (error) { + console.warn( + `⚠️ [${logLabel}] Default any/page 1 highlight failed, falling back to raw screenshot: ${error instanceof Error ? error.message : String(error)}`, + ); + const screenshotResult = await captureScreenshot( + tabId, + conversationId, + true, + 90, + false, + effectivePreconditionWaitForRender, + effectivePreconditionCaptureOptions, + ); + const compressedScreenshotResult = + await compressScreenshotResult(screenshotResult); + return buildScreenshotPayload(compressedScreenshotResult); + } +} + function cleanupTabState(conversationId: string, tabId: number): void { elementCache.invalidate(conversationId, tabId); dialogManager.disableForTab(tabId); @@ -796,16 +1252,12 @@ async function handleCommand(command: Command): Promise { tabManager.setCurrentActiveTabId(conversationId, initResult.tabId); // Capture screenshot after initialization - const initScreenshotResult = await captureScreenshot( - initResult.tabId, + const initPageState = await captureDefaultHighlightedPageState({ + tabId: initResult.tabId, conversationId, - true, - 90, - false, - 0, - ); - const compressedInitScreenshotResult = - await compressScreenshotResult(initScreenshotResult); + logLabel: 'Tab Init', + primeWithRawScreenshot: true, + }); return { success: true, @@ -816,19 +1268,7 @@ async function handleCommand(command: Command): Promise { url: initResult.url, conversationId: conversationId, isManaged: true, - screenshot: compressedInitScreenshotResult?.imageData, - ...(compressedInitScreenshotResult?.dialog_auto_accepted - ? { - dialog_auto_accepted: - compressedInitScreenshotResult.dialog_auto_accepted, - } - : {}), - ...(compressedInitScreenshotResult?.dialog_auto_accepted_list - ? { - dialog_auto_accepted_list: - compressedInitScreenshotResult.dialog_auto_accepted_list, - } - : {}), + ...initPageState, }, timestamp: Date.now(), }; @@ -848,18 +1288,14 @@ async function handleCommand(command: Command): Promise { } // Capture screenshot after opening - const openScreenshotResult = openResult.tabId - ? await captureScreenshot( - openResult.tabId, + const openPageState = openResult.tabId + ? await captureDefaultHighlightedPageState({ + tabId: openResult.tabId, conversationId, - true, - 90, - false, - 0, - ) - : null; - const compressedOpenScreenshotResult = - await compressScreenshotResult(openScreenshotResult); + logLabel: 'Tab Open', + primeWithRawScreenshot: true, + }) + : {}; return { success: true, @@ -867,19 +1303,7 @@ async function handleCommand(command: Command): Promise { data: { ...openResult, conversationId: conversationId, - screenshot: compressedOpenScreenshotResult?.imageData, - ...(compressedOpenScreenshotResult?.dialog_auto_accepted - ? { - dialog_auto_accepted: - compressedOpenScreenshotResult.dialog_auto_accepted, - } - : {}), - ...(compressedOpenScreenshotResult?.dialog_auto_accepted_list - ? { - dialog_auto_accepted_list: - compressedOpenScreenshotResult.dialog_auto_accepted_list, - } - : {}), + ...openPageState, }, timestamp: Date.now(), }; @@ -911,16 +1335,12 @@ async function handleCommand(command: Command): Promise { tabManager.setCurrentActiveTabId(conversationId, command.tab_id); // Capture screenshot after switching - const switchScreenshotResult = await captureScreenshot( - command.tab_id, + const switchPageState = await captureDefaultHighlightedPageState({ + tabId: command.tab_id, conversationId, - true, - 90, - false, - 0, - ); - const compressedSwitchScreenshotResult = - await compressScreenshotResult(switchScreenshotResult); + logLabel: 'Tab Switch', + primeWithRawScreenshot: true, + }); return { success: true, @@ -928,19 +1348,7 @@ async function handleCommand(command: Command): Promise { data: { ...switchResult, conversationId: conversationId, - screenshot: compressedSwitchScreenshotResult?.imageData, - ...(compressedSwitchScreenshotResult?.dialog_auto_accepted - ? { - dialog_auto_accepted: - compressedSwitchScreenshotResult.dialog_auto_accepted, - } - : {}), - ...(compressedSwitchScreenshotResult?.dialog_auto_accepted_list - ? { - dialog_auto_accepted_list: - compressedSwitchScreenshotResult.dialog_auto_accepted_list, - } - : {}), + ...switchPageState, }, timestamp: Date.now(), }; @@ -969,16 +1377,12 @@ async function handleCommand(command: Command): Promise { const refreshResult = await tabs.refreshTab(command.tab_id); // Capture screenshot after refresh - const refreshScreenshotResult = await captureScreenshot( - command.tab_id, + const refreshPageState = await captureDefaultHighlightedPageState({ + tabId: command.tab_id, conversationId, - true, - 90, - false, - 0, - ); - const compressedRefreshScreenshotResult = - await compressScreenshotResult(refreshScreenshotResult); + logLabel: 'Tab Refresh', + primeWithRawScreenshot: true, + }); return { success: true, @@ -986,19 +1390,7 @@ async function handleCommand(command: Command): Promise { data: { ...refreshResult, conversationId: conversationId, - screenshot: compressedRefreshScreenshotResult?.imageData, - ...(compressedRefreshScreenshotResult?.dialog_auto_accepted - ? { - dialog_auto_accepted: - compressedRefreshScreenshotResult.dialog_auto_accepted, - } - : {}), - ...(compressedRefreshScreenshotResult?.dialog_auto_accepted_list - ? { - dialog_auto_accepted_list: - compressedRefreshScreenshotResult.dialog_auto_accepted_list, - } - : {}), + ...refreshPageState, }, timestamp: Date.now(), }; @@ -1091,17 +1483,14 @@ async function handleCommand(command: Command): Promise { ? await tabs.goBack(targetTabId) : await tabs.goForward(targetTabId); - // Capture screenshot after navigation - const screenshotResult = await captureScreenshot( - targetTabId, - conversationId, - true, - 90, - false, - 0, - ); - const compressedNavigationScreenshotResult = - await compressScreenshotResult(screenshotResult); + const navigationPageState = + await captureDefaultHighlightedPageState({ + tabId: targetTabId, + conversationId, + logLabel: + command.action === 'back' ? 'Tab Back' : 'Tab Forward', + primeWithRawScreenshot: true, + }); return { success: true, @@ -1110,19 +1499,7 @@ async function handleCommand(command: Command): Promise { ...navigationResult, tabId: targetTabId, conversationId: conversationId, - screenshot: compressedNavigationScreenshotResult?.imageData, - ...(compressedNavigationScreenshotResult?.dialog_auto_accepted - ? { - dialog_auto_accepted: - compressedNavigationScreenshotResult.dialog_auto_accepted, - } - : {}), - ...(compressedNavigationScreenshotResult?.dialog_auto_accepted_list - ? { - dialog_auto_accepted_list: - compressedNavigationScreenshotResult.dialog_auto_accepted_list, - } - : {}), + ...navigationPageState, }, timestamp: Date.now(), }; @@ -1292,36 +1669,18 @@ async function handleCommand(command: Command): Promise { tabManager.setCurrentActiveTabId(conversationId, screenshotTabId); } - // Always take screenshot - const jsScreenshotResult = await captureScreenshot( - screenshotTabId, + const jsPageState = await captureDefaultHighlightedPageState({ + tabId: screenshotTabId, conversationId, - true, - 90, - false, - 0, - ); - const compressedJsScreenshotResult = - await compressScreenshotResult(jsScreenshotResult); + logLabel: 'JavaScript', + }); return { success: true, message: 'JavaScript executed successfully', data: { ...jsResult, - screenshot: compressedJsScreenshotResult?.imageData, - ...(compressedJsScreenshotResult?.dialog_auto_accepted - ? { - dialog_auto_accepted: - compressedJsScreenshotResult.dialog_auto_accepted, - } - : {}), - ...(compressedJsScreenshotResult?.dialog_auto_accepted_list - ? { - dialog_auto_accepted_list: - compressedJsScreenshotResult.dialog_auto_accepted_list, - } - : {}), + ...jsPageState, }, timestamp: Date.now(), duration: jsDuration, @@ -1391,15 +1750,11 @@ async function handleCommand(command: Command): Promise { console.log(`💬 [HandleDialog] Auto-accepting cascading alert`); await dialogManager.autoAcceptDialog(activeTabId); - // Take screenshot after auto-accept - const screenshotResult = await captureScreenshot( - activeTabId, + const dialogPageState = await captureDefaultHighlightedPageState({ + tabId: activeTabId, conversationId, - true, // include_cursor - 90, // quality - false, // resizeToPreset - 0, // waitForRender - ); + logLabel: 'HandleDialog', + }); return { success: true, @@ -1411,22 +1766,7 @@ async function handleCommand(command: Command): Promise { message: handleResult.newDialog.message, autoAccepted: true, }, - screenshot: await compressIfNeeded( - screenshotResult, - getCompressionThreshold(), - ), - ...(screenshotResult?.dialog_auto_accepted - ? { - dialog_auto_accepted: - screenshotResult.dialog_auto_accepted, - } - : {}), - ...(screenshotResult?.dialog_auto_accepted_list - ? { - dialog_auto_accepted_list: - screenshotResult.dialog_auto_accepted_list, - } - : {}), + ...dialogPageState, }, timestamp: Date.now(), }; @@ -1451,16 +1791,11 @@ async function handleCommand(command: Command): Promise { }; } - // No cascade - dialog handling complete - // Take screenshot to show the result - const screenshotResult = await captureScreenshot( - activeTabId, + const dialogPageState = await captureDefaultHighlightedPageState({ + tabId: activeTabId, conversationId, - true, // include_cursor - 90, // quality - false, // resizeToPreset - 0, // waitForRender - ); + logLabel: 'HandleDialog', + }); console.log( `✅ [HandleDialog] Dialog handling complete, screenshot captured`, @@ -1471,21 +1806,7 @@ async function handleCommand(command: Command): Promise { message: `Dialog handled successfully: ${handleResult.previousDialog.type} ${action}ed`, data: { handledDialog: handleResult.previousDialog, - screenshot: await compressIfNeeded( - screenshotResult, - getCompressionThreshold(), - ), - ...(screenshotResult?.dialog_auto_accepted - ? { - dialog_auto_accepted: screenshotResult.dialog_auto_accepted, - } - : {}), - ...(screenshotResult?.dialog_auto_accepted_list - ? { - dialog_auto_accepted_list: - screenshotResult.dialog_auto_accepted_list, - } - : {}), + ...dialogPageState, }, timestamp: Date.now(), }; @@ -1544,334 +1865,28 @@ async function handleCommand(command: Command): Promise { const keywords = command.keywords; const elementType = command.element_type || 'any'; const page = command.page || 1; - - const detectionScript = buildHighlightDetectionScript({ - elementType, - }); - - await runHighlightPreconditionWarmup({ - tabId: activeTabId, - conversationId, - elementType, - page, - }); - - const maxHighlightAttempts = 3; - const highlightDetectionTimeoutMs = 18000; - let previousConsistency: HighlightConsistencyResult | null = null; - - for (let attempt = 1; attempt <= maxHighlightAttempts; attempt++) { - console.log( - `🔁 [HighlightElements] Attempt ${attempt}/${maxHighlightAttempts}`, - ); - - // Execute detection script in page context - const detectionResult = await javascript.executeJavaScript( - activeTabId, - conversationId, - detectionScript, - true, // returnByValue - true, // awaitPromise - highlightDetectionTimeoutMs, // timeout - ); - - if (!detectionResult.success || !detectionResult.result?.value) { - return { - success: false, - error: detectionResult.error || 'Failed to detect elements', - timestamp: Date.now(), - }; - } - - const allElements = detectionResult.result.value.elements || []; - const detectedDocumentId = - typeof detectionResult.result.value.documentId === 'string' - ? detectionResult.result.value.documentId - : ''; - const detectedViewport = detectionResult.result.value.viewport || {}; - const layoutStability = detectionResult.result.value.layoutStability; - const highlightTraceStart = Date.now(); - const detectedViewportWidth = - typeof detectedViewport.width === 'number' - ? detectedViewport.width - : 0; - const detectedViewportHeight = - typeof detectedViewport.height === 'number' - ? detectedViewport.height - : 0; - if (layoutStability) { - console.log( - `⏳ [HighlightElements] Readiness snapshot: ${JSON.stringify(layoutStability)}`, - ); - } - - // Do not wait inside the page for "stability". Hidden/background tabs - // can throttle page timers hard enough that page-side polling becomes - // the dominant source of highlight timeouts. Instead, classify the - // current snapshot and do at most a couple of short background-side - // retries when the viewport still looks like a loading/skeleton state. - const pageState: HighlightPageState = - layoutStability?.state || 'ready'; - const readinessReasons = Array.isArray(layoutStability?.reasons) - ? layoutStability.reasons - : []; - - if (pageState === 'not_ready' && attempt < maxHighlightAttempts) { - const retryDelayMs = getHighlightReadinessRetryDelay(attempt); - console.warn( - `⚠️ [HighlightElements] Readiness state is not_ready (${readinessReasons.join(', ') || 'no reasons'}), retrying in ${retryDelayMs}ms (attempt ${attempt}/${maxHighlightAttempts})`, - ); - await new Promise((resolve) => setTimeout(resolve, retryDelayMs)); - continue; - } - - const keywordFilterStart = Date.now(); - const keywordFiltering = filterHighlightElementsByKeywords( - allElements, - keywords, - ); - const keywordList = keywordFiltering.keywords; - const filteredElements = keywordFiltering.elements; - - if (keywordList.length > 0) { - console.log( - `🔍 [HighlightElements] Keywords [${keywordList.join(', ')}] matched ${filteredElements.length} of ${allElements.length} elements`, - ); - } - console.log( - `⏱️ [HighlightTrace] background keyword-filter ${Date.now() - keywordFilterStart}ms (keywords=${keywordList.length}, kept=${filteredElements.length}/${allElements.length})`, - ); - - let paginatedElements: InteractiveElement[]; - let totalPages: number; - let currentPage = page; - - if (keywordList.length > 0) { - // Keyword mode: return all matching elements, no pagination. - // Assign temporary numeric IDs so the consistency check can - // correlate samples before the final display-order renumbering. - paginatedElements = assignSequentialElementIds(filteredElements); - totalPages = 1; - currentPage = 1; - console.log( - `🔍 [HighlightElements] Keywords [${keywordList.join(', ')}] matched ${paginatedElements.length} elements (no pagination)`, - ); - } else { - // Normal collision-aware pagination - const paginationSelectionStart = Date.now(); - paginatedElements = selectCollisionFreePage( - filteredElements, - page, - detectedViewportWidth, - detectedViewportHeight, - ); - const paginationSelectionMs = Date.now() - paginationSelectionStart; - const totalPagesStart = Date.now(); - totalPages = calculateTotalPages( - filteredElements, - detectedViewportWidth, - detectedViewportHeight, - ); - const totalPagesMs = Date.now() - totalPagesStart; - console.log( - `📄 [HighlightElements] Page ${page}/${totalPages}, showing ${paginatedElements.length} of ${filteredElements.length} elements`, - ); - console.log( - `⏱️ [HighlightTrace] background pagination select=${paginationSelectionMs}ms totalPages=${totalPagesMs}ms (page=${page}, viewport=${detectedViewportWidth}x${detectedViewportHeight})`, - ); - } - - // Capture screenshot - const screenshotStart = Date.now(); - const screenshotResult = await captureScreenshot( - activeTabId, - conversationId, - true, - 90, - false, - 0, - HIGHLIGHT_SCREENSHOT_CAPTURE_OPTIONS, - ); - - // Validate screenshot result - if (!screenshotResult?.success || !screenshotResult?.imageData) { - return { - success: false, - error: `Failed to capture screenshot: ${screenshotResult?.success === false ? 'Screenshot command failed' : 'No image data returned'}`, - timestamp: Date.now(), - }; - } - console.log( - `📸 [HighlightElements] Screenshot captured, size: ${screenshotResult.imageData.length} bytes`, - ); - console.log( - `⏱️ [HighlightTrace] background screenshot ${Date.now() - screenshotStart}ms`, - ); - - // Get device pixel ratio for coordinate scaling - const imageScale = - screenshotResult.metadata?.imageScale || - screenshotResult.metadata?.devicePixelRatio || - 1; - const viewportWidth = screenshotResult.metadata?.viewportWidth || 0; - const viewportHeight = screenshotResult.metadata?.viewportHeight || 0; - console.log(`📐 [HighlightElements] Image scale: ${imageScale}`); - console.log( - `📐 [HighlightElements] Viewport: ${viewportWidth}x${viewportHeight} CSS pixels`, - ); - console.log( - `📐 [HighlightElements] Expected image size: ${Math.round(viewportWidth * imageScale)}x${Math.round(viewportHeight * imageScale)} device pixels`, - ); - - const consistencyCheckStart = Date.now(); - const consistencyScript = - buildHighlightConsistencyScript(paginatedElements); - const consistencyResult = await javascript.executeJavaScript( - activeTabId, - conversationId, - consistencyScript, - true, - false, - 2000, - ); - const currentConsistencySamples = - consistencyResult.success && - consistencyResult.result?.value?.samples && - Array.isArray(consistencyResult.result.value.samples) - ? consistencyResult.result.value.samples - : []; - const highlightConsistency = evaluateHighlightConsistency( - paginatedElements - .slice(0, HIGHLIGHT_CONSISTENCY_CONFIG.maxSampleSize) - .map((element) => ({ - id: element.id, - bbox: element.bbox, - })), - currentConsistencySamples, - ); - console.log( - `⏱️ [HighlightTrace] background consistency-check ${Date.now() - consistencyCheckStart}ms (checked=${highlightConsistency.checkedCount}, matched=${highlightConsistency.matchedCount}, missing=${highlightConsistency.missingCount}, shifted=${highlightConsistency.shiftedCount}, maxCenterShift=${highlightConsistency.maxCenterShift}, maxSizeDelta=${highlightConsistency.maxSizeDelta}, retry=${highlightConsistency.shouldRetry})`, - ); - const repeatedDrift = isRepeatedHighlightDrift( - highlightConsistency, - previousConsistency, - ); - - if ( - highlightConsistency.shouldRetry && - attempt < maxHighlightAttempts && - !repeatedDrift - ) { - previousConsistency = highlightConsistency; - console.warn( - `⚠️ [HighlightElements] Layout drift detected after screenshot, retrying (attempt ${attempt}/${maxHighlightAttempts})`, - ); - continue; - } - - if (highlightConsistency.shouldRetry) { - console.warn( - repeatedDrift - ? `⚠️ [HighlightElements] Layout drift repeated with near-identical metrics, returning latest screenshot` - : `⚠️ [HighlightElements] Layout drift still detected on final attempt, returning latest screenshot`, - ); - } - - // Preserve the original highlight pipeline order for detection, - // pagination, and consistency checks. Only sort and renumber at the - // rendering boundary so the screenshot/response stay intuitive - // without changing the stability gate. - const storedPages = buildStoredHighlightPages({ - filteredElements, - totalPages, - viewportWidth: detectedViewportWidth, - viewportHeight: detectedViewportHeight, - keywordMode: keywordList.length > 0, - }); - const displayOrderedElements = storedPages[currentPage - 1] ?? []; - - const cacheStoreStart = Date.now(); - const storedSnapshot = elementCache.storeSnapshot({ - conversationId, + try { + const highlightedPage = await captureHighlightedPageState({ tabId: activeTabId, - documentId: detectedDocumentId, + conversationId, elementType, - keywords: keywordList, - totalElements: filteredElements.length, - pages: storedPages, - page: currentPage, + page, + keywords, + logLabel: 'HighlightElements', }); - console.log( - `⏱️ [HighlightTrace] background cache-store ${Date.now() - cacheStoreStart}ms (snapshot=${storedSnapshot.snapshotId}, count=${displayOrderedElements.length})`, - ); - - // Log first few element bboxes for debugging - if (displayOrderedElements.length > 0) { - console.log( - `📍 [HighlightElements] First element bbox:`, - JSON.stringify(displayOrderedElements[0].bbox), - ); - } - - // Draw highlights on screenshot (scale coordinates by DPR) - const drawHighlightsStart = Date.now(); - const highlightedScreenshot = await drawHighlights( - screenshotResult.imageData, - storedSnapshot.elements, - { - scale: imageScale, - viewportWidth, - viewportHeight, - }, - ); - console.log( - `⏱️ [HighlightTrace] background draw-highlights ${Date.now() - drawHighlightsStart}ms (elements=${storedSnapshot.elements.length})`, - ); - - const compressStart = Date.now(); - const compressedScreenshot = await compressIfNeeded( - highlightedScreenshot, - getCompressionThreshold(), - ); - console.log( - `⏱️ [HighlightTrace] background compress ${Date.now() - compressStart}ms`, - ); - console.log( - `⏱️ [HighlightTrace] background total ${Date.now() - highlightTraceStart}ms`, - ); return { success: true, - data: { - highlight_snapshot_id: storedSnapshot.snapshotId, - elements: storedSnapshot.elements, - totalElements: filteredElements.length, - totalPages: totalPages, - page: currentPage, - pageState, - readinessReasons, - screenshot: compressedScreenshot, - ...(screenshotResult?.dialog_auto_accepted - ? { - dialog_auto_accepted: screenshotResult.dialog_auto_accepted, - } - : {}), - ...(screenshotResult?.dialog_auto_accepted_list - ? { - dialog_auto_accepted_list: - screenshotResult.dialog_auto_accepted_list, - } - : {}), - }, + data: highlightedPage, + timestamp: Date.now(), + }; + } catch (error) { + return { + success: false, + error: error instanceof Error ? error.message : String(error), timestamp: Date.now(), }; } - - return { - success: false, - error: 'Failed to produce a stable highlight screenshot', - timestamp: Date.now(), - }; } case 'click_element': { @@ -1883,7 +1898,6 @@ async function handleCommand(command: Command): Promise { const clickResult = await performElementClick( command.conversation_id, - command.highlight_snapshot_id, command.element_id, clickTabId, ); @@ -1910,35 +1924,17 @@ async function handleCommand(command: Command): Promise { ); } - const clickScreenshotResult = await captureScreenshot( - screenshotTabId, - command.conversation_id, - true, - 90, - false, - 0, - ); - const compressedClickScreenshotResult = await compressScreenshotResult( - clickScreenshotResult, - ); + const clickPageState = await captureDefaultHighlightedPageState({ + tabId: screenshotTabId, + conversationId: command.conversation_id, + logLabel: 'ClickElement', + }); return { success: clickResult.success, data: { ...clickResult, - screenshot: compressedClickScreenshotResult?.imageData, - ...(compressedClickScreenshotResult?.dialog_auto_accepted - ? { - dialog_auto_accepted: - compressedClickScreenshotResult.dialog_auto_accepted, - } - : {}), - ...(compressedClickScreenshotResult?.dialog_auto_accepted_list - ? { - dialog_auto_accepted_list: - compressedClickScreenshotResult.dialog_auto_accepted_list, - } - : {}), + ...clickPageState, }, error: clickResult.error, timestamp: Date.now(), @@ -1954,39 +1950,20 @@ async function handleCommand(command: Command): Promise { const hoverResult = await performElementHover( command.conversation_id, - command.highlight_snapshot_id, command.element_id, hoverTabId, ); - const hoverScreenshotResult = await captureScreenshot( - hoverTabId, - command.conversation_id, - true, - 90, - false, - 0, - ); - const compressedHoverScreenshotResult = await compressScreenshotResult( - hoverScreenshotResult, - ); + const hoverPageState = await captureDefaultHighlightedPageState({ + tabId: hoverTabId, + conversationId: command.conversation_id, + logLabel: 'HoverElement', + }); return { success: hoverResult.success, data: { ...hoverResult, - screenshot: compressedHoverScreenshotResult?.imageData, - ...(compressedHoverScreenshotResult?.dialog_auto_accepted - ? { - dialog_auto_accepted: - compressedHoverScreenshotResult.dialog_auto_accepted, - } - : {}), - ...(compressedHoverScreenshotResult?.dialog_auto_accepted_list - ? { - dialog_auto_accepted_list: - compressedHoverScreenshotResult.dialog_auto_accepted_list, - } - : {}), + ...hoverPageState, }, error: hoverResult.error, timestamp: Date.now(), @@ -2003,41 +1980,22 @@ async function handleCommand(command: Command): Promise { // element_id is optional - if not provided, scrolls the entire page const scrollResult = await performElementScroll( command.conversation_id, - command.highlight_snapshot_id, command.element_id, command.direction || 'down', scrollTabId, command.scroll_amount || 0.5, ); - const scrollScreenshotResult = await captureScreenshot( - scrollTabId, - command.conversation_id, - true, - 90, - false, - 0, - ); - const compressedScrollScreenshotResult = await compressScreenshotResult( - scrollScreenshotResult, - ); + const scrollPageState = await captureDefaultHighlightedPageState({ + tabId: scrollTabId, + conversationId: command.conversation_id, + logLabel: 'ScrollElement', + }); return { success: scrollResult.success, data: { ...scrollResult, - screenshot: compressedScrollScreenshotResult?.imageData, - ...(compressedScrollScreenshotResult?.dialog_auto_accepted - ? { - dialog_auto_accepted: - compressedScrollScreenshotResult.dialog_auto_accepted, - } - : {}), - ...(compressedScrollScreenshotResult?.dialog_auto_accepted_list - ? { - dialog_auto_accepted_list: - compressedScrollScreenshotResult.dialog_auto_accepted_list, - } - : {}), + ...scrollPageState, }, error: scrollResult.error, timestamp: Date.now(), @@ -2053,42 +2011,24 @@ async function handleCommand(command: Command): Promise { const swipeResult = await performElementSwipe( command.conversation_id, - command.highlight_snapshot_id, command.element_id, command.direction || 'next', swipeTabId, command.swipe_count || 1, ); - const swipeScreenshotResult = await captureScreenshot( - swipeTabId, - command.conversation_id, - true, - 90, - false, - 900, - TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS, - ); - const compressedSwipeScreenshotResult = await compressScreenshotResult( - swipeScreenshotResult, - ); + const swipePageState = await captureDefaultHighlightedPageState({ + tabId: swipeTabId, + conversationId: command.conversation_id, + logLabel: 'SwipeElement', + preconditionWaitForRender: 900, + preconditionCaptureOptions: TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS, + }); return { success: swipeResult.success, data: { ...swipeResult, - screenshot: compressedSwipeScreenshotResult?.imageData, - ...(compressedSwipeScreenshotResult?.dialog_auto_accepted - ? { - dialog_auto_accepted: - compressedSwipeScreenshotResult.dialog_auto_accepted, - } - : {}), - ...(compressedSwipeScreenshotResult?.dialog_auto_accepted_list - ? { - dialog_auto_accepted_list: - compressedSwipeScreenshotResult.dialog_auto_accepted_list, - } - : {}), + ...swipePageState, }, error: swipeResult.error, timestamp: Date.now(), @@ -2104,40 +2044,21 @@ async function handleCommand(command: Command): Promise { const inputResult = await performKeyboardInput( command.conversation_id, - command.highlight_snapshot_id, command.element_id, command.text, inputTabId, ); - const inputScreenshotResult = await captureScreenshot( - inputTabId, - command.conversation_id, - true, - 90, - false, - 0, - ); - const compressedInputScreenshotResult = await compressScreenshotResult( - inputScreenshotResult, - ); + const inputPageState = await captureDefaultHighlightedPageState({ + tabId: inputTabId, + conversationId: command.conversation_id, + logLabel: 'KeyboardInput', + }); return { success: inputResult.success, data: { ...inputResult, - screenshot: compressedInputScreenshotResult?.imageData, - ...(compressedInputScreenshotResult?.dialog_auto_accepted - ? { - dialog_auto_accepted: - compressedInputScreenshotResult.dialog_auto_accepted, - } - : {}), - ...(compressedInputScreenshotResult?.dialog_auto_accepted_list - ? { - dialog_auto_accepted_list: - compressedInputScreenshotResult.dialog_auto_accepted_list, - } - : {}), + ...inputPageState, }, error: inputResult.error, timestamp: Date.now(), @@ -2153,40 +2074,21 @@ async function handleCommand(command: Command): Promise { const selectResult = await performElementSelect( command.conversation_id, - command.highlight_snapshot_id, command.element_id, selectTabId, command.value, ); - const selectScreenshotResult = await captureScreenshot( - selectTabId, - command.conversation_id, - true, - 90, - false, - 0, - ); - const compressedSelectScreenshotResult = await compressScreenshotResult( - selectScreenshotResult, - ); + const selectPageState = await captureDefaultHighlightedPageState({ + tabId: selectTabId, + conversationId: command.conversation_id, + logLabel: 'SelectElement', + }); return { success: selectResult.success, data: { ...selectResult, - screenshot: compressedSelectScreenshotResult?.imageData, - ...(compressedSelectScreenshotResult?.dialog_auto_accepted - ? { - dialog_auto_accepted: - compressedSelectScreenshotResult.dialog_auto_accepted, - } - : {}), - ...(compressedSelectScreenshotResult?.dialog_auto_accepted_list - ? { - dialog_auto_accepted_list: - compressedSelectScreenshotResult.dialog_auto_accepted_list, - } - : {}), + ...selectPageState, }, error: selectResult.error, timestamp: Date.now(), @@ -2198,16 +2100,10 @@ async function handleCommand(command: Command): Promise { throw new Error('conversation_id required for get_element_html'); const conversationId = command.conversation_id; const elementId = command.element_id; - const highlightSnapshotId = command.highlight_snapshot_id; if (!elementId) { throw new Error('element_id is required for get_element_html'); } - if (highlightSnapshotId === undefined || highlightSnapshotId === null) { - throw new Error( - 'highlight_snapshot_id is required for get_element_html', - ); - } // Get current active tab for this conversation const activeTabId = tabManager.getCurrentActiveTabId(conversationId); @@ -2221,20 +2117,24 @@ async function handleCommand(command: Command): Promise { const element = elementCache.getElementById( conversationId, activeTabId, - highlightSnapshotId, elementId, ); if (!element) { console.warn( - `⚠️ [GetElementHtml] Element ${elementId} not found in cache for conversation ${conversationId}, tab ${activeTabId}, snapshot ${highlightSnapshotId}`, + `⚠️ [GetElementHtml] Element ${elementId} not found in cache for conversation ${conversationId}, tab ${activeTabId}`, ); return { success: false, - error: `Element ${elementId} not found in cache for highlight snapshot ${highlightSnapshotId}. The snapshot may have expired or the page may have changed. Try highlight_elements again.`, + error: buildElementCacheMissMessage({ + conversationId, + tabId: activeTabId, + elementId, + refreshHint: + 'The highlight cache may have expired or the page may have changed. Try highlight_elements again.', + }), data: { element_id: elementId, - highlight_snapshot_id: highlightSnapshotId, html: null, }, timestamp: Date.now(), @@ -2249,10 +2149,16 @@ async function handleCommand(command: Command): Promise { return { success: true, - message: `Retrieved HTML for element ${elementId}`, + message: + element.elementIdCorrected && + element.resolvedElementId !== elementId + ? `Retrieved HTML for element ${element.resolvedElementId} (matched from requested ${elementId})` + : `Retrieved HTML for element ${element.resolvedElementId}`, data: { - element_id: elementId, - highlight_snapshot_id: highlightSnapshotId, + element_id: element.resolvedElementId, + requested_element_id: elementId, + resolved_element_id: element.resolvedElementId, + element_id_corrected: element.elementIdCorrected, html: html, tagName: element.element.tagName, type: element.element.type, @@ -2269,27 +2175,24 @@ async function handleCommand(command: Command): Promise { } const conversationId = command.conversation_id; const activeTabId = tabManager.getCurrentActiveTabId(conversationId); - const highlightSnapshotId = command.highlight_snapshot_id; if (!activeTabId) { throw new Error(`No active tab for conversation ${conversationId}`); } - if (highlightSnapshotId === undefined || highlightSnapshotId === null) { - throw new Error( - 'highlight_snapshot_id is required for highlight_single_element command', - ); - } // Get element from cache const element = elementCache.getElementById( conversationId, activeTabId, - highlightSnapshotId, command.element_id, ); if (!element) { return { success: false, - error: `Element ${command.element_id} not found in cache for highlight snapshot ${highlightSnapshotId}. Call highlight_elements() again.`, + error: buildElementCacheMissMessage({ + conversationId, + tabId: activeTabId, + elementId: command.element_id, + }), timestamp: Date.now(), }; } @@ -2424,7 +2327,7 @@ async function handleCommand(command: Command): Promise { ok: false, stale: true, error: - "Highlight snapshot ${highlightSnapshotId} is stale because the document changed. Call highlight_elements() again." + "The highlighted element is stale because the document changed. Call highlight_elements() again." }; } if (!el) { @@ -2432,7 +2335,7 @@ async function handleCommand(command: Command): Promise { ok: false, stale: true, error: - "Element not found in DOM for this highlight snapshot. Call highlight_elements() again." + "Element not found in DOM for the cached highlight result. Call highlight_elements() again." }; } const currentFingerprint = getElementFingerprint(el); @@ -2441,7 +2344,7 @@ async function handleCommand(command: Command): Promise { ok: false, stale: true, error: - "Highlight snapshot is stale because the target element identity changed. Call highlight_elements() again." + "The cached highlight result is stale because the target element identity changed. Call highlight_elements() again." }; } const rect = el.getBoundingClientRect(); @@ -2488,7 +2391,7 @@ async function handleCommand(command: Command): Promise { success: false, error: bboxResult.result.value.error || - `Element ${command.element_id} is stale for highlight snapshot ${highlightSnapshotId}. Call highlight_elements() again.`, + `Element ${command.element_id} is stale. Call highlight_elements() again.`, timestamp: Date.now(), }; } else { @@ -2554,7 +2457,6 @@ async function handleCommand(command: Command): Promise { `Element ${element.element.id} is not visible in the current viewport. ${scrollHint}`.trim(), data: { elementId: element.element.id, - highlight_snapshot_id: highlightSnapshotId, bbox: freshBbox, viewportWidth, viewportHeight, @@ -2574,6 +2476,7 @@ async function handleCommand(command: Command): Promise { screenshotResult.imageData, elementWithFreshBbox, { + intendedAction: command.intended_action, scale: screenshotResult.metadata?.imageScale || screenshotResult.metadata?.devicePixelRatio || @@ -2591,8 +2494,10 @@ async function handleCommand(command: Command): Promise { highlightedScreenshot, getCompressionThreshold(), ), - elementId: command.element_id, - highlight_snapshot_id: highlightSnapshotId, + elementId: element.resolvedElementId, + requestedElementId: command.element_id, + resolvedElementId: element.resolvedElementId, + elementIdCorrected: element.elementIdCorrected, ...(screenshotResult?.dialog_auto_accepted ? { dialog_auto_accepted: screenshotResult.dialog_auto_accepted } : {}), diff --git a/extension/src/commands/__tests__/element-id.test.ts b/extension/src/commands/__tests__/element-id.test.ts index 12ad0cb..edcaeed 100644 --- a/extension/src/commands/__tests__/element-id.test.ts +++ b/extension/src/commands/__tests__/element-id.test.ts @@ -1,8 +1,21 @@ import { describe, expect, test } from 'bun:test'; import type { InteractiveElement } from '../../types'; -import { elementCache } from '../element-cache'; -import { assignSequentialElementIds } from '../element-id'; +import { buildElementCacheMissMessage, elementCache } from '../element-cache'; +import { + ELEMENT_ID_CHARSET, + ELEMENT_ID_LENGTH, + assignHashedElementIds, + generateShortHash, + normalizeVisualElementIdInput, +} from '../element-id'; + +function usesAllowedElementIdChars(id: string): boolean { + return ( + id.length === ELEMENT_ID_LENGTH && + [...id].every((char) => ELEMENT_ID_CHARSET.includes(char)) + ); +} function createElement(id: string, selector: string): InteractiveElement { return { @@ -10,6 +23,7 @@ function createElement(id: string, selector: string): InteractiveElement { type: 'clickable', tagName: 'button', selector, + html: ``, bbox: { x: 0, y: 0, width: 10, height: 10 }, isVisible: true, isInViewport: true, @@ -17,59 +31,175 @@ function createElement(id: string, selector: string): InteractiveElement { } describe('element-id', () => { - test('assigns page-local numeric ids in order', () => { - const result = assignSequentialElementIds([ - createElement('old-a', '#a'), - createElement('old-b', '#b'), - createElement('old-c', '#c'), + test('generates fixed-length visual-safe hashes', () => { + const hash = generateShortHash('#checkout', ''); + + expect(hash).toHaveLength(ELEMENT_ID_LENGTH); + expect(usesAllowedElementIdChars(hash)).toBe(true); + }); + + test('assigns stable hash ids based on selector and html', () => { + const result = assignHashedElementIds([ + createElement('old-a', '#checkout'), + createElement('old-b', '#email'), + createElement('old-c', '#submit'), ]); - expect(result.map((element) => element.id)).toEqual(['1', '2', '3']); - expect(result.map((element) => element.selector)).toEqual([ - '#a', - '#b', - '#c', + expect( + result.every((element) => usesAllowedElementIdChars(element.id)), + ).toBe(true); + expect(new Set(result.map((element) => element.id)).size).toBe( + result.length, + ); + + const secondPass = assignHashedElementIds([ + createElement('different-a', '#checkout'), + createElement('different-b', '#email'), + createElement('different-c', '#submit'), ]); + + expect(secondPass.map((element) => element.id)).toEqual( + result.map((element) => element.id), + ); + }); + + test('keeps ids stable regardless of input array order', () => { + const original = [ + createElement('first', '#alpha'), + createElement('second', '#beta'), + createElement('third', '#gamma'), + ]; + const reversed = [...original].reverse(); + + const originalAssigned = assignHashedElementIds(original); + const reversedAssigned = assignHashedElementIds(reversed); + + expect( + Object.fromEntries( + originalAssigned.map((element) => [element.selector, element.id]), + ), + ).toEqual( + Object.fromEntries( + reversedAssigned.map((element) => [element.selector, element.id]), + ), + ); }); test('does not mutate the caller-owned element objects', () => { const original = [createElement('keep-me', '#a')]; - const result = assignSequentialElementIds(original); + const result = assignHashedElementIds(original); expect(original[0].id).toBe('keep-me'); - expect(result[0].id).toBe('1'); + expect(usesAllowedElementIdChars(result[0].id)).toBe(true); expect(result[0]).not.toBe(original[0]); }); + + test('normalizes visually ambiguous 3-character element IDs only', () => { + expect(normalizeVisualElementIdInput('D02')).toBe('DO2'); + expect(normalizeVisualElementIdInput(' d o 2 ')).toBe('DO2'); + expect(normalizeVisualElementIdInput('id-10')).toBe('id-10'); + }); }); -describe('element-cache highlight snapshots', () => { - test('stores a page-scoped snapshot and resolves element IDs within that snapshot', () => { +describe('element-cache document cache', () => { + test('stores highlight pages and resolves element IDs from the current document cache', () => { elementCache.clearAll(); + const assignedPages = [ + assignHashedElementIds([createElement('', '#page-1')]), + assignHashedElementIds([createElement('', '#page-2')]), + ]; - const snapshot = elementCache.storeSnapshot({ + const storedPage = elementCache.storeHighlightResult({ conversationId: 'conv-1', tabId: 101, documentId: 'doc-1', elementType: 'any', totalElements: 2, - pages: [[createElement('1', '#page-1')], [createElement('1', '#page-2')]], + totalPages: 2, + pages: assignedPages, page: 1, }); - expect(snapshot.snapshotId).toBe(1); - expect(snapshot.page).toBe(1); - expect(snapshot.elements.map((element) => element.selector)).toEqual([ + expect(storedPage.documentId).toBe('doc-1'); + expect(storedPage.page).toBe(1); + expect(storedPage.elements.map((element) => element.selector)).toEqual([ '#page-1', ]); + expect(storedPage.elements[0]?.id).toBe(assignedPages[0]?.[0]?.id); + const storedElementId = storedPage.elements[0]?.id; + expect(storedElementId).toBeDefined(); - const lookup = elementCache.getElementById( - 'conv-1', - 101, - snapshot.snapshotId, - '1', - ); + const lookup = elementCache.getElementById('conv-1', 101, storedElementId!); expect(lookup?.element.selector).toBe('#page-1'); expect(lookup?.documentId).toBe('doc-1'); }); + + test('resolves visually ambiguous requested IDs to the cached visual-safe ID', () => { + elementCache.clearAll(); + const page = [createElement('DO2', '#page-corrected')]; + + elementCache.storeHighlightResult({ + conversationId: 'conv-visual-safe', + tabId: 202, + documentId: 'doc-visual-safe', + elementType: 'any', + totalElements: 1, + totalPages: 1, + pages: [page], + page: 1, + }); + + const lookup = elementCache.getElementById('conv-visual-safe', 202, 'D02'); + + expect(lookup?.requestedElementId).toBe('D02'); + expect(lookup?.resolvedElementId).toBe('DO2'); + expect(lookup?.elementIdCorrected).toBe(true); + expect(lookup?.element.selector).toBe('#page-corrected'); + }); + + test('suggests close visual-safe ids when the requested id is missing', () => { + elementCache.clearAll(); + const page = [ + createElement('MQK', '#heart-button'), + createElement('M9H', '#share-button'), + createElement('AB1', '#other-button'), + ]; + + elementCache.storeHighlightResult({ + conversationId: 'conv-suggestions', + tabId: 303, + documentId: 'doc-suggestions', + elementType: 'any', + totalElements: page.length, + totalPages: 1, + pages: [page], + page: 1, + }); + + const suggestions = elementCache.getElementIdSuggestions( + 'conv-suggestions', + 303, + 'MQH', + ); + + expect(suggestions.map((suggestion) => suggestion.elementId)).toEqual([ + 'MQK', + 'M9H', + ]); + + const missMessage = buildElementCacheMissMessage({ + conversationId: 'conv-suggestions', + tabId: 303, + elementId: 'MQH', + }); + + expect(missMessage).toContain("Maybe try 'MQK', 'M9H'"); + expect(missMessage).toContain( + 'MQK: ', + ); + expect(missMessage).toContain( + 'M9H: ', + ); + }); }); diff --git a/extension/src/commands/__tests__/single-highlight.test.ts b/extension/src/commands/__tests__/single-highlight.test.ts index 5e9aa11..2439aef 100644 --- a/extension/src/commands/__tests__/single-highlight.test.ts +++ b/extension/src/commands/__tests__/single-highlight.test.ts @@ -1,7 +1,11 @@ import { describe, expect, test } from 'bun:test'; import type { InteractiveElement } from '../../types'; -import { calculateConfirmationPreviewLayout } from '../single-highlight'; +import { + calculateConfirmationBannerLayout, + calculateConfirmationPreviewLayout, + getConfirmationPromptText, +} from '../single-highlight'; function createElement(bbox: InteractiveElement['bbox']): InteractiveElement { return { @@ -16,6 +20,40 @@ function createElement(bbox: InteractiveElement['bbox']): InteractiveElement { } describe('single-highlight confirmation preview', () => { + test('formats confirmation reminder text for click and keyboard input', () => { + expect(getConfirmationPromptText('click')).toBe( + 'Is this the element you wanted to click?', + ); + expect(getConfirmationPromptText('keyboard_input')).toBe( + 'Is this the element you wanted to type into?', + ); + }); + + test('places the confirmation reminder above the highlight when space is available', () => { + const banner = calculateConfirmationBannerLayout({ + canvasWidth: 720, + canvasHeight: 420, + elementRect: { x: 220, y: 180, width: 120, height: 40 }, + message: getConfirmationPromptText('click'), + scale: 1, + }); + + expect(banner.y + banner.height).toBeLessThanOrEqual(180 - 8); + expect(banner.x).toBeGreaterThanOrEqual(10); + }); + + test('falls back below the highlight when there is no room above', () => { + const banner = calculateConfirmationBannerLayout({ + canvasWidth: 720, + canvasHeight: 420, + elementRect: { x: 220, y: 18, width: 120, height: 40 }, + message: getConfirmationPromptText('click'), + scale: 1, + }); + + expect(banner.y).toBeGreaterThanOrEqual(18 + 40 + 8); + }); + test('uses a bounded close-up crop around the selected element', () => { const layout = calculateConfirmationPreviewLayout( 1280, diff --git a/extension/src/commands/element-actions.ts b/extension/src/commands/element-actions.ts index fae90a0..002d3b8 100644 --- a/extension/src/commands/element-actions.ts +++ b/extension/src/commands/element-actions.ts @@ -11,7 +11,7 @@ import type { ElementActionResult } from '../types'; * - Handles dialog events using the same pattern as javascript.ts */ -import { ELEMENT_CACHE_TTL_DESCRIPTION, elementCache } from './element-cache'; +import { buildElementCacheMissMessage, elementCache } from './element-cache'; import { executeJavaScript, type JavaScriptResult } from './javascript'; import { buildHitTestVisibilityHelpersScript } from '../utils/hit-test-visibility'; @@ -19,14 +19,25 @@ function escapeForDoubleQuotedJavaScriptString(value: string): string { return value.replace(/\\/g, '\\\\').replace(/"/g, '\\"'); } -function buildElementCacheMissMessage( - elementId: string, - refreshHint: string = 'Call highlight_elements() again to get a fresh highlight_snapshot_id.', -): string { - return `Element '${elementId}' not found in cache for the referenced highlight snapshot. Highlight snapshots expire after ${ELEMENT_CACHE_TTL_DESCRIPTION}. ${refreshHint}`; +function buildResolvedElementResultFields( + requestedElementId: string, + resolvedElementId: string, +): Pick< + ElementActionResult, + | 'elementId' + | 'requestedElementId' + | 'resolvedElementId' + | 'elementIdCorrected' +> { + return { + elementId: resolvedElementId, + requestedElementId, + resolvedElementId, + elementIdCorrected: requestedElementId !== resolvedElementId, + }; } -function buildSnapshotIdentityHelpersScript(): string { +function buildCachedElementIdentityHelpersScript(): string { return ` function normalizeIdentityWhitespace(value, maxLength = 240) { const normalized = String(value ?? '') @@ -145,14 +156,14 @@ function buildSnapshotIdentityHelpersScript(): string { return overlap >= Math.max(2, Math.min(4, Math.ceil(expectedTokens.length * 0.5))); } - function validateSnapshotElement(expectedHighlightSnapshotId, expectedDocumentId, expectedFingerprint, el) { + function validateCachedElement(expectedDocumentId, expectedFingerprint, el) { const currentDocumentId = getCurrentDocumentId(); if (expectedDocumentId && currentDocumentId !== expectedDocumentId) { return { ok: false, stale: true, error: - \`Highlight snapshot \${expectedHighlightSnapshotId} is stale because the document changed. Call highlight_elements() again.\`, + 'The cached element is stale because the document changed. Call highlight_elements() again.', }; } @@ -162,7 +173,7 @@ function buildSnapshotIdentityHelpersScript(): string { ok: false, stale: true, error: - 'Highlight snapshot is stale because the target element no longer matches the cached identity. Call highlight_elements() again.', + 'The cached element is stale because the target no longer matches the cached identity. Call highlight_elements() again.', }; } @@ -177,7 +188,7 @@ function buildSnapshotIdentityHelpersScript(): string { function buildEditableActivationHelpersScript(): string { return ` - ${buildSnapshotIdentityHelpersScript()} + ${buildCachedElementIdentityHelpersScript()} ${buildHitTestVisibilityHelpersScript()} function getInteractiveActivationTarget(target) { @@ -386,21 +397,19 @@ export interface SelectResult extends ElementActionResult { * 4. Return result with dialog info if applicable * * @param conversationId Session ID for element cache lookup - * @param highlightSnapshotId Highlight snapshot ID returned by highlight_elements - * @param elementId Cached element ID from the referenced highlight snapshot (for example, "1") + * @param elementId Cached element ID from the latest highlight cache (for example, "A1H") * @param tabId Target tab ID * @param timeout Maximum execution time in milliseconds (default: 30000) * @returns Click result with success status and dialog info */ export async function performElementClick( conversationId: string, - highlightSnapshotId: number, elementId: string, tabId: number, timeout: number = 30000, ): Promise { console.log( - `👆 [ElementClick] Clicking element ${elementId} from snapshot ${highlightSnapshotId} in conversation ${conversationId} on tab ${tabId}`, + `👆 [ElementClick] Clicking element ${elementId} in conversation ${conversationId} on tab ${tabId}`, ); // ============================================================ @@ -409,23 +418,27 @@ export async function performElementClick( const cachedElement = elementCache.getElementById( conversationId, tabId, - highlightSnapshotId, elementId, ); if (!cachedElement) { console.log(`❌ [ElementClick] Element ${elementId} not found in cache`); return { success: false, - elementId, + ...buildResolvedElementResultFields(elementId, elementId), clicked: false, staleElement: false, - error: buildElementCacheMissMessage( + error: buildElementCacheMissMessage({ + conversationId, + tabId, elementId, - 'Call highlight_elements() again to get a fresh highlight_snapshot_id and element IDs.', - ), + }), }; } const element = cachedElement.element; + const resolvedElementFields = buildResolvedElementResultFields( + cachedElement.requestedElementId, + cachedElement.resolvedElementId, + ); console.log( `✅ [ElementClick] Found element: selector="${element.selector}"`, @@ -448,7 +461,6 @@ export async function performElementClick( const script = ` (async function() { const selector = "${escapedSelector}"; - const expectedHighlightSnapshotId = ${highlightSnapshotId}; const expectedDocumentId = "${escapedDocumentId}"; const expectedFingerprint = "${escapedFingerprint}"; ${buildEditableActivationHelpersScript()} @@ -458,8 +470,7 @@ export async function performElementClick( return { clicked: false, error: "Element not found in DOM", stale: true }; } - const snapshotValidation = validateSnapshotElement( - expectedHighlightSnapshotId, + const snapshotValidation = validateCachedElement( expectedDocumentId, expectedFingerprint, el, @@ -548,7 +559,7 @@ export async function performElementClick( console.error(`❌ [ElementClick] JavaScript execution error:`, error); return { success: false, - elementId, + ...resolvedElementFields, clicked: false, staleElement: false, error: error instanceof Error ? error.message : String(error), @@ -567,7 +578,7 @@ export async function performElementClick( console.log(`❌ [ElementClick] Click execution failed: ${jsResult.error}`); return { success: false, - elementId, + ...resolvedElementFields, clicked: false, staleElement: false, error: jsResult.error || 'Click JavaScript execution failed', @@ -588,7 +599,7 @@ export async function performElementClick( ); const result: ClickResult = { success: true, - elementId, + ...resolvedElementFields, clicked: true, new_tabs_created: jsResult.new_tabs_created, }; @@ -623,7 +634,7 @@ export async function performElementClick( ); return { success: false, - elementId, + ...resolvedElementFields, clicked: false, staleElement: false, error: invalidResultError, @@ -638,7 +649,7 @@ export async function performElementClick( return { success: false, - elementId, + ...resolvedElementFields, clicked: false, staleElement: isStale, error: clickResult?.error, @@ -650,7 +661,7 @@ export async function performElementClick( // If dialog opened during click, propagate dialog info const result: ClickResult = { success: true, - elementId, + ...resolvedElementFields, clicked: true, new_tabs_created: jsResult.new_tabs_created, }; @@ -683,21 +694,19 @@ export async function performElementClick( * 4. Return result * * @param conversationId Session ID for element cache lookup - * @param highlightSnapshotId Highlight snapshot ID returned by highlight_elements - * @param elementId Cached element ID from the referenced highlight snapshot (for example, "1") + * @param elementId Cached element ID from the latest highlight cache (for example, "A1H") * @param tabId Target tab ID * @param timeout Maximum execution time in milliseconds (default: 30000) * @returns Hover result with success status */ export async function performElementHover( conversationId: string, - highlightSnapshotId: number, elementId: string, tabId: number, timeout: number = 30000, ): Promise { console.log( - `🖱️ [ElementHover] Hovering element ${elementId} from snapshot ${highlightSnapshotId} in conversation ${conversationId} on tab ${tabId}`, + `🖱️ [ElementHover] Hovering element ${elementId} in conversation ${conversationId} on tab ${tabId}`, ); // ============================================================ @@ -706,20 +715,27 @@ export async function performElementHover( const cachedElement = elementCache.getElementById( conversationId, tabId, - highlightSnapshotId, elementId, ); if (!cachedElement) { console.log(`❌ [ElementHover] Element ${elementId} not found in cache`); return { success: false, - elementId, + ...buildResolvedElementResultFields(elementId, elementId), hovered: false, staleElement: false, - error: buildElementCacheMissMessage(elementId), + error: buildElementCacheMissMessage({ + conversationId, + tabId, + elementId, + }), }; } const element = cachedElement.element; + const resolvedElementFields = buildResolvedElementResultFields( + cachedElement.requestedElementId, + cachedElement.resolvedElementId, + ); console.log( `✅ [ElementHover] Found element: selector="${element.selector}"`, @@ -741,18 +757,16 @@ export async function performElementHover( const script = ` (function() { const selector = "${escapedSelector}"; - const expectedHighlightSnapshotId = ${highlightSnapshotId}; const expectedDocumentId = "${escapedDocumentId}"; const expectedFingerprint = "${escapedFingerprint}"; - ${buildSnapshotIdentityHelpersScript()} + ${buildCachedElementIdentityHelpersScript()} const el = document.querySelector(selector); if (!el) { return { hovered: false, error: "Element not found in DOM", stale: true }; } - const snapshotValidation = validateSnapshotElement( - expectedHighlightSnapshotId, + const snapshotValidation = validateCachedElement( expectedDocumentId, expectedFingerprint, el, @@ -823,7 +837,7 @@ export async function performElementHover( console.error(`❌ [ElementHover] JavaScript execution error:`, error); return { success: false, - elementId, + ...resolvedElementFields, hovered: false, staleElement: false, }; @@ -836,7 +850,7 @@ export async function performElementHover( console.log(`❌ [ElementHover] Hover execution failed: ${jsResult.error}`); return { success: false, - elementId, + ...resolvedElementFields, hovered: false, staleElement: false, }; @@ -856,7 +870,7 @@ export async function performElementHover( ); const result: HoverResult = { success: true, - elementId, + ...resolvedElementFields, hovered: true, new_tabs_created: jsResult.new_tabs_created, }; @@ -887,7 +901,7 @@ export async function performElementHover( return { success: false, - elementId, + ...resolvedElementFields, hovered: false, staleElement: isStale, }; @@ -898,7 +912,7 @@ export async function performElementHover( // If dialog opened during hover, propagate dialog info const result: HoverResult = { success: true, - elementId, + ...resolvedElementFields, hovered: true, }; @@ -960,8 +974,7 @@ export interface SwipeResult extends ElementActionResult { * 3. Execute and return result * * @param conversationId Session ID for element cache lookup - * @param highlightSnapshotId Highlight snapshot ID returned by highlight_elements. Required when elementId is provided - * @param elementId Cached element ID from the referenced highlight snapshot. Optional - if not provided, scrolls the entire page + * @param elementId Cached element ID from the latest highlight cache. Optional - if not provided, scrolls the entire page * @param direction Swipe direction ('next' or 'prev') * @param tabId Target tab ID * @param timeout Maximum execution time in milliseconds (default: 30000) @@ -969,7 +982,6 @@ export interface SwipeResult extends ElementActionResult { */ export async function performElementScroll( conversationId: string, - highlightSnapshotId: number | undefined, elementId: string | undefined, direction: ScrollDirection, tabId: number, @@ -977,7 +989,7 @@ export async function performElementScroll( timeout: number = 30000, ): Promise { console.log( - `📜 [ElementScroll] Scrolling ${elementId ? `element ${elementId}` : 'entire page'} ${direction} (amount: ${scrollAmount}x viewport) in conversation ${conversationId} on tab ${tabId}`, + `📜 [ElementScroll] Scrolling ${elementId ? `element ${elementId}` : 'entire page'} ${direction} (amount factor: ${scrollAmount}x current scroll target) in conversation ${conversationId} on tab ${tabId}`, ); // ============================================================ @@ -985,8 +997,8 @@ export async function performElementScroll( // ============================================================ // Calculate scroll multipliers based on direction - // scrollAmount is relative to viewport height (0.5 = half page, 1.0 = full page) - // For horizontal scroll, we use viewport width + // scrollAmount is relative to the current scroll target's visible size. + // For vertical scroll we use clientHeight; for horizontal scroll we use clientWidth. const scrollMultipliers: Record = { up: { x: 0, y: -scrollAmount }, down: { x: 0, y: scrollAmount }, @@ -997,35 +1009,35 @@ export async function performElementScroll( const { x: xMultiplier, y: yMultiplier } = scrollMultipliers[direction]; let script: string; + let resolvedElementFields: + | ReturnType + | undefined; if (elementId) { // Scroll a specific element - if (highlightSnapshotId === undefined || highlightSnapshotId === null) { - return { - success: false, - elementId, - scrolled: false, - error: - 'highlight_snapshot_id is required when scrolling a highlighted element.', - }; - } - const cachedElement = elementCache.getElementById( conversationId, tabId, - highlightSnapshotId, elementId, ); if (!cachedElement) { console.log(`❌ [ElementScroll] Element ${elementId} not found in cache`); return { success: false, - elementId, + ...buildResolvedElementResultFields(elementId, elementId), scrolled: false, - error: buildElementCacheMissMessage(elementId), + error: buildElementCacheMissMessage({ + conversationId, + tabId, + elementId, + }), }; } const element = cachedElement.element; + resolvedElementFields = buildResolvedElementResultFields( + cachedElement.requestedElementId, + cachedElement.resolvedElementId, + ); console.log( `✅ [ElementScroll] Found element: selector="${element.selector}"`, @@ -1043,20 +1055,18 @@ export async function performElementScroll( script = ` (function() { const selector = "${escapedSelector}"; - const expectedHighlightSnapshotId = ${highlightSnapshotId}; const expectedDocumentId = "${escapedDocumentId}"; const expectedFingerprint = "${escapedFingerprint}"; const el = document.querySelector(selector); const xMultiplier = ${xMultiplier}; const yMultiplier = ${yMultiplier}; - ${buildSnapshotIdentityHelpersScript()} + ${buildCachedElementIdentityHelpersScript()} if (!el) { return { scrolled: false, error: "Element not found in DOM", stale: true }; } - const snapshotValidation = validateSnapshotElement( - expectedHighlightSnapshotId, + const snapshotValidation = validateCachedElement( expectedDocumentId, expectedFingerprint, el, @@ -1212,7 +1222,10 @@ export async function performElementScroll( console.error(`❌ [ElementScroll] JavaScript execution error:`, error); return { success: false, - elementId, + ...(resolvedElementFields ?? + (elementId + ? buildResolvedElementResultFields(elementId, elementId) + : {})), scrolled: false, }; } @@ -1224,7 +1237,10 @@ export async function performElementScroll( ); return { success: false, - elementId, + ...(resolvedElementFields ?? + (elementId + ? buildResolvedElementResultFields(elementId, elementId) + : {})), scrolled: false, }; } @@ -1243,7 +1259,10 @@ export async function performElementScroll( ); const result: ScrollResult = { success: true, - elementId, + ...(resolvedElementFields ?? + (elementId + ? buildResolvedElementResultFields(elementId, elementId) + : {})), scrolled: true, new_tabs_created: jsResult.new_tabs_created, }; @@ -1282,7 +1301,10 @@ export async function performElementScroll( return { success: false, - elementId, + ...(resolvedElementFields ?? + (elementId + ? buildResolvedElementResultFields(elementId, elementId) + : {})), scrolled: false, staleElement: isStale, }; @@ -1301,7 +1323,10 @@ export async function performElementScroll( const result: ScrollResult = { success: true, - elementId, + ...(resolvedElementFields ?? + (elementId + ? buildResolvedElementResultFields(elementId, elementId) + : {})), scrolled: true, scrollEffective, ...(warning ? { warning } : {}), @@ -1336,7 +1361,6 @@ export async function performElementScroll( */ export async function performElementSwipe( conversationId: string, - highlightSnapshotId: number, elementId: string, direction: SwipeDirection, tabId: number, @@ -1344,25 +1368,32 @@ export async function performElementSwipe( timeout: number = 30000, ): Promise { console.log( - `🫳 [ElementSwipe] Swiping element ${elementId} from snapshot ${highlightSnapshotId} ${direction} (count: ${swipeCount}) in conversation ${conversationId} on tab ${tabId}`, + `🫳 [ElementSwipe] Swiping element ${elementId} ${direction} (count: ${swipeCount}) in conversation ${conversationId} on tab ${tabId}`, ); const cachedElement = elementCache.getElementById( conversationId, tabId, - highlightSnapshotId, elementId, ); if (!cachedElement) { console.log(`❌ [ElementSwipe] Element ${elementId} not found in cache`); return { success: false, - elementId, + ...buildResolvedElementResultFields(elementId, elementId), swiped: false, - error: buildElementCacheMissMessage(elementId), + error: buildElementCacheMissMessage({ + conversationId, + tabId, + elementId, + }), }; } const element = cachedElement.element; + const resolvedElementFields = buildResolvedElementResultFields( + cachedElement.requestedElementId, + cachedElement.resolvedElementId, + ); console.log( `✅ [ElementSwipe] Found element: selector="${element.selector}"`, @@ -1381,20 +1412,18 @@ export async function performElementSwipe( const script = ` (async function() { const selector = "${escapedSelector}"; - const expectedHighlightSnapshotId = ${highlightSnapshotId}; const expectedDocumentId = "${escapedDocumentId}"; const expectedFingerprint = "${escapedFingerprint}"; const direction = "${direction}"; const swipeCount = ${swipeCount}; - ${buildSnapshotIdentityHelpersScript()} + ${buildCachedElementIdentityHelpersScript()} const el = document.querySelector(selector); if (!el) { return { swiped: false, error: "Element not found in DOM", stale: true }; } - const snapshotValidation = validateSnapshotElement( - expectedHighlightSnapshotId, + const snapshotValidation = validateCachedElement( expectedDocumentId, expectedFingerprint, el, @@ -2392,7 +2421,7 @@ export async function performElementSwipe( console.error(`❌ [ElementSwipe] JavaScript execution error:`, error); return { success: false, - elementId, + ...resolvedElementFields, swiped: false, error: error instanceof Error ? error.message : String(error), }; @@ -2402,7 +2431,7 @@ export async function performElementSwipe( console.log(`❌ [ElementSwipe] Swipe execution failed: ${jsResult.error}`); return { success: false, - elementId, + ...resolvedElementFields, swiped: false, error: jsResult.error || 'Swipe JavaScript execution failed', }; @@ -2414,7 +2443,7 @@ export async function performElementSwipe( ); const result: SwipeResult = { success: true, - elementId, + ...resolvedElementFields, swiped: true, new_tabs_created: jsResult.new_tabs_created, }; @@ -2449,7 +2478,7 @@ export async function performElementSwipe( console.log(`❌ [ElementSwipe] Swipe failed: ${error}, stale=${isStale}`); return { success: false, - elementId, + ...resolvedElementFields, swiped: false, staleElement: isStale, error, @@ -2471,7 +2500,7 @@ export async function performElementSwipe( return { success: true, - elementId, + ...resolvedElementFields, swiped: true, swipeEffective, ...(warning ? { warning } : {}), @@ -2499,8 +2528,7 @@ export interface InputResult extends ElementActionResult { * 4. Return result with input value * * @param conversationId Session ID for element cache lookup - * @param highlightSnapshotId Highlight snapshot ID returned by highlight_elements - * @param elementId Cached element ID from the referenced highlight snapshot + * @param elementId Cached element ID from the latest highlight cache * @param text Text to input into the element * @param tabId Target tab ID * @param timeout Maximum execution time in milliseconds (default: 30000) @@ -2508,14 +2536,13 @@ export interface InputResult extends ElementActionResult { */ export async function performKeyboardInput( conversationId: string, - highlightSnapshotId: number, elementId: string, text: string, tabId: number, timeout: number = 30000, ): Promise { console.log( - `⌨️ [KeyboardInput] Inputting text to element ${elementId} from snapshot ${highlightSnapshotId} in conversation ${conversationId} on tab ${tabId}`, + `⌨️ [KeyboardInput] Inputting text to element ${elementId} in conversation ${conversationId} on tab ${tabId}`, ); // ============================================================ @@ -2524,20 +2551,27 @@ export async function performKeyboardInput( const cachedElement = elementCache.getElementById( conversationId, tabId, - highlightSnapshotId, elementId, ); if (!cachedElement) { console.log(`❌ [KeyboardInput] Element ${elementId} not found in cache`); return { success: false, - elementId, + ...buildResolvedElementResultFields(elementId, elementId), input: false, staleElement: false, - error: buildElementCacheMissMessage(elementId), + error: buildElementCacheMissMessage({ + conversationId, + tabId, + elementId, + }), }; } const element = cachedElement.element; + const resolvedElementFields = buildResolvedElementResultFields( + cachedElement.requestedElementId, + cachedElement.resolvedElementId, + ); console.log( `✅ [KeyboardInput] Found element: selector="${element.selector}"`, @@ -2561,7 +2595,6 @@ export async function performKeyboardInput( const script = ` (function() { const selector = "${escapedSelector}"; - const expectedHighlightSnapshotId = ${highlightSnapshotId}; const expectedDocumentId = "${escapedDocumentId}"; const expectedFingerprint = "${escapedFingerprint}"; const text = "${escapedText}"; @@ -2572,8 +2605,7 @@ export async function performKeyboardInput( return { input: false, error: "Element not found in DOM", stale: true }; } - const snapshotValidation = validateSnapshotElement( - expectedHighlightSnapshotId, + const snapshotValidation = validateCachedElement( expectedDocumentId, expectedFingerprint, el, @@ -2689,7 +2721,7 @@ export async function performKeyboardInput( console.error(`❌ [KeyboardInput] JavaScript execution error:`, error); return { success: false, - elementId, + ...resolvedElementFields, input: false, staleElement: false, }; @@ -2707,7 +2739,7 @@ export async function performKeyboardInput( console.log(`❌ [KeyboardInput] Input execution failed: ${jsResult.error}`); return { success: false, - elementId, + ...resolvedElementFields, input: false, staleElement: false, }; @@ -2727,7 +2759,7 @@ export async function performKeyboardInput( ); const result: InputResult = { success: true, - elementId, + ...resolvedElementFields, input: true, value: undefined, new_tabs_created: jsResult.new_tabs_created, @@ -2759,7 +2791,7 @@ export async function performKeyboardInput( return { success: false, - elementId, + ...resolvedElementFields, input: false, staleElement: isStale, }; @@ -2772,7 +2804,7 @@ export async function performKeyboardInput( // If dialog opened during input, propagate dialog info const result: InputResult = { success: true, - elementId, + ...resolvedElementFields, input: true, value: inputResult.value, }; @@ -2805,8 +2837,7 @@ export async function performKeyboardInput( * 4. Return result with selected values/labels/indices * * @param conversationId Session ID for element cache lookup - * @param highlightSnapshotId Highlight snapshot ID returned by highlight_elements - * @param elementId Cached element ID from the referenced highlight snapshot (for example, "1") + * @param elementId Cached element ID from the latest highlight cache (for example, "A1H") * @param tabId Target tab ID * @param value Option value(s) to select. Use string for single select, array for multi-select * @param timeout Maximum execution time in milliseconds (default: 30000) @@ -2814,14 +2845,13 @@ export async function performKeyboardInput( */ export async function performElementSelect( conversationId: string, - highlightSnapshotId: number, elementId: string, tabId: number, value: string | string[], timeout: number = 30000, ): Promise { console.log( - `📋 [ElementSelect] Selecting element ${elementId} from snapshot ${highlightSnapshotId} in conversation ${conversationId} on tab ${tabId}`, + `📋 [ElementSelect] Selecting element ${elementId} in conversation ${conversationId} on tab ${tabId}`, ); // ============================================================ @@ -2830,20 +2860,27 @@ export async function performElementSelect( const cachedElement = elementCache.getElementById( conversationId, tabId, - highlightSnapshotId, elementId, ); if (!cachedElement) { console.log(`❌ [ElementSelect] Element ${elementId} not found in cache`); return { success: false, - elementId, + ...buildResolvedElementResultFields(elementId, elementId), selected: false, staleElement: false, - error: buildElementCacheMissMessage(elementId), + error: buildElementCacheMissMessage({ + conversationId, + tabId, + elementId, + }), }; } const element = cachedElement.element; + const resolvedElementFields = buildResolvedElementResultFields( + cachedElement.requestedElementId, + cachedElement.resolvedElementId, + ); console.log( `✅ [ElementSelect] Found element: selector="${element.selector}"`, @@ -2869,11 +2906,10 @@ export async function performElementSelect( const script = ` (function() { const selector = "${escapedSelector}"; - const expectedHighlightSnapshotId = ${highlightSnapshotId}; const expectedDocumentId = "${escapedDocumentId}"; const expectedFingerprint = "${escapedFingerprint}"; const value = ${valueJson}; - ${buildSnapshotIdentityHelpersScript()} + ${buildCachedElementIdentityHelpersScript()} const el = document.querySelector(selector); @@ -2881,8 +2917,7 @@ export async function performElementSelect( return { selected: false, error: "Element not found in DOM", stale: true }; } - const snapshotValidation = validateSnapshotElement( - expectedHighlightSnapshotId, + const snapshotValidation = validateCachedElement( expectedDocumentId, expectedFingerprint, el, @@ -2999,7 +3034,7 @@ export async function performElementSelect( console.error(`❌ [ElementSelect] JavaScript execution error:`, error); return { success: false, - elementId, + ...resolvedElementFields, selected: false, staleElement: false, }; @@ -3016,7 +3051,7 @@ export async function performElementSelect( ); return { success: false, - elementId, + ...resolvedElementFields, selected: false, staleElement: false, }; @@ -3036,7 +3071,7 @@ export async function performElementSelect( ); const result: SelectResult = { success: true, - elementId, + ...resolvedElementFields, selected: true, new_tabs_created: jsResult.new_tabs_created, }; @@ -3075,7 +3110,7 @@ export async function performElementSelect( return { success: false, - elementId, + ...resolvedElementFields, selected: false, staleElement: isStale, error: selectResult?.error, @@ -3089,7 +3124,7 @@ export async function performElementSelect( // Build result with selected values const result: SelectResult = { success: true, - elementId, + ...resolvedElementFields, selected: true, selectedValues: selectResult.selectedValues, selectedLabels: selectResult.selectedLabels, diff --git a/extension/src/commands/element-cache.ts b/extension/src/commands/element-cache.ts index 61babb5..a3f56b6 100644 --- a/extension/src/commands/element-cache.ts +++ b/extension/src/commands/element-cache.ts @@ -1,16 +1,21 @@ /** - * Highlight snapshot cache manager. + * Document-scoped element cache manager. * - * Two cache layers are maintained: - * 1. Per-call highlight inventories used to serve requested pages and keep - * page-local element IDs stable within one highlight response. - * 2. Page-scoped highlight snapshots returned to callers and used for - * element interactions together with page-local element IDs. + * Each conversation/tab keeps one active cache for the current highlighted + * document: + * 1. Persistent element-id assignments for the current document + * 2. A merged element lookup table keyed only by element_id + * 3. Latest highlight metadata for the current document */ import type { ElementType, InteractiveElement } from '../types'; +import { + buildElementIdentityKey, + generateUniqueHash, + normalizeVisualElementIdInput, +} from './element-id'; -interface HighlightInventoryEntry { +interface DocumentElementCacheEntry { tabId: number; createdAt: number; lastAccessedAt: number; @@ -18,19 +23,13 @@ interface HighlightInventoryEntry { elementType: ElementType; keywords: string[]; totalElements: number; - pages: InteractiveElement[][]; -} - -interface HighlightSnapshotViewEntry { - tabId: number; - inventoryId: number; - createdAt: number; - page: number; + totalPages: number; + idByIdentityKey: Map; + usedIds: Set; + elementsById: Map; } -export interface HighlightSnapshotPage { - snapshotId: number; - inventoryId: number; +export interface StoredHighlightPage { page: number; totalPages: number; totalElements: number; @@ -41,150 +40,172 @@ export interface HighlightSnapshotPage { } export interface CachedElementLookup { - snapshotId: number; - inventoryId: number; - page: number; - totalPages: number; - totalElements: number; documentId: string; elementType: ElementType; keywords: string[]; + totalElements: number; + totalPages: number; + requestedElementId: string; + resolvedElementId: string; + normalizedRequestedElementId: string; + elementIdCorrected: boolean; element: InteractiveElement; } +export interface ElementIdSuggestion { + elementId: string; + html: string; + matchedPositions: number; +} + export const ELEMENT_CACHE_TTL_MS = 1_200_000; // 20 minutes export const ELEMENT_CACHE_TTL_DESCRIPTION = `${ELEMENT_CACHE_TTL_MS / 60_000} minutes`; -const MAX_HIGHLIGHT_INVENTORIES_PER_TAB = 12; class ElementCacheImpl { - private inventories = new Map(); - - private snapshotViews = new Map(); + private documents = new Map(); - private nextInventoryId = 1; - - private nextSnapshotId = 1; - - private buildInventoryKey( - conversationId: string, - tabId: number, - inventoryId: number, - ): string { - return `${conversationId}:${tabId}:inventory:${inventoryId}`; - } - - private buildSnapshotKey( - conversationId: string, - tabId: number, - snapshotId: number, - ): string { - return `${conversationId}:${tabId}:snapshot:${snapshotId}`; - } - - private touchInventory(entry: HighlightInventoryEntry): void { - entry.lastAccessedAt = Date.now(); + private buildDocumentKey(conversationId: string, tabId: number): string { + return `${conversationId}:${tabId}`; } private isExpired(timestamp: number): boolean { return Date.now() - timestamp > ELEMENT_CACHE_TTL_MS; } - private removeInventoryByKey(key: string): void { - const inventory = this.inventories.get(key); - if (!inventory) { - return; - } + private cloneElement( + element: InteractiveElement, + id: string, + ): InteractiveElement { + return { + ...element, + bbox: { ...element.bbox }, + id, + }; + } - this.inventories.delete(key); + private touchEntry(entry: DocumentElementCacheEntry): void { + entry.lastAccessedAt = Date.now(); + } - const snapshotKeysToDelete: string[] = []; - for (const [snapshotKey, snapshot] of this.snapshotViews.entries()) { - if (snapshot.inventoryId === this.parseInventoryIdFromKey(key)) { - snapshotKeysToDelete.push(snapshotKey); + private cleanupExpired(): void { + for (const [key, entry] of this.documents.entries()) { + if (this.isExpired(entry.lastAccessedAt)) { + this.documents.delete(key); + console.log(`⏰ [ElementCache] Document cache expired for key ${key}`); } } - for (const snapshotKey of snapshotKeysToDelete) { - this.snapshotViews.delete(snapshotKey); - } - - console.log( - `🗑️ [ElementCache] Removed highlight inventory ${key} (${inventory.pages.length} pages, ${snapshotKeysToDelete.length} snapshots)`, - ); } - private parseInventoryIdFromKey(key: string): number { - const maybeId = Number.parseInt(key.split(':').at(-1) ?? '', 10); - return Number.isFinite(maybeId) ? maybeId : -1; - } + private getOrCreateEntry(options: { + conversationId: string; + tabId: number; + documentId: string; + elementType: ElementType; + keywords: string[]; + totalElements: number; + totalPages: number; + }): DocumentElementCacheEntry { + const { + conversationId, + tabId, + documentId, + elementType, + keywords, + totalElements, + totalPages, + } = options; - private cleanupExpired(): void { - const activeInventoryKeys = new Set(); + this.cleanupExpired(); - for (const [snapshotKey, snapshot] of this.snapshotViews.entries()) { - if (this.isExpired(snapshot.createdAt)) { - this.snapshotViews.delete(snapshotKey); - console.log( - `⏰ [ElementCache] Snapshot expired for key ${snapshotKey}`, - ); - continue; - } + const key = this.buildDocumentKey(conversationId, tabId); + const existing = this.documents.get(key); + const now = Date.now(); - const inventoryKey = snapshotKey.replace( - /:snapshot:\d+$/, - `:inventory:${snapshot.inventoryId}`, - ); - activeInventoryKeys.add(inventoryKey); + if (existing && existing.documentId === documentId) { + existing.lastAccessedAt = now; + existing.elementType = elementType; + existing.keywords = [...keywords]; + existing.totalElements = totalElements; + existing.totalPages = totalPages; + return existing; } - const inventoryKeysToDelete: string[] = []; - for (const [inventoryKey, inventory] of this.inventories.entries()) { - if (this.isExpired(inventory.lastAccessedAt)) { - inventoryKeysToDelete.push(inventoryKey); - continue; - } - - if ( - !activeInventoryKeys.has(inventoryKey) && - this.isExpired(inventory.createdAt) - ) { - inventoryKeysToDelete.push(inventoryKey); - } - } + const created: DocumentElementCacheEntry = { + tabId, + createdAt: now, + lastAccessedAt: now, + documentId, + elementType, + keywords: [...keywords], + totalElements, + totalPages, + idByIdentityKey: new Map(), + usedIds: new Set(), + elementsById: new Map(), + }; - for (const inventoryKey of inventoryKeysToDelete) { - this.removeInventoryByKey(inventoryKey); - } + this.documents.set(key, created); + console.log( + `📁 [ElementCache] Started new document cache for conversation ${conversationId}, tab ${tabId}, document ${documentId}`, + ); + return created; } - private pruneInventoriesForTab(conversationId: string, tabId: number): void { - const prefix = `${conversationId}:${tabId}:inventory:`; - const matchingInventories = Array.from(this.inventories.entries()) - .filter(([key]) => key.startsWith(prefix)) - .sort((a, b) => a[1].createdAt - b[1].createdAt); - - if (matchingInventories.length <= MAX_HIGHLIGHT_INVENTORIES_PER_TAB) { - return; + private assignIdsForEntry( + entry: DocumentElementCacheEntry, + elements: InteractiveElement[], + ): InteractiveElement[] { + const assignedIds = new Array(elements.length); + + const elementsByStableKey = elements + .map((element, index) => ({ + element, + index, + identityKey: buildElementIdentityKey(element), + })) + .sort((left, right) => { + const keyOrder = left.identityKey.localeCompare(right.identityKey); + if (keyOrder !== 0) { + return keyOrder; + } + return left.index - right.index; + }); + + for (const { element, index, identityKey } of elementsByStableKey) { + let elementId = entry.idByIdentityKey.get(identityKey); + if (!elementId) { + if (element.id && !entry.usedIds.has(element.id)) { + elementId = element.id; + } else { + const { hash } = generateUniqueHash( + element.selector, + entry.usedIds, + element.html, + ); + elementId = hash; + } + entry.idByIdentityKey.set(identityKey, elementId); + entry.usedIds.add(elementId); + } + assignedIds[index] = elementId; } - const toDelete = matchingInventories.slice( - 0, - matchingInventories.length - MAX_HIGHLIGHT_INVENTORIES_PER_TAB, + return elements.map((element, index) => + this.cloneElement(element, assignedIds[index] || element.id), ); - for (const [inventoryKey] of toDelete) { - this.removeInventoryByKey(inventoryKey); - } } - storeSnapshot(options: { + storeHighlightResult(options: { conversationId: string; tabId: number; documentId: string; elementType: ElementType; keywords?: string[]; totalElements: number; - pages: InteractiveElement[][]; + totalPages: number; page: number; - }): HighlightSnapshotPage { + pages: InteractiveElement[][]; + }): StoredHighlightPage { const { conversationId, tabId, @@ -192,243 +213,255 @@ class ElementCacheImpl { elementType, keywords = [], totalElements, - pages, + totalPages, page, + pages, } = options; - this.cleanupExpired(); - - const inventoryId = this.nextInventoryId++; - const snapshotId = this.nextSnapshotId++; - const now = Date.now(); - const inventoryKey = this.buildInventoryKey( - conversationId, - tabId, - inventoryId, - ); - const snapshotKey = this.buildSnapshotKey( + const entry = this.getOrCreateEntry({ conversationId, tabId, - snapshotId, - ); - - this.inventories.set(inventoryKey, { - tabId, - createdAt: now, - lastAccessedAt: now, documentId, elementType, - keywords: [...keywords], + keywords, totalElements, - pages: pages.map((snapshotPage) => - snapshotPage.map((element) => ({ - ...element, - bbox: { ...element.bbox }, - })), - ), + totalPages, }); - this.snapshotViews.set(snapshotKey, { - tabId, - inventoryId, - createdAt: now, - page, - }); - - this.pruneInventoriesForTab(conversationId, tabId); - - const snapshotPage = this.getSnapshotPage( - conversationId, - tabId, - snapshotId, - ); - if (!snapshotPage) { - throw new Error( - `Failed to retrieve newly stored highlight snapshot ${snapshotId}`, - ); - } - - console.log( - `📁 [ElementCache] Stored highlight inventory ${inventoryId} and snapshot ${snapshotId} for conversation ${conversationId}, tab ${tabId} (${pages.length} pages, ${totalElements} total elements)`, + const assignedPages = pages.map((pageElements) => + this.assignIdsForEntry(entry, pageElements), ); - return snapshotPage; - } - - getSnapshotPage( - conversationId: string, - tabId: number, - snapshotId: number, - ): HighlightSnapshotPage | undefined { - this.cleanupExpired(); - const snapshot = this.getSnapshotView(conversationId, tabId, snapshotId); - if (!snapshot) { - return undefined; - } - - const inventory = this.getInventory( - conversationId, - tabId, - snapshot.inventoryId, - ); - if (!inventory) { - return undefined; + for (const pageElements of assignedPages) { + for (const element of pageElements) { + entry.elementsById.set( + element.id, + this.cloneElement(element, element.id), + ); + } } - this.touchInventory(inventory); + this.touchEntry(entry); - const pageIndex = Math.max(0, snapshot.page - 1); - const elements = inventory.pages[pageIndex] ?? []; + console.log( + `📁 [ElementCache] Stored ${assignedPages.length} highlight pages for conversation ${conversationId}, tab ${tabId} (${totalElements} total elements on document ${documentId})`, + ); return { - snapshotId, - inventoryId: snapshot.inventoryId, - page: snapshot.page, - totalPages: inventory.pages.length, - totalElements: inventory.totalElements, - elementType: inventory.elementType, - keywords: [...inventory.keywords], - documentId: inventory.documentId, - elements: elements.map((element) => ({ - ...element, - bbox: { ...element.bbox }, - })), + page, + totalPages, + totalElements, + elementType, + keywords: [...keywords], + documentId, + elements: (assignedPages[Math.max(0, page - 1)] ?? []).map((element) => + this.cloneElement(element, element.id), + ), }; } getElementById( conversationId: string, tabId: number, - snapshotId: number, elementId: string, ): CachedElementLookup | undefined { - const snapshotPage = this.getSnapshotPage( - conversationId, - tabId, - snapshotId, - ); - if (!snapshotPage) { + this.cleanupExpired(); + + const key = this.buildDocumentKey(conversationId, tabId); + const entry = this.documents.get(key); + if (!entry || entry.tabId !== tabId) { return undefined; } - const element = snapshotPage.elements.find( - (candidate) => candidate.id === elementId, - ); + this.touchEntry(entry); + const requestedElementId = elementId; + const normalizedRequestedElementId = + normalizeVisualElementIdInput(requestedElementId); + let resolvedElementId = requestedElementId; + let element = entry.elementsById.get(requestedElementId); + + if (!element && normalizedRequestedElementId !== requestedElementId) { + element = entry.elementsById.get(normalizedRequestedElementId); + if (element) { + resolvedElementId = normalizedRequestedElementId; + } + } + if (!element) { return undefined; } return { - snapshotId, - inventoryId: snapshotPage.inventoryId, - page: snapshotPage.page, - totalPages: snapshotPage.totalPages, - totalElements: snapshotPage.totalElements, - documentId: snapshotPage.documentId, - elementType: snapshotPage.elementType, - keywords: snapshotPage.keywords, - element, + documentId: entry.documentId, + elementType: entry.elementType, + keywords: [...entry.keywords], + totalElements: entry.totalElements, + totalPages: entry.totalPages, + requestedElementId, + resolvedElementId, + normalizedRequestedElementId, + elementIdCorrected: requestedElementId !== resolvedElementId, + element: this.cloneElement(element, element.id), }; } - getSnapshotView( + getElementIdSuggestions( conversationId: string, tabId: number, - snapshotId: number, - ): HighlightSnapshotViewEntry | undefined { - if (!conversationId) { - return undefined; - } - - const snapshotKey = this.buildSnapshotKey( - conversationId, - tabId, - snapshotId, - ); - const snapshot = this.snapshotViews.get(snapshotKey); - if (!snapshot) { - return undefined; - } + elementId: string, + limit: number = 3, + ): ElementIdSuggestion[] { + this.cleanupExpired(); - if (snapshot.tabId !== tabId || this.isExpired(snapshot.createdAt)) { - this.snapshotViews.delete(snapshotKey); - console.log( - `⏰ [ElementCache] Snapshot expired or mismatched for key ${snapshotKey}`, - ); - return undefined; + const key = this.buildDocumentKey(conversationId, tabId); + const entry = this.documents.get(key); + if (!entry || entry.tabId !== tabId) { + return []; } - return snapshot; - } + this.touchEntry(entry); - getInventory( - conversationId: string, - tabId: number, - inventoryId: number, - ): HighlightInventoryEntry | undefined { - if (!conversationId) { - return undefined; + const normalizedRequestedElementId = + normalizeVisualElementIdInput(elementId).toUpperCase(); + if (!normalizedRequestedElementId) { + return []; } - const inventoryKey = this.buildInventoryKey( - conversationId, - tabId, - inventoryId, + const minimumMatchedPositions = Math.max( + 2, + normalizedRequestedElementId.length - 1, ); - const inventory = this.inventories.get(inventoryKey); - if (!inventory) { - return undefined; - } - if (inventory.tabId !== tabId || this.isExpired(inventory.lastAccessedAt)) { - this.removeInventoryByKey(inventoryKey); - return undefined; - } + return Array.from(entry.elementsById.values()) + .map((element) => { + const candidateId = element.id.toUpperCase(); + if ( + candidateId === normalizedRequestedElementId || + candidateId.length !== normalizedRequestedElementId.length + ) { + return null; + } + + const matchedPositions = Array.from(candidateId).reduce( + (count, char, index) => + count + + (char === normalizedRequestedElementId.charAt(index) ? 1 : 0), + 0, + ); + const weightedPositionScore = Array.from(candidateId).reduce( + (score, char, index) => + score + + (char === normalizedRequestedElementId.charAt(index) + ? normalizedRequestedElementId.length - index + : 0), + 0, + ); - return inventory; + if (matchedPositions < minimumMatchedPositions) { + return null; + } + + return { + elementId: element.id, + html: compactHtmlSnippet(element.html || `<${element.tagName}>`), + matchedPositions, + weightedPositionScore, + }; + }) + .filter( + ( + suggestion, + ): suggestion is ElementIdSuggestion & { + weightedPositionScore: number; + } => suggestion !== null, + ) + .sort((left, right) => { + if (right.matchedPositions !== left.matchedPositions) { + return right.matchedPositions - left.matchedPositions; + } + if (right.weightedPositionScore !== left.weightedPositionScore) { + return right.weightedPositionScore - left.weightedPositionScore; + } + return left.elementId.localeCompare(right.elementId); + }) + .map( + ({ weightedPositionScore: _weightedPositionScore, ...suggestion }) => + suggestion, + ) + .slice(0, Math.max(0, limit)); } invalidate(conversationId: string, tabId?: number): void { - const inventoryPrefix = - tabId !== undefined - ? `${conversationId}:${tabId}:inventory:` - : `${conversationId}:`; - const snapshotPrefix = - tabId !== undefined - ? `${conversationId}:${tabId}:snapshot:` - : `${conversationId}:`; - - const inventoryKeysToDelete = Array.from(this.inventories.keys()).filter( - (key) => key.startsWith(inventoryPrefix), - ); - const snapshotKeysToDelete = Array.from(this.snapshotViews.keys()).filter( - (key) => key.startsWith(snapshotPrefix), - ); + const keysToDelete = Array.from(this.documents.keys()).filter((key) => { + if (tabId === undefined) { + return key.startsWith(`${conversationId}:`); + } + return key === this.buildDocumentKey(conversationId, tabId); + }); - for (const key of inventoryKeysToDelete) { - this.inventories.delete(key); - } - for (const key of snapshotKeysToDelete) { - this.snapshotViews.delete(key); + for (const key of keysToDelete) { + this.documents.delete(key); } - if (inventoryKeysToDelete.length > 0 || snapshotKeysToDelete.length > 0) { + if (keysToDelete.length > 0) { const scope = tabId !== undefined ? `tab ${tabId}` : 'all tabs'; console.log( - `🗑️ [ElementCache] Invalidated ${inventoryKeysToDelete.length} inventories and ${snapshotKeysToDelete.length} snapshots for conversation ${conversationId} (${scope})`, + `🗑️ [ElementCache] Invalidated ${keysToDelete.length} document caches for conversation ${conversationId} (${scope})`, ); } } clearAll(): void { - this.inventories.clear(); - this.snapshotViews.clear(); + this.documents.clear(); console.log('🧹 [ElementCache] Cleared all caches'); } get size(): number { - return this.snapshotViews.size; + return this.documents.size; } } export const elementCache = new ElementCacheImpl(); + +function compactHtmlSnippet(html: string): string { + return html.replace(/\s+/g, ' ').trim().slice(0, 180); +} + +export function buildElementCacheMissMessage(options: { + conversationId: string; + tabId: number; + elementId: string; + refreshHint?: string; +}): string { + const { + conversationId, + tabId, + elementId, + refreshHint = 'Call highlight_elements() again to refresh the element cache.', + } = options; + const normalizedElementId = normalizeVisualElementIdInput(elementId); + const suggestions = elementCache.getElementIdSuggestions( + conversationId, + tabId, + normalizedElementId, + ); + + const baseMessage = + normalizedElementId && normalizedElementId !== elementId + ? `Element '${elementId}' was interpreted as '${normalizedElementId}' for visual-safe ID matching, but no cached element matched.` + : `Element '${elementId}' not found in cache.`; + const ttlMessage = `Highlight caches expire after ${ELEMENT_CACHE_TTL_DESCRIPTION}. ${refreshHint}`; + + if (suggestions.length === 0) { + return `${baseMessage} ${ttlMessage}`; + } + + const suggestedIds = suggestions + .map((suggestion) => `'${suggestion.elementId}'`) + .join(', '); + const suggestedHtml = suggestions + .map((suggestion) => `${suggestion.elementId}: ${suggestion.html}`) + .join(' | '); + + return `${baseMessage} ${ttlMessage} Maybe try ${suggestedIds}. Candidate HTML: ${suggestedHtml}`; +} diff --git a/extension/src/commands/element-id.ts b/extension/src/commands/element-id.ts index 57513be..f312eea 100644 --- a/extension/src/commands/element-id.ts +++ b/extension/src/commands/element-id.ts @@ -1,17 +1,167 @@ import type { InteractiveElement } from '../types'; +export const ELEMENT_ID_CHARSET = '123456789ACDEFHJKMNOPQRTUVWXY'; +export const ELEMENT_ID_LENGTH = 3; +const ELEMENT_ID_SPACE = ELEMENT_ID_CHARSET.length ** ELEMENT_ID_LENGTH; +const NORMALIZABLE_ELEMENT_ID_PATTERN = /^[0-9A-Za-z]{3}$/; +const AMBIGUOUS_ELEMENT_ID_CHAR_MAP: Record = { + '0': 'O', + o: 'O', + O: 'O', + i: '1', + I: '1', + l: '1', + L: '1', + z: '2', + Z: '2', + s: '5', + S: '5', + g: '6', + G: '6', + b: '8', + B: '8', +}; + +function encodeFixedVisualId(value: number): string { + let remaining = value; + const chars = Array.from( + { length: ELEMENT_ID_LENGTH }, + () => ELEMENT_ID_CHARSET[0], + ); + + for (let index = ELEMENT_ID_LENGTH - 1; index >= 0; index -= 1) { + chars[index] = ELEMENT_ID_CHARSET[remaining % ELEMENT_ID_CHARSET.length]; + remaining = Math.floor(remaining / ELEMENT_ID_CHARSET.length); + } + + return chars.join(''); +} + +/** + * Generate a short stable hash from a selector and optional HTML content. + * + * Uses FNV-1a for speed and reasonable distribution, then projects into the + * fixed 3-character visual-safe ID space used by highlight labels. + */ +export function generateShortHash( + cssPath: string, + html?: string, + salt: number = 0, +): string { + const FNV_PRIME = 0x01000193; + const FNV_OFFSET = 0x811c9dc5; + + let input = html ? `${cssPath}:${html}` : cssPath; + if (salt > 0) { + input = `${input}:${salt}`; + } + + let hash = FNV_OFFSET; + for (let index = 0; index < input.length; index += 1) { + hash ^= input.charCodeAt(index); + hash = Math.imul(hash, FNV_PRIME); + } + + return encodeFixedVisualId((hash >>> 0) % ELEMENT_ID_SPACE); +} + +export function generateUniqueHash( + cssPath: string, + existingHashes: Set, + html?: string, + maxAttempts: number = 512, +): { hash: string; salt: number } { + let salt = 0; + + while (salt < maxAttempts) { + const hash = generateShortHash(cssPath, html, salt); + if (!existingHashes.has(hash)) { + return { hash, salt }; + } + salt += 1; + } + + const fallbackSalt = Date.now(); + return { + hash: generateShortHash(cssPath, html, fallbackSalt), + salt: fallbackSalt, + }; +} + +export function normalizeVisualElementIdInput(value: string): string { + const compact = value.trim().replace(/\s+/g, ''); + if (!compact) { + return ''; + } + + if (!NORMALIZABLE_ELEMENT_ID_PATTERN.test(compact)) { + return compact; + } + + return compact + .split('') + .map((char) => { + const mapped = AMBIGUOUS_ELEMENT_ID_CHAR_MAP[char]; + if (mapped) { + return mapped; + } + + return char.toUpperCase(); + }) + .join(''); +} + +export function buildElementIdentityKey(element: InteractiveElement): string { + return `${element.selector}\u0000${element.html ?? ''}`; +} + /** - * Reassign element IDs to page-local sequential numbers. + * Assign short hash IDs that stay stable for the same selector/content. * - * IDs are intentionally page-local: each highlight snapshot page starts at 1 - * again, and the page-local IDs must be paired with highlight_snapshot_id. + * IDs are opaque references, not sequence numbers. */ -export function assignSequentialElementIds( +export function assignHashedElementIds( elements: InteractiveElement[], ): InteractiveElement[] { - return elements.map((element, index) => ({ - ...element, - bbox: { ...element.bbox }, - id: String(index + 1), - })); + const existingHashes = new Set(); + const assignedIds = new Array(elements.length); + + const elementsByStableKey = elements + .map((element, index) => ({ + element, + index, + identityKey: buildElementIdentityKey(element), + })) + .sort((left, right) => { + const keyOrder = left.identityKey.localeCompare(right.identityKey); + if (keyOrder !== 0) { + return keyOrder; + } + return left.index - right.index; + }); + + for (const { element, index } of elementsByStableKey) { + const { hash } = generateUniqueHash( + element.selector, + existingHashes, + element.html, + ); + existingHashes.add(hash); + assignedIds[index] = hash; + } + + return elements.map((element, index) => { + const assignedId = assignedIds[index]; + if (!assignedId) { + throw new Error( + `Failed to assign an element ID for selector "${element.selector}"`, + ); + } + + return { + ...element, + bbox: { ...element.bbox }, + id: assignedId, + }; + }); } diff --git a/extension/src/commands/label-constants.ts b/extension/src/commands/label-constants.ts index 9679b11..36862b3 100644 --- a/extension/src/commands/label-constants.ts +++ b/extension/src/commands/label-constants.ts @@ -4,6 +4,7 @@ */ export const LABEL_FONT_SIZE = 16; -export const LABEL_PADDING = 5; -export const LABEL_HEIGHT = LABEL_FONT_SIZE + LABEL_PADDING * 2; // 26px +export const LABEL_PADDING = 3; +export const LABEL_HEIGHT = LABEL_FONT_SIZE + LABEL_PADDING * 2; // 22px export const MAX_LABEL_WIDTH = 120; // Maximum label width for collision detection +export const LABEL_FONT_FAMILY = 'Arial'; diff --git a/extension/src/commands/single-highlight.ts b/extension/src/commands/single-highlight.ts index 98c3a54..2b1237f 100644 --- a/extension/src/commands/single-highlight.ts +++ b/extension/src/commands/single-highlight.ts @@ -6,15 +6,23 @@ import type { InteractiveElement } from '../types'; // Visual style for single-element confirmation -const CONFIRMATION_COLOR = '#FF6600'; // Orange border +const CONFIRMATION_COLOR = '#FFD400'; // Yellow border +const CONFIRMATION_TEXT_COLOR = '#111111'; +const CONFIRMATION_BANNER_COLOR = 'rgba(255, 212, 0, 0.5)'; +const CONFIRMATION_BANNER_BORDER_COLOR = 'rgba(17, 17, 17, 0.18)'; const BASE_BOX_PADDING = 2; -const BASE_LINE_WIDTH = 3; +const BASE_LINE_WIDTH = 4; const BASE_CONTEXT_PADDING_X = 96; const BASE_CONTEXT_PADDING_Y = 112; const BASE_MIN_CROP_WIDTH = 520; const BASE_MIN_CROP_HEIGHT = 320; const MIN_CROP_WIDTH_RATIO = 0.58; const MIN_CROP_HEIGHT_RATIO = 0.58; +const BASE_BANNER_FONT_SIZE = 22; +const BASE_BANNER_PADDING_X = 12; +const BASE_BANNER_PADDING_Y = 12; +const BASE_BANNER_MARGIN = 14; +const BASE_BANNER_GAP = 12; interface DeviceRect { x: number; @@ -39,7 +47,12 @@ interface ConfirmationPreviewLayout { export async function highlightSingleElement( screenshotDataUrl: string, element: InteractiveElement, - options?: { scale?: number; viewportWidth?: number; viewportHeight?: number }, + options?: { + intendedAction?: 'click' | 'keyboard_input'; + scale?: number; + viewportWidth?: number; + viewportHeight?: number; + }, ): Promise { console.log( `🎨 [SingleHighlight] Drawing highlight for element ${element.id}...`, @@ -199,6 +212,12 @@ export async function highlightSingleElement( // Draw the single element bounding box drawSingleBoundingBox(ctx, previewLayout.element, scale); + drawConfirmationBanner( + ctx, + previewLayout.element, + options?.intendedAction, + scale, + ); const resultBlob = await canvas.convertToBlob({ type: 'image/png' }); @@ -224,6 +243,102 @@ export async function highlightSingleElement( } } +export function formatConfirmationOperationLabel( + intendedAction?: 'click' | 'keyboard_input', +): string { + switch (intendedAction) { + case 'click': + return 'click'; + case 'keyboard_input': + return 'type into'; + default: + return 'interact with'; + } +} + +export function getConfirmationPromptText( + intendedAction?: 'click' | 'keyboard_input', +): string { + return `Is this the element you wanted to ${formatConfirmationOperationLabel(intendedAction)}?`; +} + +export function calculateConfirmationBannerLayout(options: { + canvasWidth: number; + canvasHeight: number; + elementRect: DeviceRect; + message: string; + scale: number; + textWidth?: number; +}): DeviceRect { + const { canvasWidth, canvasHeight, elementRect, message, scale, textWidth } = + options; + const fontSize = Math.max(16, Math.round(BASE_BANNER_FONT_SIZE * scale)); + const paddingX = Math.max(12, Math.round(BASE_BANNER_PADDING_X * scale)); + const paddingY = Math.max(8, Math.round(BASE_BANNER_PADDING_Y * scale)); + const margin = Math.max(10, Math.round(BASE_BANNER_MARGIN * scale)); + const gap = Math.max(8, Math.round(BASE_BANNER_GAP * scale)); + const estimatedTextWidth = Math.ceil(message.length * fontSize * 0.6); + const resolvedTextWidth = Math.ceil(textWidth ?? estimatedTextWidth); + const width = Math.min( + canvasWidth - margin * 2, + resolvedTextWidth + paddingX * 2, + ); + const height = fontSize + paddingY * 2; + + const clampX = (value: number): number => + clamp(value, margin, Math.max(margin, canvasWidth - width - margin)); + const clampY = (value: number): number => + clamp(value, margin, Math.max(margin, canvasHeight - height - margin)); + const centeredX = clampX(elementRect.x + elementRect.width / 2 - width / 2); + const centeredY = clampY(elementRect.y + elementRect.height / 2 - height / 2); + + if (elementRect.y - gap - height >= margin) { + return { + x: centeredX, + y: elementRect.y - gap - height, + width, + height, + }; + } + + if ( + elementRect.y + elementRect.height + gap + height <= + canvasHeight - margin + ) { + return { + x: centeredX, + y: elementRect.y + elementRect.height + gap, + width, + height, + }; + } + + if (elementRect.x + elementRect.width + gap + width <= canvasWidth - margin) { + return { + x: elementRect.x + elementRect.width + gap, + y: centeredY, + width, + height, + }; + } + + if (elementRect.x - gap - width >= margin) { + return { + x: elementRect.x - gap - width, + y: centeredY, + width, + height, + }; + } + + return { + x: centeredX, + y: clampY(elementRect.y + elementRect.height + gap), + width, + height, + }; +} + /** * Calculate a focused preview crop around the target element. */ @@ -311,10 +426,58 @@ function drawSingleBoundingBox( `[SingleHighlight] Drawing confirmation bbox at (${x}, ${y}, ${width}, ${height}) scale=${scale}`, ); - // Draw bounding box with orange color + // Draw bounding box with a bright yellow confirmation color. + ctx.save(); ctx.strokeStyle = CONFIRMATION_COLOR; ctx.lineWidth = lineWidth; + ctx.shadowColor = 'rgba(255, 212, 0, 0.7)'; + ctx.shadowBlur = 12 * scale; ctx.strokeRect(x, y, width, height); + ctx.restore(); +} + +function drawConfirmationBanner( + ctx: OffscreenCanvasRenderingContext2D, + elementRect: DeviceRect, + intendedAction: 'click' | 'keyboard_input' | undefined, + scale: number, +): void { + const message = getConfirmationPromptText(intendedAction); + const fontSize = Math.max(16, Math.round(BASE_BANNER_FONT_SIZE * scale)); + const paddingX = Math.max(12, Math.round(BASE_BANNER_PADDING_X * scale)); + + ctx.save(); + ctx.font = `700 ${fontSize}px sans-serif`; + ctx.textBaseline = 'middle'; + const measuredTextWidth = ctx.measureText(message).width; + const bannerRect = calculateConfirmationBannerLayout({ + canvasWidth: ctx.canvas.width, + canvasHeight: ctx.canvas.height, + elementRect, + message, + scale, + textWidth: measuredTextWidth, + }); + + ctx.fillStyle = CONFIRMATION_BANNER_COLOR; + ctx.fillRect(bannerRect.x, bannerRect.y, bannerRect.width, bannerRect.height); + ctx.strokeStyle = CONFIRMATION_BANNER_BORDER_COLOR; + ctx.lineWidth = Math.max(1, scale); + ctx.strokeRect( + bannerRect.x, + bannerRect.y, + bannerRect.width, + bannerRect.height, + ); + + ctx.fillStyle = CONFIRMATION_TEXT_COLOR; + ctx.fillText( + message, + bannerRect.x + paddingX, + bannerRect.y + bannerRect.height / 2, + bannerRect.width - paddingX * 2, + ); + ctx.restore(); } function clamp(value: number, min: number, max: number): number { diff --git a/extension/src/commands/visual-highlight.ts b/extension/src/commands/visual-highlight.ts index 574e111..eba80fd 100644 --- a/extension/src/commands/visual-highlight.ts +++ b/extension/src/commands/visual-highlight.ts @@ -10,7 +10,7 @@ import type { HighlightOptions, } from '../types'; import { LABEL_FONT_SIZE, LABEL_PADDING } from './label-constants'; -import { getLabelDimensions } from '../utils/label-geometry'; +import { getLabelDimensions, getLabelFont } from '../utils/label-geometry'; /** * Color mapping for different element types (with transparency for label backgrounds) @@ -497,8 +497,7 @@ function drawLabel( const fontSize = Math.round(LABEL_FONT_SIZE * scale); const labelPadding = Math.round(LABEL_PADDING * scale); - // Set font before measuring text - ctx.font = `bold ${fontSize}px Arial`; + ctx.font = getLabelFont(fontSize); // Measure text width let labelX: number; diff --git a/extension/src/types.ts b/extension/src/types.ts index 0685fd4..56526e7 100644 --- a/extension/src/types.ts +++ b/extension/src/types.ts @@ -130,10 +130,8 @@ export interface HighlightElementsCommand extends BaseCommand { export interface ClickElementCommand extends BaseCommand { type: 'click_element'; - /** Element ID from the referenced highlight snapshot response (page-local numeric string) */ + /** Element ID from highlight response (short opaque string) */ element_id: string; - /** Highlight snapshot ID returned by highlight_elements */ - highlight_snapshot_id: number; /** * Target tab ID (optional - auto-resolved from conversation if not provided) * Note: Required in Python models, but optional here as extension auto-resolves it @@ -143,10 +141,8 @@ export interface ClickElementCommand extends BaseCommand { export interface HoverElementCommand extends BaseCommand { type: 'hover_element'; - /** Element ID from the referenced highlight snapshot response (page-local numeric string) */ + /** Element ID from highlight response (short opaque string) */ element_id: string; - /** Highlight snapshot ID returned by highlight_elements */ - highlight_snapshot_id: number; /** * Target tab ID (optional - auto-resolved from conversation if not provided) * Note: Required in Python models, but optional here as extension auto-resolves it @@ -156,12 +152,10 @@ export interface HoverElementCommand extends BaseCommand { export interface ScrollElementCommand extends BaseCommand { type: 'scroll_element'; - /** Element ID from the referenced highlight snapshot response (page-local numeric string). If not provided, scrolls the entire page */ + /** Element ID from highlight response (short opaque string). If not provided, scrolls the entire page */ element_id?: string; - /** Highlight snapshot ID returned by highlight_elements. Required when element_id is provided */ - highlight_snapshot_id?: number; direction?: ScrollDirection; - /** Scroll amount relative to page/element height (0.5 = half page, 1.0 = full page) */ + /** Scroll amount relative to the current scroll target's visible size (0.5 = half target, 1.0 = one full visible span) */ scroll_amount?: number; /** * Target tab ID (optional - auto-resolved from conversation if not provided) @@ -174,10 +168,8 @@ export type SwipeDirection = 'next' | 'prev'; export interface SwipeElementCommand extends BaseCommand { type: 'swipe_element'; - /** Element ID from the referenced highlight snapshot response (page-local numeric string) */ + /** Element ID from highlight response (short opaque string) */ element_id: string; - /** Highlight snapshot ID returned by highlight_elements */ - highlight_snapshot_id: number; direction?: SwipeDirection; /** Number of swipe steps for carousel/swiper interactions */ swipe_count?: number; @@ -190,10 +182,8 @@ export interface SwipeElementCommand extends BaseCommand { export interface KeyboardInputCommand extends BaseCommand { type: 'keyboard_input'; - /** Element ID from the referenced highlight snapshot response (page-local numeric string) */ + /** Element ID from highlight response (short opaque string) */ element_id: string; - /** Highlight snapshot ID returned by highlight_elements */ - highlight_snapshot_id: number; text: string; /** * Target tab ID (optional - auto-resolved from conversation if not provided) @@ -204,10 +194,8 @@ export interface KeyboardInputCommand extends BaseCommand { export interface SelectElementCommand extends BaseCommand { type: 'select_element'; - /** Element ID from the referenced highlight snapshot response (page-local numeric string) */ + /** Element ID from highlight response (short opaque string) */ element_id: string; - /** Highlight snapshot ID returned by highlight_elements */ - highlight_snapshot_id: number; /** Option value(s) to select. Use string for single select, array for multi-select (` dropdown element by its visual ID. - -**Native `` dropdowns, not custom dropdown components built with `
`, `