From 1b9bd2f00ede53f0d2a97944c54bc6ba9cc00409 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Sat, 28 Mar 2026 16:59:21 +0800 Subject: [PATCH 01/20] feat(openbrowser): wire sdk tool image window Use the SDK-level tool image window configuration instead of the local prompt patch and point uv sources at the local agent-sdk worktree for validation. Co-authored-by: openhands --- pyproject.toml | 8 +++- server/agent/context_image_window.py | 39 ++++++++++++++++++ server/agent/manager.py | 5 +++ .../tests/unit/test_agent_manager_process.py | 24 +++++++++++ .../tests/unit/test_context_image_window.py | 29 ++++++++++++++ uv.lock | 40 +++++++++++++++++-- 6 files changed, 139 insertions(+), 6 deletions(-) create mode 100644 server/agent/context_image_window.py create mode 100644 server/tests/unit/test_context_image_window.py diff --git a/pyproject.toml b/pyproject.toml index 0b3e1d0..2f4bdb6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,8 @@ dependencies = [ "pillow>=10.0.0", "numpy>=1.24.0", "requests>=2.31.0", - "openhands-sdk @ git+https://github.com/softpudding/agent-sdk.git@df47da7429a04cc2a5681e701331d85fcb798f1e#subdirectory=openhands-sdk", - "openhands-tools @ git+https://github.com/softpudding/agent-sdk.git@df47da7429a04cc2a5681e701331d85fcb798f1e#subdirectory=openhands-tools", + "openhands-sdk", + "openhands-tools", "litellm @ git+https://github.com/softpudding/litellm.git@bfba5e3889829067baeab3b12d38008360913771", ] @@ -69,3 +69,7 @@ python_version = "3.12" warn_return_any = true warn_unused_configs = true disallow_untyped_defs = true + +[tool.uv.sources] +openhands-sdk = { path = "/Users/yangxiao/git/agent-sdk-context-image-window-sdk/openhands-sdk", editable = true } +openhands-tools = { path = "/Users/yangxiao/git/agent-sdk-context-image-window-sdk/openhands-tools", editable = true } diff --git a/server/agent/context_image_window.py b/server/agent/context_image_window.py new file mode 100644 index 0000000..5cc4442 --- /dev/null +++ b/server/agent/context_image_window.py @@ -0,0 +1,39 @@ +"""Configuration helpers for the live tool-image context window.""" + +from __future__ import annotations + +import os + +from openhands.sdk import get_logger + +logger = get_logger(__name__) + +ENV_CONTEXT_IMAGE_WINDOW = "OPENBROWSER_CONTEXT_IMAGE_WINDOW" +DEFAULT_CONTEXT_IMAGE_WINDOW = 1 + + +def get_context_image_window() -> int | None: + """Return the tool-image window passed to the SDK Agent. + + The default is to keep only the latest screenshot-bearing tool message. + Environment variable semantics: + - `-1`: disable SDK filtering entirely (`None`) + - `0`: keep no screenshot-bearing tool messages + - `N >= 1`: keep the latest N screenshot-bearing tool messages + """ + + raw_value = os.getenv(ENV_CONTEXT_IMAGE_WINDOW) + if raw_value is None or raw_value.strip() == "": + return DEFAULT_CONTEXT_IMAGE_WINDOW + + try: + parsed_value = int(raw_value) + except ValueError: + logger.warning( + "Invalid %s=%r; falling back to %s", + ENV_CONTEXT_IMAGE_WINDOW, + raw_value, + DEFAULT_CONTEXT_IMAGE_WINDOW, + ) + return DEFAULT_CONTEXT_IMAGE_WINDOW + return parsed_value if parsed_value >= 0 else None diff --git a/server/agent/manager.py b/server/agent/manager.py index eff2b1d..6a2bf0c 100644 --- a/server/agent/manager.py +++ b/server/agent/manager.py @@ -30,6 +30,7 @@ from server.api.sse import SSEEvent from server.agent.visualizer import QueueVisualizer from server.agent.conversation import ConversationState +from server.agent.context_image_window import get_context_image_window from server.agent.user_help import PLEASE_HELP_ME_TOOL_NAME import server.agent.tools.help_tool # noqa: F401 from server.agent.tools.browser_executor import remove_browser_executor @@ -287,6 +288,7 @@ def _create_conversation_in_process( agent_context = self._build_agent_context() llm_instance = self._create_llm_from_config(model, base_url, model_alias) tools = self._get_tools_for_model(model, model_alias) + tool_image_window = get_context_image_window() agent = Agent( llm=llm_instance, tools=tools, @@ -297,6 +299,7 @@ def _create_conversation_in_process( system_prompt_kwargs=self._get_system_prompt_kwargs( model=model, model_alias=model_alias ), + tool_image_window=tool_image_window, ) # Create visualizer (queue will be set when processing messages) @@ -513,6 +516,7 @@ def get_or_create_conversation( agent_context = self._build_agent_context() llm_instance = self._create_llm_from_config(model, base_url, model_alias) tools = self._get_tools_for_model(model, model_alias) + tool_image_window = get_context_image_window() agent = Agent( llm=llm_instance, tools=tools, @@ -520,6 +524,7 @@ def get_or_create_conversation( system_prompt_kwargs=self._get_system_prompt_kwargs( model=model, model_alias=model_alias ), + tool_image_window=tool_image_window, ) # Create visualizer (queue will be set when processing messages) diff --git a/server/tests/unit/test_agent_manager_process.py b/server/tests/unit/test_agent_manager_process.py index d031e78..b7eeb3c 100644 --- a/server/tests/unit/test_agent_manager_process.py +++ b/server/tests/unit/test_agent_manager_process.py @@ -173,6 +173,30 @@ def test_system_prompt_kwargs_follow_small_model_profile(self) -> None: "small_model": True, } + def test_single_process_agent_receives_tool_image_window(self) -> None: + """Single-process conversations should pass tool_image_window to Agent.""" + manager = OpenBrowserAgentManager() + + with ( + patch("server.agent.manager.Agent") as mock_agent, + patch("server.agent.manager.Conversation"), + patch("server.agent.manager.QueueVisualizer"), + patch("server.agent.manager.get_context_image_window", return_value=2), + patch.object(manager, "_build_agent_context", return_value=MagicMock()), + patch.object(manager, "_create_llm_from_config", return_value=MagicMock()), + patch.object(manager, "_get_tools_for_model", return_value=[]), + patch.object( + manager, + "_get_system_prompt_kwargs", + return_value={"model_profile": "large", "small_model": False}, + ), + patch("server.agent.manager.get_default_condenser", return_value=None), + ): + manager._create_conversation_in_process(str(uuid.uuid4()), cwd="/tmp/demo") + + assert mock_agent.call_args is not None + assert mock_agent.call_args.kwargs["tool_image_window"] == 2 + class TestConversationCreationMultiProcess: """Tests for conversation creation in multi-process mode.""" diff --git a/server/tests/unit/test_context_image_window.py b/server/tests/unit/test_context_image_window.py new file mode 100644 index 0000000..7f0358b --- /dev/null +++ b/server/tests/unit/test_context_image_window.py @@ -0,0 +1,29 @@ +"""Tests for the live tool-image window configuration helpers.""" + +from server.agent.context_image_window import ( + DEFAULT_CONTEXT_IMAGE_WINDOW, + ENV_CONTEXT_IMAGE_WINDOW, + get_context_image_window, +) + + +class TestContextImageWindowConfig: + def test_default_value_keeps_latest_image(self, monkeypatch) -> None: + monkeypatch.delenv(ENV_CONTEXT_IMAGE_WINDOW, raising=False) + + assert get_context_image_window() == DEFAULT_CONTEXT_IMAGE_WINDOW + + def test_invalid_env_value_falls_back_to_default(self, monkeypatch) -> None: + monkeypatch.setenv(ENV_CONTEXT_IMAGE_WINDOW, "invalid") + + assert get_context_image_window() == DEFAULT_CONTEXT_IMAGE_WINDOW + + def test_negative_value_disables_sdk_filtering(self, monkeypatch) -> None: + monkeypatch.setenv(ENV_CONTEXT_IMAGE_WINDOW, "-1") + + assert get_context_image_window() is None + + def test_zero_value_keeps_no_tool_images(self, monkeypatch) -> None: + monkeypatch.setenv(ENV_CONTEXT_IMAGE_WINDOW, "0") + + assert get_context_image_window() == 0 diff --git a/uv.lock b/uv.lock index 9b9545d..c2de07e 100644 --- a/uv.lock +++ b/uv.lock @@ -1675,8 +1675,8 @@ requires-dist = [ { name = "litellm", git = "https://github.com/softpudding/litellm.git?rev=bfba5e3889829067baeab3b12d38008360913771" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.7.0" }, { name = "numpy", specifier = ">=1.24.0" }, - { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=df47da7429a04cc2a5681e701331d85fcb798f1e" }, - { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=df47da7429a04cc2a5681e701331d85fcb798f1e" }, + { name = "openhands-sdk", editable = "../agent-sdk-context-image-window-sdk/openhands-sdk" }, + { name = "openhands-tools", editable = "../agent-sdk-context-image-window-sdk/openhands-tools" }, { name = "pillow", specifier = ">=10.0.0" }, { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.0" }, { name = "pydantic", specifier = ">=2.5.0" }, @@ -2221,7 +2221,7 @@ wheels = [ [[package]] name = "openhands-sdk" version = "1.12.0" -source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=df47da7429a04cc2a5681e701331d85fcb798f1e#df47da7429a04cc2a5681e701331d85fcb798f1e" } +source = { editable = "../agent-sdk-context-image-window-sdk/openhands-sdk" } dependencies = [ { name = "agent-client-protocol" }, { name = "deprecation" }, @@ -2238,10 +2238,29 @@ dependencies = [ { name = "websockets" }, ] +[package.metadata] +requires-dist = [ + { name = "agent-client-protocol", specifier = ">=0.8.1" }, + { name = "boto3", marker = "extra == 'boto3'", specifier = ">=1.35.0" }, + { name = "deprecation", specifier = ">=2.1.0" }, + { name = "fakeredis", extras = ["lua"], specifier = ">=2.32.1" }, + { name = "fastmcp", specifier = ">=3.0.0" }, + { name = "filelock", specifier = ">=3.20.1" }, + { name = "httpx", specifier = ">=0.27.0" }, + { name = "litellm", git = "https://github.com/softpudding/litellm.git?rev=bfba5e3889829067baeab3b12d38008360913771" }, + { name = "lmnr", specifier = ">=0.7.24" }, + { name = "pydantic", specifier = ">=2.12.5" }, + { name = "python-frontmatter", specifier = ">=1.1.0" }, + { name = "python-json-logger", specifier = ">=3.3.0" }, + { name = "tenacity", specifier = ">=9.1.2" }, + { name = "websockets", specifier = ">=12" }, +] +provides-extras = ["boto3"] + [[package]] name = "openhands-tools" version = "1.12.0" -source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=df47da7429a04cc2a5681e701331d85fcb798f1e#df47da7429a04cc2a5681e701331d85fcb798f1e" } +source = { editable = "../agent-sdk-context-image-window-sdk/openhands-tools" } dependencies = [ { name = "bashlex" }, { name = "binaryornot" }, @@ -2254,6 +2273,19 @@ dependencies = [ { name = "tom-swe" }, ] +[package.metadata] +requires-dist = [ + { name = "bashlex", specifier = ">=0.18" }, + { name = "binaryornot", specifier = ">=0.4.4" }, + { name = "browser-use", specifier = ">=0.8.0" }, + { name = "cachetools" }, + { name = "func-timeout", specifier = ">=4.3.5" }, + { name = "libtmux", specifier = ">=0.53.0" }, + { name = "openhands-sdk", editable = "../agent-sdk-context-image-window-sdk/openhands-sdk" }, + { name = "pydantic", specifier = ">=2.11.7" }, + { name = "tom-swe", specifier = ">=1.0.3" }, +] + [[package]] name = "opentelemetry-api" version = "1.40.0" From e6add55d0410790f9630b0655795cd4a1670e4b3 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Sat, 28 Mar 2026 22:07:54 +0800 Subject: [PATCH 02/20] feat(openbrowser): prefer token-driven condensation Co-authored-by: openhands --- server/agent/browser_condenser.py | 62 +++++++++++++++++++ server/agent/manager.py | 16 ++++- .../tests/unit/test_agent_manager_process.py | 38 ++++++++++++ server/tests/unit/test_browser_condenser.py | 51 +++++++++++++++ 4 files changed, 165 insertions(+), 2 deletions(-) create mode 100644 server/agent/browser_condenser.py create mode 100644 server/tests/unit/test_browser_condenser.py diff --git a/server/agent/browser_condenser.py b/server/agent/browser_condenser.py new file mode 100644 index 0000000..298d1d2 --- /dev/null +++ b/server/agent/browser_condenser.py @@ -0,0 +1,62 @@ +"""OpenBrowser-specific condenser tuning.""" + +from __future__ import annotations + +from openhands.sdk import LLM, get_logger +from openhands.sdk.context.condenser import LLMSummarizingCondenser +from openhands.sdk.context.condenser.base import CondenserBase + +logger = get_logger(__name__) + +DEFAULT_BROWSER_CONDENSER_MAX_SIZE = 1000 +DEFAULT_BROWSER_CONDENSER_TOKEN_RATIO = 0.7 + + +def derive_browser_condenser_max_tokens(llm: LLM) -> int | None: + """Derive a token threshold for browser-heavy conversations.""" + + max_input_tokens = llm.max_input_tokens + if not max_input_tokens or max_input_tokens <= 0: + return None + + max_tokens = int(max_input_tokens * DEFAULT_BROWSER_CONDENSER_TOKEN_RATIO) + return max_tokens if max_tokens > 0 else None + + +def configure_browser_condenser( + condenser: CondenserBase | None, + llm: LLM, +) -> CondenserBase | None: + """Prefer token-driven condensation for browser workflows. + + Browser conversations generate many small action/observation events. Keep the + upstream preset, but raise the event-count guardrail and derive a token limit + from the model context window so token usage becomes the primary trigger. + """ + + if condenser is None: + return None + + if not isinstance(condenser, LLMSummarizingCondenser): + return condenser + + updates: dict[str, int] = {} + + if condenser.max_size < DEFAULT_BROWSER_CONDENSER_MAX_SIZE: + updates["max_size"] = DEFAULT_BROWSER_CONDENSER_MAX_SIZE + + if condenser.max_tokens is None: + max_tokens = derive_browser_condenser_max_tokens(llm) + if max_tokens is not None: + updates["max_tokens"] = max_tokens + + if not updates: + return condenser + + configured = condenser.model_copy(update=updates) + logger.info( + "Configured browser condenser with max_size=%s and max_tokens=%s", + configured.max_size, + configured.max_tokens, + ) + return configured diff --git a/server/agent/manager.py b/server/agent/manager.py index 6a2bf0c..1048cb0 100644 --- a/server/agent/manager.py +++ b/server/agent/manager.py @@ -28,6 +28,7 @@ from openhands.sdk.tool import Tool from server.api.sse import SSEEvent +from server.agent.browser_condenser import configure_browser_condenser from server.agent.visualizer import QueueVisualizer from server.agent.conversation import ConversationState from server.agent.context_image_window import get_context_image_window @@ -289,11 +290,15 @@ def _create_conversation_in_process( llm_instance = self._create_llm_from_config(model, base_url, model_alias) tools = self._get_tools_for_model(model, model_alias) tool_image_window = get_context_image_window() + condenser_llm = llm_instance.model_copy(update={"usage_id": "condenser"}) agent = Agent( llm=llm_instance, tools=tools, - condenser=get_default_condenser( - llm=llm_instance.model_copy(update={"usage_id": "condenser"}) + condenser=configure_browser_condenser( + get_default_condenser( + llm=condenser_llm, + ), + llm_instance, ), agent_context=agent_context, system_prompt_kwargs=self._get_system_prompt_kwargs( @@ -517,9 +522,16 @@ def get_or_create_conversation( llm_instance = self._create_llm_from_config(model, base_url, model_alias) tools = self._get_tools_for_model(model, model_alias) tool_image_window = get_context_image_window() + condenser_llm = llm_instance.model_copy(update={"usage_id": "condenser"}) agent = Agent( llm=llm_instance, tools=tools, + condenser=configure_browser_condenser( + get_default_condenser( + llm=condenser_llm, + ), + llm_instance, + ), agent_context=agent_context, system_prompt_kwargs=self._get_system_prompt_kwargs( model=model, model_alias=model_alias diff --git a/server/tests/unit/test_agent_manager_process.py b/server/tests/unit/test_agent_manager_process.py index b7eeb3c..d677dea 100644 --- a/server/tests/unit/test_agent_manager_process.py +++ b/server/tests/unit/test_agent_manager_process.py @@ -6,6 +6,8 @@ from unittest.mock import MagicMock, patch, PropertyMock import pytest +from openhands.sdk import LLM +from openhands.sdk.context.condenser import LLMSummarizingCondenser # Mock openhands.tools imports used by server.agent.manager in test environments @@ -197,6 +199,42 @@ def test_single_process_agent_receives_tool_image_window(self) -> None: assert mock_agent.call_args is not None assert mock_agent.call_args.kwargs["tool_image_window"] == 2 + def test_single_process_agent_receives_browser_tuned_condenser(self) -> None: + """Single-process conversations should tune condenser for browser workflows.""" + manager = OpenBrowserAgentManager() + llm = LLM.model_construct(model="test-model", max_input_tokens=100_000) + default_condenser = LLMSummarizingCondenser( + llm=llm.model_copy(update={"usage_id": "condenser"}), + max_size=80, + keep_first=4, + ) + + with ( + patch("server.agent.manager.Agent") as mock_agent, + patch("server.agent.manager.Conversation"), + patch("server.agent.manager.QueueVisualizer"), + patch("server.agent.manager.get_context_image_window", return_value=1), + patch.object(manager, "_build_agent_context", return_value=MagicMock()), + patch.object(manager, "_create_llm_from_config", return_value=llm), + patch.object(manager, "_get_tools_for_model", return_value=[]), + patch.object( + manager, + "_get_system_prompt_kwargs", + return_value={"model_profile": "large", "small_model": False}, + ), + patch( + "server.agent.manager.get_default_condenser", + return_value=default_condenser, + ), + ): + manager._create_conversation_in_process(str(uuid.uuid4()), cwd="/tmp/demo") + + assert mock_agent.call_args is not None + condenser = mock_agent.call_args.kwargs["condenser"] + assert isinstance(condenser, LLMSummarizingCondenser) + assert condenser.max_size == 1000 + assert condenser.max_tokens == 70_000 + class TestConversationCreationMultiProcess: """Tests for conversation creation in multi-process mode.""" diff --git a/server/tests/unit/test_browser_condenser.py b/server/tests/unit/test_browser_condenser.py new file mode 100644 index 0000000..e097832 --- /dev/null +++ b/server/tests/unit/test_browser_condenser.py @@ -0,0 +1,51 @@ +"""Tests for OpenBrowser-specific condenser tuning.""" + +from openhands.sdk import LLM +from openhands.sdk.context.condenser import LLMSummarizingCondenser + +from server.agent.browser_condenser import ( + DEFAULT_BROWSER_CONDENSER_MAX_SIZE, + configure_browser_condenser, + derive_browser_condenser_max_tokens, +) + + +def test_derive_browser_condenser_max_tokens_from_context_window() -> None: + llm = LLM.model_construct(model="test-model", max_input_tokens=100_000) + + assert derive_browser_condenser_max_tokens(llm) == 70_000 + + +def test_derive_browser_condenser_max_tokens_returns_none_without_context_window() -> None: + llm = LLM.model_construct(model="test-model", max_input_tokens=None) + + assert derive_browser_condenser_max_tokens(llm) is None + + +def test_configure_browser_condenser_prefers_token_limit() -> None: + llm = LLM.model_construct(model="test-model", max_input_tokens=100_000) + condenser = LLMSummarizingCondenser(llm=llm, max_size=80, keep_first=4) + + configured = configure_browser_condenser(condenser, llm) + + assert isinstance(configured, LLMSummarizingCondenser) + assert configured.max_size == DEFAULT_BROWSER_CONDENSER_MAX_SIZE + assert configured.max_tokens == 70_000 + assert condenser.max_size == 80 + assert condenser.max_tokens is None + + +def test_configure_browser_condenser_preserves_explicit_token_limit() -> None: + llm = LLM.model_construct(model="test-model", max_input_tokens=100_000) + condenser = LLMSummarizingCondenser( + llm=llm, + max_size=80, + max_tokens=55_000, + keep_first=4, + ) + + configured = configure_browser_condenser(condenser, llm) + + assert isinstance(configured, LLMSummarizingCondenser) + assert configured.max_size == DEFAULT_BROWSER_CONDENSER_MAX_SIZE + assert configured.max_tokens == 55_000 From b31372809bb5d5e449ff40874e47fd80f3416c1a Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Sun, 29 Mar 2026 01:10:01 +0800 Subject: [PATCH 03/20] feat(openbrowser): harden visual element identification --- .../__tests__/highlight-integration.test.ts | 18 +- extension/src/background/index.ts | 105 ++-- .../src/commands/__tests__/element-id.test.ts | 137 ++++- .../__tests__/single-highlight.test.ts | 40 +- extension/src/commands/element-actions.ts | 232 ++++---- extension/src/commands/element-cache.ts | 502 +++++++----------- extension/src/commands/element-id.ts | 169 +++++- extension/src/commands/single-highlight.ts | 179 ++++++- extension/src/types.ts | 32 +- extension/src/utils/collision-detection.ts | 12 +- server/AGENTS.md | 2 +- .../big_model/element_interaction_tool.j2 | 61 ++- .../agent/prompts/big_model/highlight_tool.j2 | 18 +- .../small_model/element_interaction_tool.j2 | 28 +- .../prompts/small_model/highlight_tool.j2 | 11 +- server/agent/tools/base.py | 49 +- server/agent/tools/browser_executor.py | 334 ++++++++---- .../agent/tools/element_interaction_tool.py | 6 +- server/agent/tools/highlight_tool.py | 2 +- server/agent/tools/state.py | 9 +- server/models/commands.py | 35 +- .../integration/test_element_operations.py | 49 +- .../integration/test_toolset_integration.py | 5 - .../tests/unit/test_agent_browser_executor.py | 119 +++-- server/tests/unit/test_base_classes.py | 51 +- server/tests/unit/test_command_models.py | 20 +- server/tests/unit/test_prompt_contracts.py | 14 +- server/tests/unit/test_screenshot_behavior.py | 5 +- server/tests/unit/test_state.py | 21 - .../tests/unit/test_tool_prompt_profiles.py | 4 + 30 files changed, 1339 insertions(+), 930 deletions(-) diff --git a/extension/src/__tests__/highlight-integration.test.ts b/extension/src/__tests__/highlight-integration.test.ts index 2dfdb57..e83722c 100644 --- a/extension/src/__tests__/highlight-integration.test.ts +++ b/extension/src/__tests__/highlight-integration.test.ts @@ -109,8 +109,8 @@ describe('Highlight Integration', () => { // Run selectCollisionFreePage const page1 = selectCollisionFreePage(elements, 1); - expect(page1.map((element) => element.id)).toEqual( - page1.map((_, index) => String(index + 1)), + expect(new Set(page1.map((element) => element.id)).size).toBe( + page1.length, ); // Verify no label collisions on the same page @@ -161,15 +161,19 @@ describe('Highlight Integration', () => { const positions = new Set(page1.map((e) => e.labelPosition)); expect(positions.size).toBe(page1.length); - // Verify elements on different pages while numeric ids reset per page. + // Verify elements on different pages while preserving each element's ID. const page1Selectors = new Set(page1.map((e) => e.selector)); + const expectedIdsBySelector = Object.fromEntries( + elements.map((element) => [element.selector, element.id]), + ); const page2 = selectCollisionFreePage(elements, 2); expect(page2.length).toBeGreaterThan(0); - expect(page2.map((element) => element.id)).toEqual( - page2.map((_, index) => String(index + 1)), - ); for (const elem of page2) { expect(page1Selectors.has(elem.selector)).toBe(false); + expect(expectedIdsBySelector[elem.selector]).toBe(elem.id); + } + for (const elem of page1) { + expect(expectedIdsBySelector[elem.selector]).toBe(elem.id); } }); @@ -331,7 +335,7 @@ describe('Highlight Integration', () => { const page1 = selectCollisionFreePage(elements, 1, 1728, 891); - expect(page1.map((e) => e.id)).toEqual(['1', '2', '3']); + expect(page1.map((e) => e.id)).toEqual(['modal', 'like', 'reply']); expect(page1[0].labelPosition).toBeDefined(); expect(page1[1].labelPosition).toBeDefined(); expect(page1[2].labelPosition).toBeDefined(); diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts index 992a006..183d721 100644 --- a/extension/src/background/index.ts +++ b/extension/src/background/index.ts @@ -23,7 +23,10 @@ import { clearScreenshotCache } from '../commands/computer'; import { drawHighlights } from '../commands/visual-highlight'; import { highlightSingleElement } from '../commands/single-highlight'; import { elementCache } from '../commands/element-cache'; -import { assignSequentialElementIds } from '../commands/element-id'; +import { + assignHashedElementIds, + normalizeVisualElementIdInput, +} from '../commands/element-id'; import { buildHighlightDetectionScript, filterHighlightElementsByKeywords, @@ -127,11 +130,7 @@ function buildStoredHighlightPages(options: { } = options; if (keywordMode) { - return [ - assignSequentialElementIds( - sortElementsByVisualOrder(assignSequentialElementIds(filteredElements)), - ), - ]; + return [sortElementsByVisualOrder(filteredElements)]; } const pages: InteractiveElement[][] = []; @@ -142,9 +141,7 @@ function buildStoredHighlightPages(options: { viewportWidth, viewportHeight, ); - pages.push( - assignSequentialElementIds(sortElementsByVisualOrder(pageElements)), - ); + pages.push(sortElementsByVisualOrder(pageElements)); } return pages; @@ -1631,7 +1628,9 @@ async function handleCommand(command: Command): Promise { keywords, ); const keywordList = keywordFiltering.keywords; - const filteredElements = keywordFiltering.elements; + const filteredElements = assignHashedElementIds( + keywordFiltering.elements, + ); if (keywordList.length > 0) { console.log( @@ -1648,9 +1647,7 @@ async function handleCommand(command: Command): Promise { if (keywordList.length > 0) { // Keyword mode: return all matching elements, no pagination. - // Assign temporary numeric IDs so the consistency check can - // correlate samples before the final display-order renumbering. - paginatedElements = assignSequentialElementIds(filteredElements); + paginatedElements = filteredElements; totalPages = 1; currentPage = 1; console.log( @@ -1778,9 +1775,9 @@ async function handleCommand(command: Command): Promise { } // Preserve the original highlight pipeline order for detection, - // pagination, and consistency checks. Only sort and renumber at the - // rendering boundary so the screenshot/response stay intuitive - // without changing the stability gate. + // pagination, and consistency checks. Only sort at the rendering + // boundary so the screenshot/response stay intuitive without + // changing the stability gate or element IDs. const storedPages = buildStoredHighlightPages({ filteredElements, totalPages, @@ -1791,18 +1788,19 @@ async function handleCommand(command: Command): Promise { const displayOrderedElements = storedPages[currentPage - 1] ?? []; const cacheStoreStart = Date.now(); - const storedSnapshot = elementCache.storeSnapshot({ + const storedPage = elementCache.storeHighlightResult({ conversationId, tabId: activeTabId, documentId: detectedDocumentId, elementType, keywords: keywordList, totalElements: filteredElements.length, + totalPages: totalPages, pages: storedPages, page: currentPage, }); console.log( - `⏱️ [HighlightTrace] background cache-store ${Date.now() - cacheStoreStart}ms (snapshot=${storedSnapshot.snapshotId}, count=${displayOrderedElements.length})`, + `⏱️ [HighlightTrace] background cache-store ${Date.now() - cacheStoreStart}ms (page=${storedPage.page}, count=${displayOrderedElements.length})`, ); // Log first few element bboxes for debugging @@ -1817,7 +1815,7 @@ async function handleCommand(command: Command): Promise { const drawHighlightsStart = Date.now(); const highlightedScreenshot = await drawHighlights( screenshotResult.imageData, - storedSnapshot.elements, + storedPage.elements, { scale: imageScale, viewportWidth, @@ -1825,7 +1823,7 @@ async function handleCommand(command: Command): Promise { }, ); console.log( - `⏱️ [HighlightTrace] background draw-highlights ${Date.now() - drawHighlightsStart}ms (elements=${storedSnapshot.elements.length})`, + `⏱️ [HighlightTrace] background draw-highlights ${Date.now() - drawHighlightsStart}ms (elements=${storedPage.elements.length})`, ); const compressStart = Date.now(); @@ -1843,8 +1841,7 @@ async function handleCommand(command: Command): Promise { return { success: true, data: { - highlight_snapshot_id: storedSnapshot.snapshotId, - elements: storedSnapshot.elements, + elements: storedPage.elements, totalElements: filteredElements.length, totalPages: totalPages, page: currentPage, @@ -1883,7 +1880,6 @@ async function handleCommand(command: Command): Promise { const clickResult = await performElementClick( command.conversation_id, - command.highlight_snapshot_id, command.element_id, clickTabId, ); @@ -1954,7 +1950,6 @@ async function handleCommand(command: Command): Promise { const hoverResult = await performElementHover( command.conversation_id, - command.highlight_snapshot_id, command.element_id, hoverTabId, ); @@ -2003,7 +1998,6 @@ async function handleCommand(command: Command): Promise { // element_id is optional - if not provided, scrolls the entire page const scrollResult = await performElementScroll( command.conversation_id, - command.highlight_snapshot_id, command.element_id, command.direction || 'down', scrollTabId, @@ -2053,7 +2047,6 @@ async function handleCommand(command: Command): Promise { const swipeResult = await performElementSwipe( command.conversation_id, - command.highlight_snapshot_id, command.element_id, command.direction || 'next', swipeTabId, @@ -2104,7 +2097,6 @@ async function handleCommand(command: Command): Promise { const inputResult = await performKeyboardInput( command.conversation_id, - command.highlight_snapshot_id, command.element_id, command.text, inputTabId, @@ -2153,7 +2145,6 @@ async function handleCommand(command: Command): Promise { const selectResult = await performElementSelect( command.conversation_id, - command.highlight_snapshot_id, command.element_id, selectTabId, command.value, @@ -2198,16 +2189,10 @@ async function handleCommand(command: Command): Promise { throw new Error('conversation_id required for get_element_html'); const conversationId = command.conversation_id; const elementId = command.element_id; - const highlightSnapshotId = command.highlight_snapshot_id; if (!elementId) { throw new Error('element_id is required for get_element_html'); } - if (highlightSnapshotId === undefined || highlightSnapshotId === null) { - throw new Error( - 'highlight_snapshot_id is required for get_element_html', - ); - } // Get current active tab for this conversation const activeTabId = tabManager.getCurrentActiveTabId(conversationId); @@ -2221,20 +2206,22 @@ async function handleCommand(command: Command): Promise { const element = elementCache.getElementById( conversationId, activeTabId, - highlightSnapshotId, elementId, ); if (!element) { + const normalizedElementId = normalizeVisualElementIdInput(elementId); console.warn( - `⚠️ [GetElementHtml] Element ${elementId} not found in cache for conversation ${conversationId}, tab ${activeTabId}, snapshot ${highlightSnapshotId}`, + `⚠️ [GetElementHtml] Element ${elementId} not found in cache for conversation ${conversationId}, tab ${activeTabId}`, ); return { success: false, - error: `Element ${elementId} not found in cache for highlight snapshot ${highlightSnapshotId}. The snapshot may have expired or the page may have changed. Try highlight_elements again.`, + error: + normalizedElementId !== elementId + ? `Element ${elementId} was interpreted as ${normalizedElementId} for visual-safe ID matching, but no cached element matched. The highlight cache may have expired or the page may have changed. Try highlight_elements again.` + : `Element ${elementId} not found in cache. The highlight cache may have expired or the page may have changed. Try highlight_elements again.`, data: { element_id: elementId, - highlight_snapshot_id: highlightSnapshotId, html: null, }, timestamp: Date.now(), @@ -2249,10 +2236,15 @@ async function handleCommand(command: Command): Promise { return { success: true, - message: `Retrieved HTML for element ${elementId}`, + message: + element.elementIdCorrected && element.resolvedElementId !== elementId + ? `Retrieved HTML for element ${element.resolvedElementId} (matched from requested ${elementId})` + : `Retrieved HTML for element ${element.resolvedElementId}`, data: { - element_id: elementId, - highlight_snapshot_id: highlightSnapshotId, + element_id: element.resolvedElementId, + requested_element_id: elementId, + resolved_element_id: element.resolvedElementId, + element_id_corrected: element.elementIdCorrected, html: html, tagName: element.element.tagName, type: element.element.type, @@ -2269,27 +2261,26 @@ async function handleCommand(command: Command): Promise { } const conversationId = command.conversation_id; const activeTabId = tabManager.getCurrentActiveTabId(conversationId); - const highlightSnapshotId = command.highlight_snapshot_id; if (!activeTabId) { throw new Error(`No active tab for conversation ${conversationId}`); } - if (highlightSnapshotId === undefined || highlightSnapshotId === null) { - throw new Error( - 'highlight_snapshot_id is required for highlight_single_element command', - ); - } // Get element from cache const element = elementCache.getElementById( conversationId, activeTabId, - highlightSnapshotId, command.element_id, ); if (!element) { + const normalizedElementId = normalizeVisualElementIdInput( + command.element_id, + ); return { success: false, - error: `Element ${command.element_id} not found in cache for highlight snapshot ${highlightSnapshotId}. Call highlight_elements() again.`, + error: + normalizedElementId !== command.element_id + ? `Element ${command.element_id} was interpreted as ${normalizedElementId} for visual-safe ID matching, but no cached element matched. Call highlight_elements() again.` + : `Element ${command.element_id} not found in cache. Call highlight_elements() again.`, timestamp: Date.now(), }; } @@ -2424,7 +2415,7 @@ async function handleCommand(command: Command): Promise { ok: false, stale: true, error: - "Highlight snapshot ${highlightSnapshotId} is stale because the document changed. Call highlight_elements() again." + "The highlighted element is stale because the document changed. Call highlight_elements() again." }; } if (!el) { @@ -2432,7 +2423,7 @@ async function handleCommand(command: Command): Promise { ok: false, stale: true, error: - "Element not found in DOM for this highlight snapshot. Call highlight_elements() again." + "Element not found in DOM for the cached highlight result. Call highlight_elements() again." }; } const currentFingerprint = getElementFingerprint(el); @@ -2441,7 +2432,7 @@ async function handleCommand(command: Command): Promise { ok: false, stale: true, error: - "Highlight snapshot is stale because the target element identity changed. Call highlight_elements() again." + "The cached highlight result is stale because the target element identity changed. Call highlight_elements() again." }; } const rect = el.getBoundingClientRect(); @@ -2488,7 +2479,7 @@ async function handleCommand(command: Command): Promise { success: false, error: bboxResult.result.value.error || - `Element ${command.element_id} is stale for highlight snapshot ${highlightSnapshotId}. Call highlight_elements() again.`, + `Element ${command.element_id} is stale. Call highlight_elements() again.`, timestamp: Date.now(), }; } else { @@ -2554,7 +2545,6 @@ async function handleCommand(command: Command): Promise { `Element ${element.element.id} is not visible in the current viewport. ${scrollHint}`.trim(), data: { elementId: element.element.id, - highlight_snapshot_id: highlightSnapshotId, bbox: freshBbox, viewportWidth, viewportHeight, @@ -2574,6 +2564,7 @@ async function handleCommand(command: Command): Promise { screenshotResult.imageData, elementWithFreshBbox, { + intendedAction: command.intended_action, scale: screenshotResult.metadata?.imageScale || screenshotResult.metadata?.devicePixelRatio || @@ -2591,8 +2582,10 @@ async function handleCommand(command: Command): Promise { highlightedScreenshot, getCompressionThreshold(), ), - elementId: command.element_id, - highlight_snapshot_id: highlightSnapshotId, + elementId: element.resolvedElementId, + requestedElementId: command.element_id, + resolvedElementId: element.resolvedElementId, + elementIdCorrected: element.elementIdCorrected, ...(screenshotResult?.dialog_auto_accepted ? { dialog_auto_accepted: screenshotResult.dialog_auto_accepted } : {}), diff --git a/extension/src/commands/__tests__/element-id.test.ts b/extension/src/commands/__tests__/element-id.test.ts index 12ad0cb..9f34c61 100644 --- a/extension/src/commands/__tests__/element-id.test.ts +++ b/extension/src/commands/__tests__/element-id.test.ts @@ -2,7 +2,20 @@ import { describe, expect, test } from 'bun:test'; import type { InteractiveElement } from '../../types'; import { elementCache } from '../element-cache'; -import { assignSequentialElementIds } from '../element-id'; +import { + ELEMENT_ID_CHARSET, + ELEMENT_ID_LENGTH, + assignHashedElementIds, + generateShortHash, + normalizeVisualElementIdInput, +} from '../element-id'; + +function usesAllowedElementIdChars(id: string): boolean { + return ( + id.length === ELEMENT_ID_LENGTH && + [...id].every((char) => ELEMENT_ID_CHARSET.includes(char)) + ); +} function createElement(id: string, selector: string): InteractiveElement { return { @@ -10,6 +23,7 @@ function createElement(id: string, selector: string): InteractiveElement { type: 'clickable', tagName: 'button', selector, + html: ``, bbox: { x: 0, y: 0, width: 10, height: 10 }, isVisible: true, isInViewport: true, @@ -17,59 +31,138 @@ function createElement(id: string, selector: string): InteractiveElement { } describe('element-id', () => { - test('assigns page-local numeric ids in order', () => { - const result = assignSequentialElementIds([ - createElement('old-a', '#a'), - createElement('old-b', '#b'), - createElement('old-c', '#c'), + test('generates fixed-length visual-safe hashes', () => { + const hash = generateShortHash('#checkout', ''); + + expect(hash).toHaveLength(ELEMENT_ID_LENGTH); + expect(usesAllowedElementIdChars(hash)).toBe(true); + }); + + test('assigns stable hash ids based on selector and html', () => { + const result = assignHashedElementIds([ + createElement('old-a', '#checkout'), + createElement('old-b', '#email'), + createElement('old-c', '#submit'), ]); - expect(result.map((element) => element.id)).toEqual(['1', '2', '3']); - expect(result.map((element) => element.selector)).toEqual([ - '#a', - '#b', - '#c', + expect(result.every((element) => usesAllowedElementIdChars(element.id))).toBe( + true, + ); + expect(new Set(result.map((element) => element.id)).size).toBe( + result.length, + ); + + const secondPass = assignHashedElementIds([ + createElement('different-a', '#checkout'), + createElement('different-b', '#email'), + createElement('different-c', '#submit'), ]); + + expect(secondPass.map((element) => element.id)).toEqual( + result.map((element) => element.id), + ); + }); + + test('keeps ids stable regardless of input array order', () => { + const original = [ + createElement('first', '#alpha'), + createElement('second', '#beta'), + createElement('third', '#gamma'), + ]; + const reversed = [...original].reverse(); + + const originalAssigned = assignHashedElementIds(original); + const reversedAssigned = assignHashedElementIds(reversed); + + expect( + Object.fromEntries( + originalAssigned.map((element) => [element.selector, element.id]), + ), + ).toEqual( + Object.fromEntries( + reversedAssigned.map((element) => [element.selector, element.id]), + ), + ); }); test('does not mutate the caller-owned element objects', () => { const original = [createElement('keep-me', '#a')]; - const result = assignSequentialElementIds(original); + const result = assignHashedElementIds(original); expect(original[0].id).toBe('keep-me'); - expect(result[0].id).toBe('1'); + expect(usesAllowedElementIdChars(result[0].id)).toBe(true); expect(result[0]).not.toBe(original[0]); }); + + test('normalizes visually ambiguous 3-character element IDs only', () => { + expect(normalizeVisualElementIdInput('D02')).toBe('DO2'); + expect(normalizeVisualElementIdInput(' d o 2 ')).toBe('DO2'); + expect(normalizeVisualElementIdInput('id-10')).toBe('id-10'); + }); }); -describe('element-cache highlight snapshots', () => { - test('stores a page-scoped snapshot and resolves element IDs within that snapshot', () => { +describe('element-cache document cache', () => { + test('stores highlight pages and resolves element IDs from the current document cache', () => { elementCache.clearAll(); + const assignedPages = [ + assignHashedElementIds([createElement('', '#page-1')]), + assignHashedElementIds([createElement('', '#page-2')]), + ]; - const snapshot = elementCache.storeSnapshot({ + const storedPage = elementCache.storeHighlightResult({ conversationId: 'conv-1', tabId: 101, documentId: 'doc-1', elementType: 'any', totalElements: 2, - pages: [[createElement('1', '#page-1')], [createElement('1', '#page-2')]], + totalPages: 2, + pages: assignedPages, page: 1, }); - expect(snapshot.snapshotId).toBe(1); - expect(snapshot.page).toBe(1); - expect(snapshot.elements.map((element) => element.selector)).toEqual([ + expect(storedPage.documentId).toBe('doc-1'); + expect(storedPage.page).toBe(1); + expect(storedPage.elements.map((element) => element.selector)).toEqual([ '#page-1', ]); + expect(storedPage.elements[0]?.id).toBe(assignedPages[0]?.[0]?.id); + const storedElementId = storedPage.elements[0]?.id; + expect(storedElementId).toBeDefined(); const lookup = elementCache.getElementById( 'conv-1', 101, - snapshot.snapshotId, - '1', + storedElementId!, ); expect(lookup?.element.selector).toBe('#page-1'); expect(lookup?.documentId).toBe('doc-1'); }); + + test('resolves visually ambiguous requested IDs to the cached visual-safe ID', () => { + elementCache.clearAll(); + const page = [createElement('DO2', '#page-corrected')]; + + elementCache.storeHighlightResult({ + conversationId: 'conv-visual-safe', + tabId: 202, + documentId: 'doc-visual-safe', + elementType: 'any', + totalElements: 1, + totalPages: 1, + pages: [page], + page: 1, + }); + + const lookup = elementCache.getElementById( + 'conv-visual-safe', + 202, + 'D02', + ); + + expect(lookup?.requestedElementId).toBe('D02'); + expect(lookup?.resolvedElementId).toBe('DO2'); + expect(lookup?.elementIdCorrected).toBe(true); + expect(lookup?.element.selector).toBe('#page-corrected'); + }); }); diff --git a/extension/src/commands/__tests__/single-highlight.test.ts b/extension/src/commands/__tests__/single-highlight.test.ts index 5e9aa11..2439aef 100644 --- a/extension/src/commands/__tests__/single-highlight.test.ts +++ b/extension/src/commands/__tests__/single-highlight.test.ts @@ -1,7 +1,11 @@ import { describe, expect, test } from 'bun:test'; import type { InteractiveElement } from '../../types'; -import { calculateConfirmationPreviewLayout } from '../single-highlight'; +import { + calculateConfirmationBannerLayout, + calculateConfirmationPreviewLayout, + getConfirmationPromptText, +} from '../single-highlight'; function createElement(bbox: InteractiveElement['bbox']): InteractiveElement { return { @@ -16,6 +20,40 @@ function createElement(bbox: InteractiveElement['bbox']): InteractiveElement { } describe('single-highlight confirmation preview', () => { + test('formats confirmation reminder text for click and keyboard input', () => { + expect(getConfirmationPromptText('click')).toBe( + 'Is this the element you wanted to click?', + ); + expect(getConfirmationPromptText('keyboard_input')).toBe( + 'Is this the element you wanted to type into?', + ); + }); + + test('places the confirmation reminder above the highlight when space is available', () => { + const banner = calculateConfirmationBannerLayout({ + canvasWidth: 720, + canvasHeight: 420, + elementRect: { x: 220, y: 180, width: 120, height: 40 }, + message: getConfirmationPromptText('click'), + scale: 1, + }); + + expect(banner.y + banner.height).toBeLessThanOrEqual(180 - 8); + expect(banner.x).toBeGreaterThanOrEqual(10); + }); + + test('falls back below the highlight when there is no room above', () => { + const banner = calculateConfirmationBannerLayout({ + canvasWidth: 720, + canvasHeight: 420, + elementRect: { x: 220, y: 18, width: 120, height: 40 }, + message: getConfirmationPromptText('click'), + scale: 1, + }); + + expect(banner.y).toBeGreaterThanOrEqual(18 + 40 + 8); + }); + test('uses a bounded close-up crop around the selected element', () => { const layout = calculateConfirmationPreviewLayout( 1280, diff --git a/extension/src/commands/element-actions.ts b/extension/src/commands/element-actions.ts index fae90a0..e9b5b28 100644 --- a/extension/src/commands/element-actions.ts +++ b/extension/src/commands/element-actions.ts @@ -14,6 +14,7 @@ import type { ElementActionResult } from '../types'; import { ELEMENT_CACHE_TTL_DESCRIPTION, elementCache } from './element-cache'; import { executeJavaScript, type JavaScriptResult } from './javascript'; import { buildHitTestVisibilityHelpersScript } from '../utils/hit-test-visibility'; +import { normalizeVisualElementIdInput } from './element-id'; function escapeForDoubleQuotedJavaScriptString(value: string): string { return value.replace(/\\/g, '\\\\').replace(/"/g, '\\"'); @@ -21,12 +22,31 @@ function escapeForDoubleQuotedJavaScriptString(value: string): string { function buildElementCacheMissMessage( elementId: string, - refreshHint: string = 'Call highlight_elements() again to get a fresh highlight_snapshot_id.', + refreshHint: string = 'Call highlight_elements() again to refresh the element cache.', ): string { - return `Element '${elementId}' not found in cache for the referenced highlight snapshot. Highlight snapshots expire after ${ELEMENT_CACHE_TTL_DESCRIPTION}. ${refreshHint}`; + const normalizedElementId = normalizeVisualElementIdInput(elementId); + if (normalizedElementId && normalizedElementId !== elementId) { + return `Element '${elementId}' was interpreted as '${normalizedElementId}' for visual-safe ID matching, but no cached element matched. Highlight caches expire after ${ELEMENT_CACHE_TTL_DESCRIPTION}. ${refreshHint}`; + } + return `Element '${elementId}' not found in cache. Highlight caches expire after ${ELEMENT_CACHE_TTL_DESCRIPTION}. ${refreshHint}`; } -function buildSnapshotIdentityHelpersScript(): string { +function buildResolvedElementResultFields( + requestedElementId: string, + resolvedElementId: string, +): Pick< + ElementActionResult, + 'elementId' | 'requestedElementId' | 'resolvedElementId' | 'elementIdCorrected' +> { + return { + elementId: resolvedElementId, + requestedElementId, + resolvedElementId, + elementIdCorrected: requestedElementId !== resolvedElementId, + }; +} + +function buildCachedElementIdentityHelpersScript(): string { return ` function normalizeIdentityWhitespace(value, maxLength = 240) { const normalized = String(value ?? '') @@ -145,14 +165,14 @@ function buildSnapshotIdentityHelpersScript(): string { return overlap >= Math.max(2, Math.min(4, Math.ceil(expectedTokens.length * 0.5))); } - function validateSnapshotElement(expectedHighlightSnapshotId, expectedDocumentId, expectedFingerprint, el) { + function validateCachedElement(expectedDocumentId, expectedFingerprint, el) { const currentDocumentId = getCurrentDocumentId(); if (expectedDocumentId && currentDocumentId !== expectedDocumentId) { return { ok: false, stale: true, error: - \`Highlight snapshot \${expectedHighlightSnapshotId} is stale because the document changed. Call highlight_elements() again.\`, + 'The cached element is stale because the document changed. Call highlight_elements() again.', }; } @@ -162,7 +182,7 @@ function buildSnapshotIdentityHelpersScript(): string { ok: false, stale: true, error: - 'Highlight snapshot is stale because the target element no longer matches the cached identity. Call highlight_elements() again.', + 'The cached element is stale because the target no longer matches the cached identity. Call highlight_elements() again.', }; } @@ -177,7 +197,7 @@ function buildSnapshotIdentityHelpersScript(): string { function buildEditableActivationHelpersScript(): string { return ` - ${buildSnapshotIdentityHelpersScript()} + ${buildCachedElementIdentityHelpersScript()} ${buildHitTestVisibilityHelpersScript()} function getInteractiveActivationTarget(target) { @@ -386,21 +406,19 @@ export interface SelectResult extends ElementActionResult { * 4. Return result with dialog info if applicable * * @param conversationId Session ID for element cache lookup - * @param highlightSnapshotId Highlight snapshot ID returned by highlight_elements - * @param elementId Cached element ID from the referenced highlight snapshot (for example, "1") + * @param elementId Cached element ID from the latest highlight cache (for example, "A1H") * @param tabId Target tab ID * @param timeout Maximum execution time in milliseconds (default: 30000) * @returns Click result with success status and dialog info */ export async function performElementClick( conversationId: string, - highlightSnapshotId: number, elementId: string, tabId: number, timeout: number = 30000, ): Promise { console.log( - `👆 [ElementClick] Clicking element ${elementId} from snapshot ${highlightSnapshotId} in conversation ${conversationId} on tab ${tabId}`, + `👆 [ElementClick] Clicking element ${elementId} in conversation ${conversationId} on tab ${tabId}`, ); // ============================================================ @@ -409,23 +427,23 @@ export async function performElementClick( const cachedElement = elementCache.getElementById( conversationId, tabId, - highlightSnapshotId, elementId, ); if (!cachedElement) { console.log(`❌ [ElementClick] Element ${elementId} not found in cache`); return { success: false, - elementId, + ...buildResolvedElementResultFields(elementId, elementId), clicked: false, staleElement: false, - error: buildElementCacheMissMessage( - elementId, - 'Call highlight_elements() again to get a fresh highlight_snapshot_id and element IDs.', - ), + error: buildElementCacheMissMessage(elementId), }; } const element = cachedElement.element; + const resolvedElementFields = buildResolvedElementResultFields( + cachedElement.requestedElementId, + cachedElement.resolvedElementId, + ); console.log( `✅ [ElementClick] Found element: selector="${element.selector}"`, @@ -448,7 +466,6 @@ export async function performElementClick( const script = ` (async function() { const selector = "${escapedSelector}"; - const expectedHighlightSnapshotId = ${highlightSnapshotId}; const expectedDocumentId = "${escapedDocumentId}"; const expectedFingerprint = "${escapedFingerprint}"; ${buildEditableActivationHelpersScript()} @@ -458,8 +475,7 @@ export async function performElementClick( return { clicked: false, error: "Element not found in DOM", stale: true }; } - const snapshotValidation = validateSnapshotElement( - expectedHighlightSnapshotId, + const snapshotValidation = validateCachedElement( expectedDocumentId, expectedFingerprint, el, @@ -548,7 +564,7 @@ export async function performElementClick( console.error(`❌ [ElementClick] JavaScript execution error:`, error); return { success: false, - elementId, + ...resolvedElementFields, clicked: false, staleElement: false, error: error instanceof Error ? error.message : String(error), @@ -567,7 +583,7 @@ export async function performElementClick( console.log(`❌ [ElementClick] Click execution failed: ${jsResult.error}`); return { success: false, - elementId, + ...resolvedElementFields, clicked: false, staleElement: false, error: jsResult.error || 'Click JavaScript execution failed', @@ -588,7 +604,7 @@ export async function performElementClick( ); const result: ClickResult = { success: true, - elementId, + ...resolvedElementFields, clicked: true, new_tabs_created: jsResult.new_tabs_created, }; @@ -623,7 +639,7 @@ export async function performElementClick( ); return { success: false, - elementId, + ...resolvedElementFields, clicked: false, staleElement: false, error: invalidResultError, @@ -638,7 +654,7 @@ export async function performElementClick( return { success: false, - elementId, + ...resolvedElementFields, clicked: false, staleElement: isStale, error: clickResult?.error, @@ -650,7 +666,7 @@ export async function performElementClick( // If dialog opened during click, propagate dialog info const result: ClickResult = { success: true, - elementId, + ...resolvedElementFields, clicked: true, new_tabs_created: jsResult.new_tabs_created, }; @@ -683,21 +699,19 @@ export async function performElementClick( * 4. Return result * * @param conversationId Session ID for element cache lookup - * @param highlightSnapshotId Highlight snapshot ID returned by highlight_elements - * @param elementId Cached element ID from the referenced highlight snapshot (for example, "1") + * @param elementId Cached element ID from the latest highlight cache (for example, "A1H") * @param tabId Target tab ID * @param timeout Maximum execution time in milliseconds (default: 30000) * @returns Hover result with success status */ export async function performElementHover( conversationId: string, - highlightSnapshotId: number, elementId: string, tabId: number, timeout: number = 30000, ): Promise { console.log( - `🖱️ [ElementHover] Hovering element ${elementId} from snapshot ${highlightSnapshotId} in conversation ${conversationId} on tab ${tabId}`, + `🖱️ [ElementHover] Hovering element ${elementId} in conversation ${conversationId} on tab ${tabId}`, ); // ============================================================ @@ -706,20 +720,23 @@ export async function performElementHover( const cachedElement = elementCache.getElementById( conversationId, tabId, - highlightSnapshotId, elementId, ); if (!cachedElement) { console.log(`❌ [ElementHover] Element ${elementId} not found in cache`); return { success: false, - elementId, + ...buildResolvedElementResultFields(elementId, elementId), hovered: false, staleElement: false, error: buildElementCacheMissMessage(elementId), }; } const element = cachedElement.element; + const resolvedElementFields = buildResolvedElementResultFields( + cachedElement.requestedElementId, + cachedElement.resolvedElementId, + ); console.log( `✅ [ElementHover] Found element: selector="${element.selector}"`, @@ -741,18 +758,16 @@ export async function performElementHover( const script = ` (function() { const selector = "${escapedSelector}"; - const expectedHighlightSnapshotId = ${highlightSnapshotId}; const expectedDocumentId = "${escapedDocumentId}"; const expectedFingerprint = "${escapedFingerprint}"; - ${buildSnapshotIdentityHelpersScript()} + ${buildCachedElementIdentityHelpersScript()} const el = document.querySelector(selector); if (!el) { return { hovered: false, error: "Element not found in DOM", stale: true }; } - const snapshotValidation = validateSnapshotElement( - expectedHighlightSnapshotId, + const snapshotValidation = validateCachedElement( expectedDocumentId, expectedFingerprint, el, @@ -823,7 +838,7 @@ export async function performElementHover( console.error(`❌ [ElementHover] JavaScript execution error:`, error); return { success: false, - elementId, + ...resolvedElementFields, hovered: false, staleElement: false, }; @@ -836,7 +851,7 @@ export async function performElementHover( console.log(`❌ [ElementHover] Hover execution failed: ${jsResult.error}`); return { success: false, - elementId, + ...resolvedElementFields, hovered: false, staleElement: false, }; @@ -856,7 +871,7 @@ export async function performElementHover( ); const result: HoverResult = { success: true, - elementId, + ...resolvedElementFields, hovered: true, new_tabs_created: jsResult.new_tabs_created, }; @@ -887,7 +902,7 @@ export async function performElementHover( return { success: false, - elementId, + ...resolvedElementFields, hovered: false, staleElement: isStale, }; @@ -898,7 +913,7 @@ export async function performElementHover( // If dialog opened during hover, propagate dialog info const result: HoverResult = { success: true, - elementId, + ...resolvedElementFields, hovered: true, }; @@ -960,8 +975,7 @@ export interface SwipeResult extends ElementActionResult { * 3. Execute and return result * * @param conversationId Session ID for element cache lookup - * @param highlightSnapshotId Highlight snapshot ID returned by highlight_elements. Required when elementId is provided - * @param elementId Cached element ID from the referenced highlight snapshot. Optional - if not provided, scrolls the entire page + * @param elementId Cached element ID from the latest highlight cache. Optional - if not provided, scrolls the entire page * @param direction Swipe direction ('next' or 'prev') * @param tabId Target tab ID * @param timeout Maximum execution time in milliseconds (default: 30000) @@ -969,7 +983,6 @@ export interface SwipeResult extends ElementActionResult { */ export async function performElementScroll( conversationId: string, - highlightSnapshotId: number | undefined, elementId: string | undefined, direction: ScrollDirection, tabId: number, @@ -997,35 +1010,31 @@ export async function performElementScroll( const { x: xMultiplier, y: yMultiplier } = scrollMultipliers[direction]; let script: string; + let resolvedElementFields: + | ReturnType + | undefined; if (elementId) { // Scroll a specific element - if (highlightSnapshotId === undefined || highlightSnapshotId === null) { - return { - success: false, - elementId, - scrolled: false, - error: - 'highlight_snapshot_id is required when scrolling a highlighted element.', - }; - } - const cachedElement = elementCache.getElementById( conversationId, tabId, - highlightSnapshotId, elementId, ); if (!cachedElement) { console.log(`❌ [ElementScroll] Element ${elementId} not found in cache`); return { success: false, - elementId, + ...buildResolvedElementResultFields(elementId, elementId), scrolled: false, error: buildElementCacheMissMessage(elementId), }; } const element = cachedElement.element; + resolvedElementFields = buildResolvedElementResultFields( + cachedElement.requestedElementId, + cachedElement.resolvedElementId, + ); console.log( `✅ [ElementScroll] Found element: selector="${element.selector}"`, @@ -1043,20 +1052,18 @@ export async function performElementScroll( script = ` (function() { const selector = "${escapedSelector}"; - const expectedHighlightSnapshotId = ${highlightSnapshotId}; const expectedDocumentId = "${escapedDocumentId}"; const expectedFingerprint = "${escapedFingerprint}"; const el = document.querySelector(selector); const xMultiplier = ${xMultiplier}; const yMultiplier = ${yMultiplier}; - ${buildSnapshotIdentityHelpersScript()} + ${buildCachedElementIdentityHelpersScript()} if (!el) { return { scrolled: false, error: "Element not found in DOM", stale: true }; } - const snapshotValidation = validateSnapshotElement( - expectedHighlightSnapshotId, + const snapshotValidation = validateCachedElement( expectedDocumentId, expectedFingerprint, el, @@ -1212,7 +1219,10 @@ export async function performElementScroll( console.error(`❌ [ElementScroll] JavaScript execution error:`, error); return { success: false, - elementId, + ...(resolvedElementFields ?? + (elementId + ? buildResolvedElementResultFields(elementId, elementId) + : {})), scrolled: false, }; } @@ -1224,7 +1234,10 @@ export async function performElementScroll( ); return { success: false, - elementId, + ...(resolvedElementFields ?? + (elementId + ? buildResolvedElementResultFields(elementId, elementId) + : {})), scrolled: false, }; } @@ -1243,7 +1256,10 @@ export async function performElementScroll( ); const result: ScrollResult = { success: true, - elementId, + ...(resolvedElementFields ?? + (elementId + ? buildResolvedElementResultFields(elementId, elementId) + : {})), scrolled: true, new_tabs_created: jsResult.new_tabs_created, }; @@ -1282,7 +1298,10 @@ export async function performElementScroll( return { success: false, - elementId, + ...(resolvedElementFields ?? + (elementId + ? buildResolvedElementResultFields(elementId, elementId) + : {})), scrolled: false, staleElement: isStale, }; @@ -1301,7 +1320,8 @@ export async function performElementScroll( const result: ScrollResult = { success: true, - elementId, + ...(resolvedElementFields ?? + (elementId ? buildResolvedElementResultFields(elementId, elementId) : {})), scrolled: true, scrollEffective, ...(warning ? { warning } : {}), @@ -1336,7 +1356,6 @@ export async function performElementScroll( */ export async function performElementSwipe( conversationId: string, - highlightSnapshotId: number, elementId: string, direction: SwipeDirection, tabId: number, @@ -1344,25 +1363,28 @@ export async function performElementSwipe( timeout: number = 30000, ): Promise { console.log( - `🫳 [ElementSwipe] Swiping element ${elementId} from snapshot ${highlightSnapshotId} ${direction} (count: ${swipeCount}) in conversation ${conversationId} on tab ${tabId}`, + `🫳 [ElementSwipe] Swiping element ${elementId} ${direction} (count: ${swipeCount}) in conversation ${conversationId} on tab ${tabId}`, ); const cachedElement = elementCache.getElementById( conversationId, tabId, - highlightSnapshotId, elementId, ); if (!cachedElement) { console.log(`❌ [ElementSwipe] Element ${elementId} not found in cache`); return { success: false, - elementId, + ...buildResolvedElementResultFields(elementId, elementId), swiped: false, error: buildElementCacheMissMessage(elementId), }; } const element = cachedElement.element; + const resolvedElementFields = buildResolvedElementResultFields( + cachedElement.requestedElementId, + cachedElement.resolvedElementId, + ); console.log( `✅ [ElementSwipe] Found element: selector="${element.selector}"`, @@ -1381,20 +1403,18 @@ export async function performElementSwipe( const script = ` (async function() { const selector = "${escapedSelector}"; - const expectedHighlightSnapshotId = ${highlightSnapshotId}; const expectedDocumentId = "${escapedDocumentId}"; const expectedFingerprint = "${escapedFingerprint}"; const direction = "${direction}"; const swipeCount = ${swipeCount}; - ${buildSnapshotIdentityHelpersScript()} + ${buildCachedElementIdentityHelpersScript()} const el = document.querySelector(selector); if (!el) { return { swiped: false, error: "Element not found in DOM", stale: true }; } - const snapshotValidation = validateSnapshotElement( - expectedHighlightSnapshotId, + const snapshotValidation = validateCachedElement( expectedDocumentId, expectedFingerprint, el, @@ -2392,7 +2412,7 @@ export async function performElementSwipe( console.error(`❌ [ElementSwipe] JavaScript execution error:`, error); return { success: false, - elementId, + ...resolvedElementFields, swiped: false, error: error instanceof Error ? error.message : String(error), }; @@ -2402,7 +2422,7 @@ export async function performElementSwipe( console.log(`❌ [ElementSwipe] Swipe execution failed: ${jsResult.error}`); return { success: false, - elementId, + ...resolvedElementFields, swiped: false, error: jsResult.error || 'Swipe JavaScript execution failed', }; @@ -2414,7 +2434,7 @@ export async function performElementSwipe( ); const result: SwipeResult = { success: true, - elementId, + ...resolvedElementFields, swiped: true, new_tabs_created: jsResult.new_tabs_created, }; @@ -2449,7 +2469,7 @@ export async function performElementSwipe( console.log(`❌ [ElementSwipe] Swipe failed: ${error}, stale=${isStale}`); return { success: false, - elementId, + ...resolvedElementFields, swiped: false, staleElement: isStale, error, @@ -2471,7 +2491,7 @@ export async function performElementSwipe( return { success: true, - elementId, + ...resolvedElementFields, swiped: true, swipeEffective, ...(warning ? { warning } : {}), @@ -2499,8 +2519,7 @@ export interface InputResult extends ElementActionResult { * 4. Return result with input value * * @param conversationId Session ID for element cache lookup - * @param highlightSnapshotId Highlight snapshot ID returned by highlight_elements - * @param elementId Cached element ID from the referenced highlight snapshot + * @param elementId Cached element ID from the latest highlight cache * @param text Text to input into the element * @param tabId Target tab ID * @param timeout Maximum execution time in milliseconds (default: 30000) @@ -2508,14 +2527,13 @@ export interface InputResult extends ElementActionResult { */ export async function performKeyboardInput( conversationId: string, - highlightSnapshotId: number, elementId: string, text: string, tabId: number, timeout: number = 30000, ): Promise { console.log( - `⌨️ [KeyboardInput] Inputting text to element ${elementId} from snapshot ${highlightSnapshotId} in conversation ${conversationId} on tab ${tabId}`, + `⌨️ [KeyboardInput] Inputting text to element ${elementId} in conversation ${conversationId} on tab ${tabId}`, ); // ============================================================ @@ -2524,20 +2542,23 @@ export async function performKeyboardInput( const cachedElement = elementCache.getElementById( conversationId, tabId, - highlightSnapshotId, elementId, ); if (!cachedElement) { console.log(`❌ [KeyboardInput] Element ${elementId} not found in cache`); return { success: false, - elementId, + ...buildResolvedElementResultFields(elementId, elementId), input: false, staleElement: false, error: buildElementCacheMissMessage(elementId), }; } const element = cachedElement.element; + const resolvedElementFields = buildResolvedElementResultFields( + cachedElement.requestedElementId, + cachedElement.resolvedElementId, + ); console.log( `✅ [KeyboardInput] Found element: selector="${element.selector}"`, @@ -2561,7 +2582,6 @@ export async function performKeyboardInput( const script = ` (function() { const selector = "${escapedSelector}"; - const expectedHighlightSnapshotId = ${highlightSnapshotId}; const expectedDocumentId = "${escapedDocumentId}"; const expectedFingerprint = "${escapedFingerprint}"; const text = "${escapedText}"; @@ -2572,8 +2592,7 @@ export async function performKeyboardInput( return { input: false, error: "Element not found in DOM", stale: true }; } - const snapshotValidation = validateSnapshotElement( - expectedHighlightSnapshotId, + const snapshotValidation = validateCachedElement( expectedDocumentId, expectedFingerprint, el, @@ -2689,7 +2708,7 @@ export async function performKeyboardInput( console.error(`❌ [KeyboardInput] JavaScript execution error:`, error); return { success: false, - elementId, + ...resolvedElementFields, input: false, staleElement: false, }; @@ -2707,7 +2726,7 @@ export async function performKeyboardInput( console.log(`❌ [KeyboardInput] Input execution failed: ${jsResult.error}`); return { success: false, - elementId, + ...resolvedElementFields, input: false, staleElement: false, }; @@ -2727,7 +2746,7 @@ export async function performKeyboardInput( ); const result: InputResult = { success: true, - elementId, + ...resolvedElementFields, input: true, value: undefined, new_tabs_created: jsResult.new_tabs_created, @@ -2759,7 +2778,7 @@ export async function performKeyboardInput( return { success: false, - elementId, + ...resolvedElementFields, input: false, staleElement: isStale, }; @@ -2772,7 +2791,7 @@ export async function performKeyboardInput( // If dialog opened during input, propagate dialog info const result: InputResult = { success: true, - elementId, + ...resolvedElementFields, input: true, value: inputResult.value, }; @@ -2805,8 +2824,7 @@ export async function performKeyboardInput( * 4. Return result with selected values/labels/indices * * @param conversationId Session ID for element cache lookup - * @param highlightSnapshotId Highlight snapshot ID returned by highlight_elements - * @param elementId Cached element ID from the referenced highlight snapshot (for example, "1") + * @param elementId Cached element ID from the latest highlight cache (for example, "A1H") * @param tabId Target tab ID * @param value Option value(s) to select. Use string for single select, array for multi-select * @param timeout Maximum execution time in milliseconds (default: 30000) @@ -2814,14 +2832,13 @@ export async function performKeyboardInput( */ export async function performElementSelect( conversationId: string, - highlightSnapshotId: number, elementId: string, tabId: number, value: string | string[], timeout: number = 30000, ): Promise { console.log( - `📋 [ElementSelect] Selecting element ${elementId} from snapshot ${highlightSnapshotId} in conversation ${conversationId} on tab ${tabId}`, + `📋 [ElementSelect] Selecting element ${elementId} in conversation ${conversationId} on tab ${tabId}`, ); // ============================================================ @@ -2830,20 +2847,23 @@ export async function performElementSelect( const cachedElement = elementCache.getElementById( conversationId, tabId, - highlightSnapshotId, elementId, ); if (!cachedElement) { console.log(`❌ [ElementSelect] Element ${elementId} not found in cache`); return { success: false, - elementId, + ...buildResolvedElementResultFields(elementId, elementId), selected: false, staleElement: false, error: buildElementCacheMissMessage(elementId), }; } const element = cachedElement.element; + const resolvedElementFields = buildResolvedElementResultFields( + cachedElement.requestedElementId, + cachedElement.resolvedElementId, + ); console.log( `✅ [ElementSelect] Found element: selector="${element.selector}"`, @@ -2869,11 +2889,10 @@ export async function performElementSelect( const script = ` (function() { const selector = "${escapedSelector}"; - const expectedHighlightSnapshotId = ${highlightSnapshotId}; const expectedDocumentId = "${escapedDocumentId}"; const expectedFingerprint = "${escapedFingerprint}"; const value = ${valueJson}; - ${buildSnapshotIdentityHelpersScript()} + ${buildCachedElementIdentityHelpersScript()} const el = document.querySelector(selector); @@ -2881,8 +2900,7 @@ export async function performElementSelect( return { selected: false, error: "Element not found in DOM", stale: true }; } - const snapshotValidation = validateSnapshotElement( - expectedHighlightSnapshotId, + const snapshotValidation = validateCachedElement( expectedDocumentId, expectedFingerprint, el, @@ -2999,7 +3017,7 @@ export async function performElementSelect( console.error(`❌ [ElementSelect] JavaScript execution error:`, error); return { success: false, - elementId, + ...resolvedElementFields, selected: false, staleElement: false, }; @@ -3016,7 +3034,7 @@ export async function performElementSelect( ); return { success: false, - elementId, + ...resolvedElementFields, selected: false, staleElement: false, }; @@ -3036,7 +3054,7 @@ export async function performElementSelect( ); const result: SelectResult = { success: true, - elementId, + ...resolvedElementFields, selected: true, new_tabs_created: jsResult.new_tabs_created, }; @@ -3075,7 +3093,7 @@ export async function performElementSelect( return { success: false, - elementId, + ...resolvedElementFields, selected: false, staleElement: isStale, error: selectResult?.error, @@ -3089,7 +3107,7 @@ export async function performElementSelect( // Build result with selected values const result: SelectResult = { success: true, - elementId, + ...resolvedElementFields, selected: true, selectedValues: selectResult.selectedValues, selectedLabels: selectResult.selectedLabels, diff --git a/extension/src/commands/element-cache.ts b/extension/src/commands/element-cache.ts index 61babb5..32330b1 100644 --- a/extension/src/commands/element-cache.ts +++ b/extension/src/commands/element-cache.ts @@ -1,16 +1,21 @@ /** - * Highlight snapshot cache manager. + * Document-scoped element cache manager. * - * Two cache layers are maintained: - * 1. Per-call highlight inventories used to serve requested pages and keep - * page-local element IDs stable within one highlight response. - * 2. Page-scoped highlight snapshots returned to callers and used for - * element interactions together with page-local element IDs. + * Each conversation/tab keeps one active cache for the current highlighted + * document: + * 1. Persistent element-id assignments for the current document + * 2. A merged element lookup table keyed only by element_id + * 3. Latest highlight metadata for the current document */ import type { ElementType, InteractiveElement } from '../types'; +import { + buildElementIdentityKey, + generateUniqueHash, + normalizeVisualElementIdInput, +} from './element-id'; -interface HighlightInventoryEntry { +interface DocumentElementCacheEntry { tabId: number; createdAt: number; lastAccessedAt: number; @@ -18,19 +23,13 @@ interface HighlightInventoryEntry { elementType: ElementType; keywords: string[]; totalElements: number; - pages: InteractiveElement[][]; -} - -interface HighlightSnapshotViewEntry { - tabId: number; - inventoryId: number; - createdAt: number; - page: number; + totalPages: number; + idByIdentityKey: Map; + usedIds: Set; + elementsById: Map; } -export interface HighlightSnapshotPage { - snapshotId: number; - inventoryId: number; +export interface StoredHighlightPage { page: number; totalPages: number; totalElements: number; @@ -41,178 +40,88 @@ export interface HighlightSnapshotPage { } export interface CachedElementLookup { - snapshotId: number; - inventoryId: number; - page: number; - totalPages: number; - totalElements: number; documentId: string; elementType: ElementType; keywords: string[]; + totalElements: number; + totalPages: number; + requestedElementId: string; + resolvedElementId: string; + normalizedRequestedElementId: string; + elementIdCorrected: boolean; element: InteractiveElement; } export const ELEMENT_CACHE_TTL_MS = 1_200_000; // 20 minutes export const ELEMENT_CACHE_TTL_DESCRIPTION = `${ELEMENT_CACHE_TTL_MS / 60_000} minutes`; -const MAX_HIGHLIGHT_INVENTORIES_PER_TAB = 12; class ElementCacheImpl { - private inventories = new Map(); + private documents = new Map(); - private snapshotViews = new Map(); - - private nextInventoryId = 1; - - private nextSnapshotId = 1; - - private buildInventoryKey( - conversationId: string, - tabId: number, - inventoryId: number, - ): string { - return `${conversationId}:${tabId}:inventory:${inventoryId}`; - } - - private buildSnapshotKey( - conversationId: string, - tabId: number, - snapshotId: number, - ): string { - return `${conversationId}:${tabId}:snapshot:${snapshotId}`; - } - - private touchInventory(entry: HighlightInventoryEntry): void { - entry.lastAccessedAt = Date.now(); + private buildDocumentKey(conversationId: string, tabId: number): string { + return `${conversationId}:${tabId}`; } private isExpired(timestamp: number): boolean { return Date.now() - timestamp > ELEMENT_CACHE_TTL_MS; } - private removeInventoryByKey(key: string): void { - const inventory = this.inventories.get(key); - if (!inventory) { - return; - } - - this.inventories.delete(key); - - const snapshotKeysToDelete: string[] = []; - for (const [snapshotKey, snapshot] of this.snapshotViews.entries()) { - if (snapshot.inventoryId === this.parseInventoryIdFromKey(key)) { - snapshotKeysToDelete.push(snapshotKey); - } - } - for (const snapshotKey of snapshotKeysToDelete) { - this.snapshotViews.delete(snapshotKey); - } - - console.log( - `🗑️ [ElementCache] Removed highlight inventory ${key} (${inventory.pages.length} pages, ${snapshotKeysToDelete.length} snapshots)`, - ); + private cloneElement(element: InteractiveElement, id: string): InteractiveElement { + return { + ...element, + bbox: { ...element.bbox }, + id, + }; } - private parseInventoryIdFromKey(key: string): number { - const maybeId = Number.parseInt(key.split(':').at(-1) ?? '', 10); - return Number.isFinite(maybeId) ? maybeId : -1; + private touchEntry(entry: DocumentElementCacheEntry): void { + entry.lastAccessedAt = Date.now(); } private cleanupExpired(): void { - const activeInventoryKeys = new Set(); - - for (const [snapshotKey, snapshot] of this.snapshotViews.entries()) { - if (this.isExpired(snapshot.createdAt)) { - this.snapshotViews.delete(snapshotKey); - console.log( - `⏰ [ElementCache] Snapshot expired for key ${snapshotKey}`, - ); - continue; - } - - const inventoryKey = snapshotKey.replace( - /:snapshot:\d+$/, - `:inventory:${snapshot.inventoryId}`, - ); - activeInventoryKeys.add(inventoryKey); - } - - const inventoryKeysToDelete: string[] = []; - for (const [inventoryKey, inventory] of this.inventories.entries()) { - if (this.isExpired(inventory.lastAccessedAt)) { - inventoryKeysToDelete.push(inventoryKey); - continue; - } - - if ( - !activeInventoryKeys.has(inventoryKey) && - this.isExpired(inventory.createdAt) - ) { - inventoryKeysToDelete.push(inventoryKey); + for (const [key, entry] of this.documents.entries()) { + if (this.isExpired(entry.lastAccessedAt)) { + this.documents.delete(key); + console.log(`⏰ [ElementCache] Document cache expired for key ${key}`); } } - - for (const inventoryKey of inventoryKeysToDelete) { - this.removeInventoryByKey(inventoryKey); - } } - private pruneInventoriesForTab(conversationId: string, tabId: number): void { - const prefix = `${conversationId}:${tabId}:inventory:`; - const matchingInventories = Array.from(this.inventories.entries()) - .filter(([key]) => key.startsWith(prefix)) - .sort((a, b) => a[1].createdAt - b[1].createdAt); - - if (matchingInventories.length <= MAX_HIGHLIGHT_INVENTORIES_PER_TAB) { - return; - } - - const toDelete = matchingInventories.slice( - 0, - matchingInventories.length - MAX_HIGHLIGHT_INVENTORIES_PER_TAB, - ); - for (const [inventoryKey] of toDelete) { - this.removeInventoryByKey(inventoryKey); - } - } - - storeSnapshot(options: { + private getOrCreateEntry(options: { conversationId: string; tabId: number; documentId: string; elementType: ElementType; - keywords?: string[]; + keywords: string[]; totalElements: number; - pages: InteractiveElement[][]; - page: number; - }): HighlightSnapshotPage { + totalPages: number; + }): DocumentElementCacheEntry { const { conversationId, tabId, documentId, elementType, - keywords = [], + keywords, totalElements, - pages, - page, + totalPages, } = options; this.cleanupExpired(); - const inventoryId = this.nextInventoryId++; - const snapshotId = this.nextSnapshotId++; + const key = this.buildDocumentKey(conversationId, tabId); + const existing = this.documents.get(key); const now = Date.now(); - const inventoryKey = this.buildInventoryKey( - conversationId, - tabId, - inventoryId, - ); - const snapshotKey = this.buildSnapshotKey( - conversationId, - tabId, - snapshotId, - ); - this.inventories.set(inventoryKey, { + if (existing && existing.documentId === documentId) { + existing.lastAccessedAt = now; + existing.elementType = elementType; + existing.keywords = [...keywords]; + existing.totalElements = totalElements; + existing.totalPages = totalPages; + return existing; + } + + const created: DocumentElementCacheEntry = { tabId, createdAt: now, lastAccessedAt: now, @@ -220,214 +129,197 @@ class ElementCacheImpl { elementType, keywords: [...keywords], totalElements, - pages: pages.map((snapshotPage) => - snapshotPage.map((element) => ({ - ...element, - bbox: { ...element.bbox }, - })), - ), - }); - - this.snapshotViews.set(snapshotKey, { - tabId, - inventoryId, - createdAt: now, - page, - }); - - this.pruneInventoriesForTab(conversationId, tabId); + totalPages, + idByIdentityKey: new Map(), + usedIds: new Set(), + elementsById: new Map(), + }; - const snapshotPage = this.getSnapshotPage( - conversationId, - tabId, - snapshotId, + this.documents.set(key, created); + console.log( + `📁 [ElementCache] Started new document cache for conversation ${conversationId}, tab ${tabId}, document ${documentId}`, ); - if (!snapshotPage) { - throw new Error( - `Failed to retrieve newly stored highlight snapshot ${snapshotId}`, - ); + return created; + } + + private assignIdsForEntry( + entry: DocumentElementCacheEntry, + elements: InteractiveElement[], + ): InteractiveElement[] { + const assignedIds = new Array(elements.length); + + const elementsByStableKey = elements + .map((element, index) => ({ + element, + index, + identityKey: buildElementIdentityKey(element), + })) + .sort((left, right) => { + const keyOrder = left.identityKey.localeCompare(right.identityKey); + if (keyOrder !== 0) { + return keyOrder; + } + return left.index - right.index; + }); + + for (const { element, index, identityKey } of elementsByStableKey) { + let elementId = entry.idByIdentityKey.get(identityKey); + if (!elementId) { + if (element.id && !entry.usedIds.has(element.id)) { + elementId = element.id; + } else { + const { hash } = generateUniqueHash( + element.selector, + entry.usedIds, + element.html, + ); + elementId = hash; + } + entry.idByIdentityKey.set(identityKey, elementId); + entry.usedIds.add(elementId); + } + assignedIds[index] = elementId; } - console.log( - `📁 [ElementCache] Stored highlight inventory ${inventoryId} and snapshot ${snapshotId} for conversation ${conversationId}, tab ${tabId} (${pages.length} pages, ${totalElements} total elements)`, + return elements.map((element, index) => + this.cloneElement(element, assignedIds[index] || element.id), ); - return snapshotPage; } - getSnapshotPage( - conversationId: string, - tabId: number, - snapshotId: number, - ): HighlightSnapshotPage | undefined { - this.cleanupExpired(); - - const snapshot = this.getSnapshotView(conversationId, tabId, snapshotId); - if (!snapshot) { - return undefined; - } + storeHighlightResult(options: { + conversationId: string; + tabId: number; + documentId: string; + elementType: ElementType; + keywords?: string[]; + totalElements: number; + totalPages: number; + page: number; + pages: InteractiveElement[][]; + }): StoredHighlightPage { + const { + conversationId, + tabId, + documentId, + elementType, + keywords = [], + totalElements, + totalPages, + page, + pages, + } = options; - const inventory = this.getInventory( + const entry = this.getOrCreateEntry({ conversationId, tabId, - snapshot.inventoryId, + documentId, + elementType, + keywords, + totalElements, + totalPages, + }); + + const assignedPages = pages.map((pageElements) => + this.assignIdsForEntry(entry, pageElements), ); - if (!inventory) { - return undefined; + + for (const pageElements of assignedPages) { + for (const element of pageElements) { + entry.elementsById.set(element.id, this.cloneElement(element, element.id)); + } } - this.touchInventory(inventory); + this.touchEntry(entry); - const pageIndex = Math.max(0, snapshot.page - 1); - const elements = inventory.pages[pageIndex] ?? []; + console.log( + `📁 [ElementCache] Stored ${assignedPages.length} highlight pages for conversation ${conversationId}, tab ${tabId} (${totalElements} total elements on document ${documentId})`, + ); return { - snapshotId, - inventoryId: snapshot.inventoryId, - page: snapshot.page, - totalPages: inventory.pages.length, - totalElements: inventory.totalElements, - elementType: inventory.elementType, - keywords: [...inventory.keywords], - documentId: inventory.documentId, - elements: elements.map((element) => ({ - ...element, - bbox: { ...element.bbox }, - })), + page, + totalPages, + totalElements, + elementType, + keywords: [...keywords], + documentId, + elements: (assignedPages[Math.max(0, page - 1)] ?? []).map((element) => + this.cloneElement(element, element.id), + ), }; } getElementById( conversationId: string, tabId: number, - snapshotId: number, elementId: string, ): CachedElementLookup | undefined { - const snapshotPage = this.getSnapshotPage( - conversationId, - tabId, - snapshotId, - ); - if (!snapshotPage) { - return undefined; - } - - const element = snapshotPage.elements.find( - (candidate) => candidate.id === elementId, - ); - if (!element) { - return undefined; - } - - return { - snapshotId, - inventoryId: snapshotPage.inventoryId, - page: snapshotPage.page, - totalPages: snapshotPage.totalPages, - totalElements: snapshotPage.totalElements, - documentId: snapshotPage.documentId, - elementType: snapshotPage.elementType, - keywords: snapshotPage.keywords, - element, - }; - } - - getSnapshotView( - conversationId: string, - tabId: number, - snapshotId: number, - ): HighlightSnapshotViewEntry | undefined { - if (!conversationId) { - return undefined; - } - - const snapshotKey = this.buildSnapshotKey( - conversationId, - tabId, - snapshotId, - ); - const snapshot = this.snapshotViews.get(snapshotKey); - if (!snapshot) { - return undefined; - } - - if (snapshot.tabId !== tabId || this.isExpired(snapshot.createdAt)) { - this.snapshotViews.delete(snapshotKey); - console.log( - `⏰ [ElementCache] Snapshot expired or mismatched for key ${snapshotKey}`, - ); - return undefined; - } - - return snapshot; - } + this.cleanupExpired(); - getInventory( - conversationId: string, - tabId: number, - inventoryId: number, - ): HighlightInventoryEntry | undefined { - if (!conversationId) { + const key = this.buildDocumentKey(conversationId, tabId); + const entry = this.documents.get(key); + if (!entry || entry.tabId !== tabId) { return undefined; } - const inventoryKey = this.buildInventoryKey( - conversationId, - tabId, - inventoryId, - ); - const inventory = this.inventories.get(inventoryKey); - if (!inventory) { - return undefined; + this.touchEntry(entry); + const requestedElementId = elementId; + const normalizedRequestedElementId = + normalizeVisualElementIdInput(requestedElementId); + let resolvedElementId = requestedElementId; + let element = entry.elementsById.get(requestedElementId); + + if (!element && normalizedRequestedElementId !== requestedElementId) { + element = entry.elementsById.get(normalizedRequestedElementId); + if (element) { + resolvedElementId = normalizedRequestedElementId; + } } - if (inventory.tabId !== tabId || this.isExpired(inventory.lastAccessedAt)) { - this.removeInventoryByKey(inventoryKey); + if (!element) { return undefined; } - return inventory; + return { + documentId: entry.documentId, + elementType: entry.elementType, + keywords: [...entry.keywords], + totalElements: entry.totalElements, + totalPages: entry.totalPages, + requestedElementId, + resolvedElementId, + normalizedRequestedElementId, + elementIdCorrected: requestedElementId !== resolvedElementId, + element: this.cloneElement(element, element.id), + }; } invalidate(conversationId: string, tabId?: number): void { - const inventoryPrefix = - tabId !== undefined - ? `${conversationId}:${tabId}:inventory:` - : `${conversationId}:`; - const snapshotPrefix = - tabId !== undefined - ? `${conversationId}:${tabId}:snapshot:` - : `${conversationId}:`; - - const inventoryKeysToDelete = Array.from(this.inventories.keys()).filter( - (key) => key.startsWith(inventoryPrefix), - ); - const snapshotKeysToDelete = Array.from(this.snapshotViews.keys()).filter( - (key) => key.startsWith(snapshotPrefix), - ); + const keysToDelete = Array.from(this.documents.keys()).filter((key) => { + if (tabId === undefined) { + return key.startsWith(`${conversationId}:`); + } + return key === this.buildDocumentKey(conversationId, tabId); + }); - for (const key of inventoryKeysToDelete) { - this.inventories.delete(key); - } - for (const key of snapshotKeysToDelete) { - this.snapshotViews.delete(key); + for (const key of keysToDelete) { + this.documents.delete(key); } - if (inventoryKeysToDelete.length > 0 || snapshotKeysToDelete.length > 0) { + if (keysToDelete.length > 0) { const scope = tabId !== undefined ? `tab ${tabId}` : 'all tabs'; console.log( - `🗑️ [ElementCache] Invalidated ${inventoryKeysToDelete.length} inventories and ${snapshotKeysToDelete.length} snapshots for conversation ${conversationId} (${scope})`, + `🗑️ [ElementCache] Invalidated ${keysToDelete.length} document caches for conversation ${conversationId} (${scope})`, ); } } clearAll(): void { - this.inventories.clear(); - this.snapshotViews.clear(); + this.documents.clear(); console.log('🧹 [ElementCache] Cleared all caches'); } get size(): number { - return this.snapshotViews.size; + return this.documents.size; } } diff --git a/extension/src/commands/element-id.ts b/extension/src/commands/element-id.ts index 57513be..757c09b 100644 --- a/extension/src/commands/element-id.ts +++ b/extension/src/commands/element-id.ts @@ -1,17 +1,168 @@ import type { InteractiveElement } from '../types'; +export const ELEMENT_ID_CHARSET = + '123456789ACDEFHJKMNOPQRTUVWXY'; +export const ELEMENT_ID_LENGTH = 3; +const ELEMENT_ID_SPACE = ELEMENT_ID_CHARSET.length ** ELEMENT_ID_LENGTH; +const NORMALIZABLE_ELEMENT_ID_PATTERN = /^[0-9A-Za-z]{3}$/; +const AMBIGUOUS_ELEMENT_ID_CHAR_MAP: Record = { + '0': 'O', + o: 'O', + O: 'O', + i: '1', + I: '1', + l: '1', + L: '1', + z: '2', + Z: '2', + s: '5', + S: '5', + g: '6', + G: '6', + b: '8', + B: '8', +}; + +function encodeFixedVisualId(value: number): string { + let remaining = value; + const chars = Array.from( + { length: ELEMENT_ID_LENGTH }, + () => ELEMENT_ID_CHARSET[0], + ); + + for (let index = ELEMENT_ID_LENGTH - 1; index >= 0; index -= 1) { + chars[index] = ELEMENT_ID_CHARSET[remaining % ELEMENT_ID_CHARSET.length]; + remaining = Math.floor(remaining / ELEMENT_ID_CHARSET.length); + } + + return chars.join(''); +} + +/** + * Generate a short stable hash from a selector and optional HTML content. + * + * Uses FNV-1a for speed and reasonable distribution, then projects into the + * fixed 3-character visual-safe ID space used by highlight labels. + */ +export function generateShortHash( + cssPath: string, + html?: string, + salt: number = 0, +): string { + const FNV_PRIME = 0x01000193; + const FNV_OFFSET = 0x811c9dc5; + + let input = html ? `${cssPath}:${html}` : cssPath; + if (salt > 0) { + input = `${input}:${salt}`; + } + + let hash = FNV_OFFSET; + for (let index = 0; index < input.length; index += 1) { + hash ^= input.charCodeAt(index); + hash = Math.imul(hash, FNV_PRIME); + } + + return encodeFixedVisualId((hash >>> 0) % ELEMENT_ID_SPACE); +} + +export function generateUniqueHash( + cssPath: string, + existingHashes: Set, + html?: string, + maxAttempts: number = 512, +): { hash: string; salt: number } { + let salt = 0; + + while (salt < maxAttempts) { + const hash = generateShortHash(cssPath, html, salt); + if (!existingHashes.has(hash)) { + return { hash, salt }; + } + salt += 1; + } + + const fallbackSalt = Date.now(); + return { + hash: generateShortHash(cssPath, html, fallbackSalt), + salt: fallbackSalt, + }; +} + +export function normalizeVisualElementIdInput(value: string): string { + const compact = value.trim().replace(/\s+/g, ''); + if (!compact) { + return ''; + } + + if (!NORMALIZABLE_ELEMENT_ID_PATTERN.test(compact)) { + return compact; + } + + return compact + .split('') + .map((char) => { + const mapped = AMBIGUOUS_ELEMENT_ID_CHAR_MAP[char]; + if (mapped) { + return mapped; + } + + return char.toUpperCase(); + }) + .join(''); +} + +export function buildElementIdentityKey(element: InteractiveElement): string { + return `${element.selector}\u0000${element.html ?? ''}`; +} + /** - * Reassign element IDs to page-local sequential numbers. + * Assign short hash IDs that stay stable for the same selector/content. * - * IDs are intentionally page-local: each highlight snapshot page starts at 1 - * again, and the page-local IDs must be paired with highlight_snapshot_id. + * IDs are opaque references, not sequence numbers. */ -export function assignSequentialElementIds( +export function assignHashedElementIds( elements: InteractiveElement[], ): InteractiveElement[] { - return elements.map((element, index) => ({ - ...element, - bbox: { ...element.bbox }, - id: String(index + 1), - })); + const existingHashes = new Set(); + const assignedIds = new Array(elements.length); + + const elementsByStableKey = elements + .map((element, index) => ({ + element, + index, + identityKey: buildElementIdentityKey(element), + })) + .sort((left, right) => { + const keyOrder = left.identityKey.localeCompare(right.identityKey); + if (keyOrder !== 0) { + return keyOrder; + } + return left.index - right.index; + }); + + for (const { element, index } of elementsByStableKey) { + const { hash } = generateUniqueHash( + element.selector, + existingHashes, + element.html, + ); + existingHashes.add(hash); + assignedIds[index] = hash; + } + + return elements.map((element, index) => { + const assignedId = assignedIds[index]; + if (!assignedId) { + throw new Error( + `Failed to assign an element ID for selector "${element.selector}"`, + ); + } + + return { + ...element, + bbox: { ...element.bbox }, + id: assignedId, + }; + }); } diff --git a/extension/src/commands/single-highlight.ts b/extension/src/commands/single-highlight.ts index 98c3a54..e9517f7 100644 --- a/extension/src/commands/single-highlight.ts +++ b/extension/src/commands/single-highlight.ts @@ -6,15 +6,23 @@ import type { InteractiveElement } from '../types'; // Visual style for single-element confirmation -const CONFIRMATION_COLOR = '#FF6600'; // Orange border +const CONFIRMATION_COLOR = '#FFD400'; // Yellow border +const CONFIRMATION_TEXT_COLOR = '#111111'; +const CONFIRMATION_BANNER_COLOR = 'rgba(255, 212, 0, 0.5)'; +const CONFIRMATION_BANNER_BORDER_COLOR = 'rgba(17, 17, 17, 0.18)'; const BASE_BOX_PADDING = 2; -const BASE_LINE_WIDTH = 3; +const BASE_LINE_WIDTH = 4; const BASE_CONTEXT_PADDING_X = 96; const BASE_CONTEXT_PADDING_Y = 112; const BASE_MIN_CROP_WIDTH = 520; const BASE_MIN_CROP_HEIGHT = 320; const MIN_CROP_WIDTH_RATIO = 0.58; const MIN_CROP_HEIGHT_RATIO = 0.58; +const BASE_BANNER_FONT_SIZE = 22; +const BASE_BANNER_PADDING_X = 12; +const BASE_BANNER_PADDING_Y = 12; +const BASE_BANNER_MARGIN = 14; +const BASE_BANNER_GAP = 12; interface DeviceRect { x: number; @@ -39,7 +47,12 @@ interface ConfirmationPreviewLayout { export async function highlightSingleElement( screenshotDataUrl: string, element: InteractiveElement, - options?: { scale?: number; viewportWidth?: number; viewportHeight?: number }, + options?: { + intendedAction?: 'click' | 'keyboard_input'; + scale?: number; + viewportWidth?: number; + viewportHeight?: number; + }, ): Promise { console.log( `🎨 [SingleHighlight] Drawing highlight for element ${element.id}...`, @@ -199,6 +212,12 @@ export async function highlightSingleElement( // Draw the single element bounding box drawSingleBoundingBox(ctx, previewLayout.element, scale); + drawConfirmationBanner( + ctx, + previewLayout.element, + options?.intendedAction, + scale, + ); const resultBlob = await canvas.convertToBlob({ type: 'image/png' }); @@ -224,6 +243,105 @@ export async function highlightSingleElement( } } +export function formatConfirmationOperationLabel( + intendedAction?: 'click' | 'keyboard_input', +): string { + switch (intendedAction) { + case 'click': + return 'click'; + case 'keyboard_input': + return 'type into'; + default: + return 'interact with'; + } +} + +export function getConfirmationPromptText( + intendedAction?: 'click' | 'keyboard_input', +): string { + return `Is this the element you wanted to ${formatConfirmationOperationLabel(intendedAction)}?`; +} + +export function calculateConfirmationBannerLayout(options: { + canvasWidth: number; + canvasHeight: number; + elementRect: DeviceRect; + message: string; + scale: number; + textWidth?: number; +}): DeviceRect { + const { + canvasWidth, + canvasHeight, + elementRect, + message, + scale, + textWidth, + } = options; + const fontSize = Math.max(16, Math.round(BASE_BANNER_FONT_SIZE * scale)); + const paddingX = Math.max(12, Math.round(BASE_BANNER_PADDING_X * scale)); + const paddingY = Math.max(8, Math.round(BASE_BANNER_PADDING_Y * scale)); + const margin = Math.max(10, Math.round(BASE_BANNER_MARGIN * scale)); + const gap = Math.max(8, Math.round(BASE_BANNER_GAP * scale)); + const estimatedTextWidth = Math.ceil(message.length * fontSize * 0.6); + const resolvedTextWidth = Math.ceil(textWidth ?? estimatedTextWidth); + const width = Math.min( + canvasWidth - margin * 2, + resolvedTextWidth + paddingX * 2, + ); + const height = fontSize + paddingY * 2; + + const clampX = (value: number): number => + clamp(value, margin, Math.max(margin, canvasWidth - width - margin)); + const clampY = (value: number): number => + clamp(value, margin, Math.max(margin, canvasHeight - height - margin)); + const centeredX = clampX(elementRect.x + elementRect.width / 2 - width / 2); + const centeredY = clampY(elementRect.y + elementRect.height / 2 - height / 2); + + if (elementRect.y - gap - height >= margin) { + return { + x: centeredX, + y: elementRect.y - gap - height, + width, + height, + }; + } + + if (elementRect.y + elementRect.height + gap + height <= canvasHeight - margin) { + return { + x: centeredX, + y: elementRect.y + elementRect.height + gap, + width, + height, + }; + } + + if (elementRect.x + elementRect.width + gap + width <= canvasWidth - margin) { + return { + x: elementRect.x + elementRect.width + gap, + y: centeredY, + width, + height, + }; + } + + if (elementRect.x - gap - width >= margin) { + return { + x: elementRect.x - gap - width, + y: centeredY, + width, + height, + }; + } + + return { + x: centeredX, + y: clampY(elementRect.y + elementRect.height + gap), + width, + height, + }; +} + /** * Calculate a focused preview crop around the target element. */ @@ -311,10 +429,63 @@ function drawSingleBoundingBox( `[SingleHighlight] Drawing confirmation bbox at (${x}, ${y}, ${width}, ${height}) scale=${scale}`, ); - // Draw bounding box with orange color + // Draw bounding box with a bright yellow confirmation color. + ctx.save(); ctx.strokeStyle = CONFIRMATION_COLOR; ctx.lineWidth = lineWidth; + ctx.shadowColor = 'rgba(255, 212, 0, 0.7)'; + ctx.shadowBlur = 12 * scale; ctx.strokeRect(x, y, width, height); + ctx.restore(); +} + +function drawConfirmationBanner( + ctx: OffscreenCanvasRenderingContext2D, + elementRect: DeviceRect, + intendedAction: 'click' | 'keyboard_input' | undefined, + scale: number, +): void { + const message = getConfirmationPromptText(intendedAction); + const fontSize = Math.max(16, Math.round(BASE_BANNER_FONT_SIZE * scale)); + const paddingX = Math.max(12, Math.round(BASE_BANNER_PADDING_X * scale)); + + ctx.save(); + ctx.font = `700 ${fontSize}px sans-serif`; + ctx.textBaseline = 'middle'; + const measuredTextWidth = ctx.measureText(message).width; + const bannerRect = calculateConfirmationBannerLayout({ + canvasWidth: ctx.canvas.width, + canvasHeight: ctx.canvas.height, + elementRect, + message, + scale, + textWidth: measuredTextWidth, + }); + + ctx.fillStyle = CONFIRMATION_BANNER_COLOR; + ctx.fillRect( + bannerRect.x, + bannerRect.y, + bannerRect.width, + bannerRect.height, + ); + ctx.strokeStyle = CONFIRMATION_BANNER_BORDER_COLOR; + ctx.lineWidth = Math.max(1, scale); + ctx.strokeRect( + bannerRect.x, + bannerRect.y, + bannerRect.width, + bannerRect.height, + ); + + ctx.fillStyle = CONFIRMATION_TEXT_COLOR; + ctx.fillText( + message, + bannerRect.x + paddingX, + bannerRect.y + bannerRect.height / 2, + bannerRect.width - paddingX * 2, + ); + ctx.restore(); } function clamp(value: number, min: number, max: number): number { diff --git a/extension/src/types.ts b/extension/src/types.ts index 0685fd4..10ed783 100644 --- a/extension/src/types.ts +++ b/extension/src/types.ts @@ -130,10 +130,8 @@ export interface HighlightElementsCommand extends BaseCommand { export interface ClickElementCommand extends BaseCommand { type: 'click_element'; - /** Element ID from the referenced highlight snapshot response (page-local numeric string) */ + /** Element ID from highlight response (short opaque string) */ element_id: string; - /** Highlight snapshot ID returned by highlight_elements */ - highlight_snapshot_id: number; /** * Target tab ID (optional - auto-resolved from conversation if not provided) * Note: Required in Python models, but optional here as extension auto-resolves it @@ -143,10 +141,8 @@ export interface ClickElementCommand extends BaseCommand { export interface HoverElementCommand extends BaseCommand { type: 'hover_element'; - /** Element ID from the referenced highlight snapshot response (page-local numeric string) */ + /** Element ID from highlight response (short opaque string) */ element_id: string; - /** Highlight snapshot ID returned by highlight_elements */ - highlight_snapshot_id: number; /** * Target tab ID (optional - auto-resolved from conversation if not provided) * Note: Required in Python models, but optional here as extension auto-resolves it @@ -156,10 +152,8 @@ export interface HoverElementCommand extends BaseCommand { export interface ScrollElementCommand extends BaseCommand { type: 'scroll_element'; - /** Element ID from the referenced highlight snapshot response (page-local numeric string). If not provided, scrolls the entire page */ + /** Element ID from highlight response (short opaque string). If not provided, scrolls the entire page */ element_id?: string; - /** Highlight snapshot ID returned by highlight_elements. Required when element_id is provided */ - highlight_snapshot_id?: number; direction?: ScrollDirection; /** Scroll amount relative to page/element height (0.5 = half page, 1.0 = full page) */ scroll_amount?: number; @@ -174,10 +168,8 @@ export type SwipeDirection = 'next' | 'prev'; export interface SwipeElementCommand extends BaseCommand { type: 'swipe_element'; - /** Element ID from the referenced highlight snapshot response (page-local numeric string) */ + /** Element ID from highlight response (short opaque string) */ element_id: string; - /** Highlight snapshot ID returned by highlight_elements */ - highlight_snapshot_id: number; direction?: SwipeDirection; /** Number of swipe steps for carousel/swiper interactions */ swipe_count?: number; @@ -190,10 +182,8 @@ export interface SwipeElementCommand extends BaseCommand { export interface KeyboardInputCommand extends BaseCommand { type: 'keyboard_input'; - /** Element ID from the referenced highlight snapshot response (page-local numeric string) */ + /** Element ID from highlight response (short opaque string) */ element_id: string; - /** Highlight snapshot ID returned by highlight_elements */ - highlight_snapshot_id: number; text: string; /** * Target tab ID (optional - auto-resolved from conversation if not provided) @@ -204,10 +194,8 @@ export interface KeyboardInputCommand extends BaseCommand { export interface SelectElementCommand extends BaseCommand { type: 'select_element'; - /** Element ID from the referenced highlight snapshot response (page-local numeric string) */ + /** Element ID from highlight response (short opaque string) */ element_id: string; - /** Highlight snapshot ID returned by highlight_elements */ - highlight_snapshot_id: number; /** Option value(s) to select. Use string for single select, array for multi-select (` dropdown element by its visual ID. - If your goal is to **pick a value** from a list of options → use `select` ```json -{ "action": "select", "highlight_snapshot_id": 17, "element_id": "5", "value": "option1", "tab_id": 123 } +{ "action": "select", "element_id": "M8P", "value": "option1", "tab_id": 123 } // → Executes immediately and returns the resulting screenshot ``` **Parameters**: -- `highlight_snapshot_id`: (required) Highlight snapshot ID from highlight tool response - `element_id`: (required) Element ID from highlight tool response - `value`: (required) Option value to select (matches `value` attribute of `