Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ repos:
hooks:
- id: black
name: black
entry: uv run black
entry: uv run --extra dev black
language: system
types_or: [python, pyi]
require_serial: true
Expand Down
7 changes: 6 additions & 1 deletion AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,9 @@ Elements are paginated to ensure **no visual overlap** in each screenshot:

- `highlight_elements` now uses a **snapshot-first** readiness check instead of page-side polling loops.
- Reason: OpenBrowser intentionally keeps automated tabs in the browser background, and Chrome may heavily throttle hidden-tab timers. A page-side `setTimeout` stability loop can therefore take far longer than its nominal budget and become the main cause of highlight timeouts.
- In practice, the main cause of unstable first-highlight screenshots is often **missing warmup**, not a bad readiness classifier. A background tab may answer lightweight `Runtime.evaluate` probes while still sitting in a partially painted / partially decoded state.
- A screenshot-style warmup is therefore the default precondition for `highlight_elements`. It helps force hidden-tab paint/compositor/image-decode work before interactive-element detection runs.
- If `highlight_elements` keeps returning `not_ready` but `tab view` immediately makes the next highlight succeed, treat it as a warmup issue first, before suspecting the readiness classifier.
- The extension samples viewport readiness signals once per attempt: document readiness, viewport text/media density, pending images, and loading placeholders such as skeleton/shimmer/spinner indicators.
- Readiness is graded as `ready`, `provisionally_ready`, or `not_ready`.
- If readiness is `not_ready`, the extension performs only a couple of short **background-side** retries before proceeding or returning the latest result.
Expand All @@ -213,7 +216,7 @@ Elements are paginated to ensure **no visual overlap** in each screenshot:
```
# Highlight mixed elements first (default)
highlight_elements() → Page 1 of any interactive elements
highlight_elements(page=2) → Page 2 of the same any inventory
highlight_elements(page=2) → Page 2 of the "any" results for the current page state
highlight_elements(element_type="any", page=1) → Explicit any-first discovery

# Highlight other types (one at a time)
Expand Down Expand Up @@ -312,6 +315,8 @@ cd extension && npm run build

OpenBrowser has explicit screenshot control for maximum flexibility:

- Screenshots also serve as a practical page warmup mechanism for background tabs. They can unblock page paint and media decode work that passive DOM/readiness inspection does not reliably trigger on its own.

### Commands That Return Screenshots

| Command | Auto-Screenshot | Notes |
Expand Down
10 changes: 8 additions & 2 deletions eval/bluebook/js/bluebook.js
Original file line number Diff line number Diff line change
Expand Up @@ -612,6 +612,12 @@ window.tracker = new AgentTracker('bluebook.life', 'hard');
return notes;
}

/**
 * Applies the default-feed ordering rules to `notes` and returns the same
 * `notes` reference that was passed in.
 *
 * Each rule is forwarded to `keepNoteAwayFromTop` in the order listed below.
 * NOTE(review): the numeric value is presumably a position threshold for the
 * note — confirm against `keepNoteAwayFromTop`.
 */
function stabilizeDefaultFeedOrder(notes) {
    const orderingRules = [
        ['note-openclaw-config', 18],
        ['note-arigato-ai', 24],
    ];

    for (const [noteId, limit] of orderingRules) {
        keepNoteAwayFromTop(notes, noteId, limit);
    }

    return notes;
}

/**
 * Looks up the entry in `state.notes` whose `id` equals
 * `state.currentNoteId`.
 *
 * @returns the matching note, or `null` when the lookup yields nothing truthy.
 */
function getCurrentNote() {
    const activeId = state.currentNoteId;
    const match = state.notes.find((entry) => entry.id === activeId);
    return match || null;
}
Expand Down Expand Up @@ -1059,7 +1065,7 @@ window.tracker = new AgentTracker('bluebook.life', 'hard');
state.notes[swapIndex] = temp;
}

keepNoteAwayFromTop(state.notes, 'note-openclaw-config', 18);
stabilizeDefaultFeedOrder(state.notes);
}

function handleFeedReload() {
Expand Down Expand Up @@ -1467,7 +1473,7 @@ window.tracker = new AgentTracker('bluebook.life', 'hard');

function initialize() {
state.notes = buildNotes();
keepNoteAwayFromTop(state.notes, 'note-openclaw-config', 18);
stabilizeDefaultFeedOrder(state.notes);
state.query = getSearchQueryFromUrl();

cacheDom();
Expand Down
14 changes: 9 additions & 5 deletions eval/dataset/cloudstack.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,14 @@ criteria:
event_type: click
element_id: "das-agent-toggle"
page: "/cloudstack/das.html"
alternative:
event_type: click
element_id: "open-chat-btn"
page: "/cloudstack/das.html"
alternatives:
- event_type: click
element_id: "open-chat-btn"
page: "/cloudstack/das.html"
- event_type: click
element_text: "DAS Agent"
parent_text_contains: "AI"
page: "/cloudstack/das.html"
- type: greet_das_agent
description: "Send a greeting message to DAS agent"
points: 1
Expand All @@ -40,4 +44,4 @@ criteria:
event_type: click
element_id: "send-btn"
page: "/cloudstack/das.html"
optional: true
optional: true
14 changes: 9 additions & 5 deletions eval/dataset/cloudstack_interactive.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,14 @@ criteria:
event_type: click
element_id: "das-agent-toggle"
page: "/cloudstack/das.html"
alternative:
event_type: click
element_id: "open-chat-btn"
page: "/cloudstack/das.html"
alternatives:
- event_type: click
element_id: "open-chat-btn"
page: "/cloudstack/das.html"
- event_type: click
element_text: "DAS Agent"
parent_text_contains: "AI"
page: "/cloudstack/das.html"

# New: Send initial greeting
- type: greet_das_agent
Expand Down Expand Up @@ -114,4 +118,4 @@ criteria:
event_type: count_min
condition: "chat_interactions"
count: 3
page: "/cloudstack/das.html"
page: "/cloudstack/das.html"
25 changes: 14 additions & 11 deletions eval/evaluate_browser_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -610,17 +610,15 @@ def start_openbrowser(self) -> bool:
return True

root_dir = EVAL_DIR.parent
logger.error(
f"""
logger.error(f"""
❌ OpenBrowser server is not running!
Please start the OpenBrowser server manually with:

cd {root_dir}
uv run local-chrome-server serve

The server should start on port 8765 (REST API) and 8766 (WebSocket).
"""
)
""")
return False

except Exception as e:
Expand All @@ -637,8 +635,7 @@ def start_eval_server(self) -> bool:

eval_dir = EVAL_DIR
root_dir = EVAL_DIR.parent
logger.error(
f"""
logger.error(f"""
❌ Eval server is not running!
Please start the eval server manually with:

Expand All @@ -650,8 +647,7 @@ def start_eval_server(self) -> bool:
uv run python eval/server.py

The server should start on port 16605.
"""
)
""")
return False

except Exception as e:
Expand Down Expand Up @@ -1304,6 +1300,7 @@ def _evaluate_criteria(
expected = criterion.get("expected")
points = criterion.get("points", 1)
alternative = criterion.get("alternative")
alternatives = criterion.get("alternatives", [])
optional = criterion.get("optional", False)

# For optional criteria, we give the points automatically (treat as satisfied)
Expand All @@ -1314,9 +1311,15 @@ def _evaluate_criteria(
)
continue

if self._check_criterion(expected, track_events, sse_events) or (
alternative
and self._check_criterion(alternative, track_events, sse_events)
candidate_expectations = [expected]
if alternative:
candidate_expectations.append(alternative)
if alternatives:
candidate_expectations.extend(alternatives)

if any(
candidate and self._check_criterion(candidate, track_events, sse_events)
for candidate in candidate_expectations
):
score += points
logger.debug(
Expand Down
Loading
Loading