softpudding · softpudding · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -11,7 +11,7 @@ repos:
     hooks:
       - id: black
         name: black
-        entry: uv run black
+        entry: uv run --extra dev black
         language: system
         types_or: [python, pyi]
         require_serial: true

diff --git a/AGENTS.md b/AGENTS.md
@@ -204,6 +204,9 @@ Elements are paginated to ensure **no visual overlap** in each screenshot:
 
 - `highlight_elements` now uses a **snapshot-first** readiness check instead of page-side polling loops.
 - Reason: OpenBrowser intentionally keeps automated tabs in the browser background, and Chrome may heavily throttle hidden-tab timers. A page-side `setTimeout` stability loop can therefore take far longer than its nominal budget and become the main cause of highlight timeouts.
+- In practice, the main cause of unstable first-highlight screenshots is often **missing warmup**, not a bad readiness classifier. A background tab may answer lightweight `Runtime.evaluate` probes while still sitting in a partially painted / partially decoded state.
+- A screenshot-style warmup is therefore the default precondition for `highlight_elements`. It helps force hidden-tab paint/compositor/image-decode work before interactive-element detection runs.
+- If `highlight_elements` keeps returning `not_ready` but running `tab view` immediately makes the next highlight succeed, treat that as a warmup issue first, before suspecting the readiness classifier.
 - The extension samples viewport readiness signals once per attempt: document readiness, viewport text/media density, pending images, and loading placeholders such as skeleton/shimmer/spinner indicators.
 - Readiness is graded as `ready`, `provisionally_ready`, or `not_ready`.
 - If readiness is `not_ready`, the extension performs only a couple of short **background-side** retries before proceeding or returning the latest result.
@@ -213,7 +216,7 @@ Elements are paginated to ensure **no visual overlap** in each screenshot:
 ```
 # Highlight mixed elements first (default)
 highlight_elements()                              → Page 1 of any interactive elements
-highlight_elements(page=2)                         → Page 2 of the same any inventory
+highlight_elements(page=2)                         → Page 2 of the "any" results for the current page state
 highlight_elements(element_type="any", page=1)    → Explicit any-first discovery
 
 # Highlight other types (one at a time)
@@ -312,6 +315,8 @@ cd extension && npm run build
 
 OpenBrowser has explicit screenshot control for maximum flexibility:
 
+- Screenshots also serve as a practical page warmup mechanism for background tabs. They can unblock page paint and media decode work that passive DOM/readiness inspection does not reliably trigger on its own.
+
 ### Commands That Return Screenshots
 
 | Command | Auto-Screenshot | Notes |

diff --git a/eval/bluebook/js/bluebook.js b/eval/bluebook/js/bluebook.js
@@ -612,6 +612,12 @@ window.tracker = new AgentTracker('bluebook.life', 'hard');
     return notes;
   }
 
+  function stabilizeDefaultFeedOrder(notes) {
+    keepNoteAwayFromTop(notes, 'note-openclaw-config', 18);
+    keepNoteAwayFromTop(notes, 'note-arigato-ai', 24);
+    return notes;
+  }
+
   function getCurrentNote() {
     return state.notes.find((note) => note.id === state.currentNoteId) || null;
   }
@@ -1059,7 +1065,7 @@ window.tracker = new AgentTracker('bluebook.life', 'hard');
       state.notes[swapIndex] = temp;
     }
 
-    keepNoteAwayFromTop(state.notes, 'note-openclaw-config', 18);
+    stabilizeDefaultFeedOrder(state.notes);
   }
 
   function handleFeedReload() {
@@ -1467,7 +1473,7 @@ window.tracker = new AgentTracker('bluebook.life', 'hard');
 
   function initialize() {
     state.notes = buildNotes();
-    keepNoteAwayFromTop(state.notes, 'note-openclaw-config', 18);
+    stabilizeDefaultFeedOrder(state.notes);
     state.query = getSearchQueryFromUrl();
 
     cacheDom();

diff --git a/eval/dataset/cloudstack.yaml b/eval/dataset/cloudstack.yaml
@@ -21,10 +21,14 @@ criteria:
       event_type: click
       element_id: "das-agent-toggle"
       page: "/cloudstack/das.html"
-    alternative:
-      event_type: click
-      element_id: "open-chat-btn"
-      page: "/cloudstack/das.html"
+    alternatives:
+      - event_type: click
+        element_id: "open-chat-btn"
+        page: "/cloudstack/das.html"
+      - event_type: click
+        element_text: "DAS Agent"
+        parent_text_contains: "AI"
+        page: "/cloudstack/das.html"
   - type: greet_das_agent
     description: "Send a greeting message to DAS agent"
     points: 1
@@ -40,4 +44,4 @@ criteria:
       event_type: click
       element_id: "send-btn"
       page: "/cloudstack/das.html"
-    optional: true
+    optional: true
diff --git a/eval/dataset/cloudstack_interactive.yaml b/eval/dataset/cloudstack_interactive.yaml
@@ -24,10 +24,14 @@ criteria:
       event_type: click
       element_id: "das-agent-toggle"
       page: "/cloudstack/das.html"
-    alternative:
-      event_type: click
-      element_id: "open-chat-btn"
-      page: "/cloudstack/das.html"
+    alternatives:
+      - event_type: click
+        element_id: "open-chat-btn"
+        page: "/cloudstack/das.html"
+      - event_type: click
+        element_text: "DAS Agent"
+        parent_text_contains: "AI"
+        page: "/cloudstack/das.html"
 
   # New: Send initial greeting
   - type: greet_das_agent
@@ -114,4 +118,4 @@ criteria:
       event_type: count_min
       condition: "chat_interactions"
       count: 3
-      page: "/cloudstack/das.html"
+      page: "/cloudstack/das.html"
diff --git a/eval/evaluate_browser_agent.py b/eval/evaluate_browser_agent.py
@@ -610,17 +610,15 @@ def start_openbrowser(self) -> bool:
                 return True
 
             root_dir = EVAL_DIR.parent
-            logger.error(
-                f"""
+            logger.error(f"""
 ❌ OpenBrowser server is not running!
    Please start the OpenBrowser server manually with:
 
    cd {root_dir}
    uv run local-chrome-server serve
 
    The server should start on port 8765 (REST API) and 8766 (WebSocket).
-"""
-            )
+""")
             return False
 
         except Exception as e:
@@ -637,8 +635,7 @@ def start_eval_server(self) -> bool:
 
             eval_dir = EVAL_DIR
             root_dir = EVAL_DIR.parent
-            logger.error(
-                f"""
+            logger.error(f"""
 ❌ Eval server is not running!
    Please start the eval server manually with:
 
@@ -650,8 +647,7 @@ def start_eval_server(self) -> bool:
    uv run python eval/server.py
 
    The server should start on port 16605.
-"""
-            )
+""")
             return False
 
         except Exception as e:
@@ -1304,6 +1300,7 @@ def _evaluate_criteria(
             expected = criterion.get("expected")
             points = criterion.get("points", 1)
             alternative = criterion.get("alternative")
+            alternatives = criterion.get("alternatives", [])
             optional = criterion.get("optional", False)
 
             # For optional criteria, we give the points automatically (treat as satisfied)
@@ -1314,9 +1311,15 @@ def _evaluate_criteria(
                 )
                 continue
 
-            if self._check_criterion(expected, track_events, sse_events) or (
-                alternative
-                and self._check_criterion(alternative, track_events, sse_events)
+            candidate_expectations = [expected]
+            if alternative:
+                candidate_expectations.append(alternative)
+            if alternatives:
+                candidate_expectations.extend(alternatives)
+
+            if any(
+                candidate and self._check_criterion(candidate, track_events, sse_events)
+                for candidate in candidate_expectations
             ):
                 score += points
                 logger.debug(